// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// umutablecptrie.cpp (inspired by utrie2_builder.cpp)
// created: 2017dec29 Markus W. Scherer

// #define UCPTRIE_DEBUG

#include "unicode/utypes.h"
#include "unicode/ucptrie.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uobject.h"
#include "unicode/utf16.h"
#include "ucptrie_impl.h"

// ICU-20235 In case Microsoft math.h has defined this, undefine it.
#undef OVERFLOW
constexpr int32_t MAX_UNICODE = 0x10ffff;

constexpr int32_t UNICODE_LIMIT = 0x110000;
constexpr int32_t BMP_LIMIT = 0x10000;
constexpr int32_t ASCII_LIMIT = 0x80;

constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;

constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));

// Flag values for data blocks.
constexpr uint8_t ALL_SAME = 0;
constexpr uint8_t MIXED = 1;
constexpr uint8_t SAME_AS = 2;

/** Start with allocation of 16k data entries. */
constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);

/** Grow about 8x each time. */
constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);

/**
 * Maximum length of the build-time data array.
 * One entry per 0x110000 code points.
 */
constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;

// Flag values for index-3 blocks while compacting/building.
constexpr uint8_t I3_NULL = 0;
constexpr uint8_t I3_BMP = 1;
constexpr uint8_t I3_16 = 2;
constexpr uint8_t I3_18 = 3;

constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
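
// For orientation, assuming the usual ucptrie_impl.h constants UCPTRIE_SHIFT_3 == 4,
// UCPTRIE_FAST_SHIFT == 6, and UCPTRIE_INDEX_3_BLOCK_LENGTH == 32 (if they differ, the
// numbers below change accordingly):
//   small data block length: 1 << 4 = 16 values; fast data block length: 1 << 6 = 64 values
//   SMALL_DATA_BLOCKS_PER_BMP_BLOCK = 1 << (6 - 4) = 4
//   I_LIMIT = 0x110000 >> 4 = 0x11000, BMP_I_LIMIT = 0x1000, ASCII_I_LIMIT = 8
//   INDEX_3_18BIT_BLOCK_LENGTH = 32 + 32 / 8 = 36 uint16 units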
class AllSameBlocks;
class MixedBlocks;

class MutableCodePointTrie : public UMemory {
public:
    MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
    MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
    MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
    ~MutableCodePointTrie();

    MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;

    static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
    static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);

    uint32_t get(UChar32 c) const;
    int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
                     uint32_t *pValue) const;

    void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
    void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);

    UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);

private:
    void clear();

    bool ensureHighStart(UChar32 c);
    int32_t allocDataBlock(int32_t blockLength);
    int32_t getDataBlock(int32_t i);

    void maskValues(uint32_t mask);
    UChar32 findHighStart() const;
    int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
    int32_t compactData(
            int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
            int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
    int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
    int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);

    uint32_t *index = nullptr;
    int32_t indexCapacity = 0;
    int32_t index3NullOffset = -1;
    uint32_t *data = nullptr;
    int32_t dataCapacity = 0;
    int32_t dataLength = 0;
    int32_t dataNullOffset = -1;

    uint32_t origInitialValue;
    uint32_t initialValue;
    uint32_t errorValue;
    UChar32 highStart;
    uint32_t highValue;

    /** Temporary array while building the final data. */
    uint16_t *index16 = nullptr;
    uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
};
MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
        origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
        highStart(0), highValue(initialValue) {
    if (U_FAILURE(errorCode)) { return; }
    index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
    data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
    if (index == nullptr || data == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    indexCapacity = BMP_I_LIMIT;
    dataCapacity = INITIAL_DATA_LENGTH;
}

MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
        index3NullOffset(other.index3NullOffset),
        dataNullOffset(other.dataNullOffset),
        origInitialValue(other.origInitialValue), initialValue(other.initialValue),
        errorValue(other.errorValue),
        highStart(other.highStart), highValue(other.highValue)
#ifdef UCPTRIE_DEBUG
        , name("mutable clone")
#endif
        {
    if (U_FAILURE(errorCode)) { return; }
    int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
    index = (uint32_t *)uprv_malloc(iCapacity * 4);
    data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
    if (index == nullptr || data == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    indexCapacity = iCapacity;
    dataCapacity = other.dataCapacity;

    int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
    uprv_memcpy(flags, other.flags, iLimit);
    uprv_memcpy(index, other.index, iLimit * 4);
    uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
    dataLength = other.dataLength;
    U_ASSERT(other.index16 == nullptr);
}

MutableCodePointTrie::~MutableCodePointTrie() {
    uprv_free(index);
    uprv_free(data);
    uprv_free(index16);
}
MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
    // Use the highValue as the initialValue to reduce the highStart.
    uint32_t errorValue = ucpmap_get(map, -1);
    uint32_t initialValue = ucpmap_get(map, 0x10ffff);
    LocalPointer<MutableCodePointTrie> mutableTrie(
        new MutableCodePointTrie(initialValue, errorValue, errorCode),
        errorCode);
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    UChar32 start = 0, end;
    uint32_t value;
    while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
                                  nullptr, nullptr, &value)) >= 0) {
        if (value != initialValue) {
            if (start == end) {
                mutableTrie->set(start, value, errorCode);
            } else {
                mutableTrie->setRange(start, end, value, errorCode);
            }
        }
        start = end + 1;
    }
    if (U_SUCCESS(errorCode)) {
        return mutableTrie.orphan();
    }
    return nullptr;
}

MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
    // Use the highValue as the initialValue to reduce the highStart.
    uint32_t errorValue;
    uint32_t initialValue;
    switch (trie->valueWidth) {
    case UCPTRIE_VALUE_BITS_16:
        errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
        initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
        break;
    case UCPTRIE_VALUE_BITS_32:
        errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
        initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
        break;
    case UCPTRIE_VALUE_BITS_8:
        errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
        initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
        break;
    default:
        // Unreachable if the trie is properly initialized.
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    LocalPointer<MutableCodePointTrie> mutableTrie(
        new MutableCodePointTrie(initialValue, errorValue, errorCode),
        errorCode);
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    UChar32 start = 0, end;
    uint32_t value;
    while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
                                   nullptr, nullptr, &value)) >= 0) {
        if (value != initialValue) {
            if (start == end) {
                mutableTrie->set(start, value, errorCode);
            } else {
                mutableTrie->setRange(start, end, value, errorCode);
            }
        }
        start = end + 1;
    }
    if (U_SUCCESS(errorCode)) {
        return mutableTrie.orphan();
    }
    return nullptr;
}

void MutableCodePointTrie::clear() {
    index3NullOffset = dataNullOffset = -1;
    dataLength = 0;
    highValue = initialValue = origInitialValue;
    highStart = 0;
}
uint32_t MutableCodePointTrie::get(UChar32 c) const {
    if ((uint32_t)c > MAX_UNICODE) {
        return errorValue;
    }
    if (c >= highStart) {
        return highValue;
    }
    int32_t i = c >> UCPTRIE_SHIFT_3;
    if (flags[i] == ALL_SAME) {
        return index[i];
    }
    return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
}

inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
                                 UCPMapValueFilter *filter, const void *context) {
    if (value == initialValue) {
        value = nullValue;
    } else if (filter != nullptr) {
        value = filter(context, value);
    }
    return value;
}
UChar32
MutableCodePointTrie::getRange(
        UChar32 start, UCPMapValueFilter *filter, const void *context,
        uint32_t *pValue) const {
    if ((uint32_t)start > MAX_UNICODE) {
        return U_SENTINEL;
    }
    if (start >= highStart) {
        if (pValue != nullptr) {
            uint32_t value = highValue;
            if (filter != nullptr) { value = filter(context, value); }
            *pValue = value;
        }
        return MAX_UNICODE;
    }
    uint32_t nullValue = initialValue;
    if (filter != nullptr) { nullValue = filter(context, nullValue); }
    UChar32 c = start;
    uint32_t trieValue, value;
    bool haveValue = false;
    int32_t i = c >> UCPTRIE_SHIFT_3;
    do {
        if (flags[i] == ALL_SAME) {
            uint32_t trieValue2 = index[i];
            if (haveValue) {
                if (trieValue2 != trieValue) {
                    if (filter == nullptr ||
                            maybeFilterValue(trieValue2, initialValue, nullValue,
                                             filter, context) != value) {
                        return c - 1;
                    }
                    trieValue = trieValue2;  // may or may not help
                }
            } else {
                trieValue = trieValue2;
                value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
                if (pValue != nullptr) { *pValue = value; }
                haveValue = true;
            }
            c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
        } else /* MIXED */ {
            int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
            uint32_t trieValue2 = data[di];
            if (haveValue) {
                if (trieValue2 != trieValue) {
                    if (filter == nullptr ||
                            maybeFilterValue(trieValue2, initialValue, nullValue,
                                             filter, context) != value) {
                        return c - 1;
                    }
                    trieValue = trieValue2;  // may or may not help
                }
            } else {
                trieValue = trieValue2;
                value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context);
                if (pValue != nullptr) { *pValue = value; }
                haveValue = true;
            }
            while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
                trieValue2 = data[++di];
                if (trieValue2 != trieValue) {
                    if (filter == nullptr ||
                            maybeFilterValue(trieValue2, initialValue, nullValue,
                                             filter, context) != value) {
                        return c - 1;
                    }
                    trieValue = trieValue2;  // may or may not help
                }
            }
        }
        ++i;
    } while (c < highStart);
    if (maybeFilterValue(highValue, initialValue, nullValue,
                         filter, context) != value) {
        return c - 1;
    } else {
        return MAX_UNICODE;
    }
}
void writeBlock(uint32_t *block, uint32_t value) {
    uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
    while (block < limit) {
        *block++ = value;
    }
}

bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
    if (c >= highStart) {
        // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
        c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
        int32_t i = highStart >> UCPTRIE_SHIFT_3;
        int32_t iLimit = c >> UCPTRIE_SHIFT_3;
        if (iLimit > indexCapacity) {
            uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
            if (newIndex == nullptr) { return false; }
            uprv_memcpy(newIndex, index, i * 4);
            uprv_free(index);
            index = newIndex;
            indexCapacity = I_LIMIT;
        }
        do {
            flags[i] = ALL_SAME;
            index[i] = initialValue;
        } while(++i < iLimit);
        highStart = c;
    }
    return true;
}
int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
    int32_t newBlock = dataLength;
    int32_t newTop = newBlock + blockLength;
    if (newTop > dataCapacity) {
        int32_t capacity;
        if (dataCapacity < MEDIUM_DATA_LENGTH) {
            capacity = MEDIUM_DATA_LENGTH;
        } else if (dataCapacity < MAX_DATA_LENGTH) {
            capacity = MAX_DATA_LENGTH;
        } else {
            // Should never occur.
            // Either MAX_DATA_LENGTH is incorrect,
            // or the code writes more values than should be possible.
            return -1;
        }
        uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
        if (newData == nullptr) {
            return -1;
        }
        uprv_memcpy(newData, data, (size_t)dataLength * 4);
        uprv_free(data);
        data = newData;
        dataCapacity = capacity;
    }
    dataLength = newTop;
    return newBlock;
}
/**
 * No error checking for illegal arguments.
 *
 * @return -1 if no new data block available (out of memory in data array)
 */
int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
    if (flags[i] == MIXED) {
        return index[i];
    }
    if (i < BMP_I_LIMIT) {
        int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
        if (newBlock < 0) { return newBlock; }
        int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK - 1);
        int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
        do {
            U_ASSERT(flags[iStart] == ALL_SAME);
            writeBlock(data + newBlock, index[iStart]);
            flags[iStart] = MIXED;
            index[iStart++] = newBlock;
            newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
        } while (iStart < iLimit);
        return index[i];
    } else {
        int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
        if (newBlock < 0) { return newBlock; }
        writeBlock(data + newBlock, index[i]);
        flags[i] = MIXED;
        index[i] = newBlock;
        return newBlock;
    }
}
void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if ((uint32_t)c > MAX_UNICODE) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t block;
    if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
}

void fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
    uint32_t *pLimit = block + limit;
    block += start;
    while (block < pLimit) {
        *block++ = value;
    }
}
void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (!ensureHighStart(end)) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    UChar32 limit = end + 1;
    if (start & UCPTRIE_SMALL_DATA_MASK) {
        // Set partial block at [start..following block boundary[.
        int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
        if (block < 0) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
        if (nextStart <= limit) {
            fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
                      value);
            start = nextStart;
        } else {
            fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
                      value);
            return;
        }
    }

    // Number of positions in the last, partial block.
    int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;

    // Round down limit to a block boundary.
    limit &= ~UCPTRIE_SMALL_DATA_MASK;

    // Iterate over all-value blocks.
    while (start < limit) {
        int32_t i = start >> UCPTRIE_SHIFT_3;
        if (flags[i] == ALL_SAME) {
            index[i] = value;
        } else /* MIXED */ {
            fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
        }
        start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
    }

    if (rest > 0) {
        // Set partial block at [last block boundary..limit[.
        int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
        if (block < 0) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        fillBlock(data + block, 0, rest, value);
    }
}
/* compaction --------------------------------------------------------------- */

void MutableCodePointTrie::maskValues(uint32_t mask) {
    initialValue &= mask;
    errorValue &= mask;
    highValue &= mask;
    int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
    for (int32_t i = 0; i < iLimit; ++i) {
        if (flags[i] == ALL_SAME) {
            index[i] &= mask;
        }
    }
    for (int32_t i = 0; i < dataLength; ++i) {
        data[i] &= mask;
    }
}

template<typename UIntA, typename UIntB>
bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
    while (length > 0 && *s == *t) {
        ++s;
        ++t;
        --length;
    }
    return length == 0;
}

bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
    const uint32_t *pLimit = p + length;
    while (p < pLimit && *p == value) { ++p; }
    return p == pLimit;
}

/** Search for an identical block. */
int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
                      const uint16_t *q, int32_t qStart, int32_t blockLength) {
    // Ensure that we do not even partially get past length.
    length -= blockLength;

    q += qStart;
    while (pStart <= length) {
        if (equalBlocks(p + pStart, q, blockLength)) {
            return pStart;
        }
        ++pStart;
    }
    return -1;
}
int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
                         uint32_t value, int32_t blockLength) {
    // Ensure that we do not even partially get past limit.
    limit -= blockLength;

    for (int32_t block = start; block <= limit; ++block) {
        if (p[block] == value) {
            for (int32_t i = 1;; ++i) {
                if (i == blockLength) {
                    return block;
                }
                if (p[block + i] != value) {
                    block += i;
                    break;
                }
            }
        }
    }
    return -1;
}

/**
 * Look for maximum overlap of the beginning of the other block
 * with the previous, adjacent block.
 */
template<typename UIntA, typename UIntB>
int32_t getOverlap(const UIntA *p, int32_t length,
                   const UIntB *q, int32_t qStart, int32_t blockLength) {
    int32_t overlap = blockLength - 1;
    U_ASSERT(overlap <= length);
    q += qStart;
    while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
        --overlap;
    }
    return overlap;
}

int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
                          int32_t blockLength) {
    int32_t min = length - (blockLength - 1);
    int32_t i = length;
    while (min < i && p[i - 1] == value) { --i; }
    return length - i;
}
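
// For illustration of the overlap functions above: if the data written so far ends in
// ... 7 8 3 4 and the next block to write is {3, 4, 5, 6}, its first two values already
// match the existing tail, so getOverlap() returns 2 and only 5 6 need to be appended.
// getAllSameOverlap() handles the special case where the new block repeats a single value.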
bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
    for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
        if (index[i] == dataOffset) {
            return true;
        }
    }
    return false;
}

/**
 * Finds the start of the last range in the trie by enumerating backward.
 * Indexes for code points higher than this will be omitted.
 */
UChar32 MutableCodePointTrie::findHighStart() const {
    int32_t i = highStart >> UCPTRIE_SHIFT_3;
    while (i > 0) {
        bool match;
        if (flags[--i] == ALL_SAME) {
            match = index[i] == highValue;
        } else /* MIXED */ {
            const uint32_t *p = data + index[i];
            for (int32_t j = 0;; ++j) {
                if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
                    match = true;
                    break;
                }
                if (p[j] != highValue) {
                    match = false;
                    break;
                }
            }
        }
        if (!match) {
            return (i + 1) << UCPTRIE_SHIFT_3;
        }
    }
    return 0;
}
class AllSameBlocks {
public:
    static constexpr int32_t NEW_UNIQUE = -1;
    static constexpr int32_t OVERFLOW = -2;

    AllSameBlocks() : length(0), mostRecent(-1) {}

    int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
        if (mostRecent >= 0 && values[mostRecent] == value) {
            refCounts[mostRecent] += count;
            return indexes[mostRecent];
        }
        for (int32_t i = 0; i < length; ++i) {
            if (values[i] == value) {
                mostRecent = i;
                refCounts[i] += count;
                return indexes[i];
            }
        }
        if (length == CAPACITY) {
            return OVERFLOW;
        }
        mostRecent = length;
        indexes[length] = index;
        values[length] = value;
        refCounts[length++] = count;
        return NEW_UNIQUE;
    }

    /** Replaces the block which has the lowest reference count. */
    void add(int32_t index, int32_t count, uint32_t value) {
        U_ASSERT(length == CAPACITY);
        int32_t least = -1;
        int32_t leastCount = I_LIMIT;
        for (int32_t i = 0; i < length; ++i) {
            U_ASSERT(values[i] != value);
            if (refCounts[i] < leastCount) {
                least = i;
                leastCount = refCounts[i];
            }
        }
        U_ASSERT(least >= 0);
        mostRecent = least;
        indexes[least] = index;
        values[least] = value;
        refCounts[least] = count;
    }

    int32_t findMostUsed() const {
        if (length == 0) { return -1; }
        int32_t max = -1;
        int32_t maxCount = 0;
        for (int32_t i = 0; i < length; ++i) {
            if (refCounts[i] > maxCount) {
                max = i;
                maxCount = refCounts[i];
            }
        }
        return indexes[max];
    }

private:
    static constexpr int32_t CAPACITY = 32;

    int32_t length;
    int32_t mostRecent;

    int32_t indexes[CAPACITY];
    uint32_t values[CAPACITY];
    int32_t refCounts[CAPACITY];
};
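
// AllSameBlocks (above) is a small fixed-size cache: it maps an all-same block value to the
// index of the first block seen with that value, together with a rough reference count.
// compactWholeDataBlocks() uses it to deduplicate all-same blocks up front, and
// findMostUsed() later nominates the most common value's block as the data null block.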
// Custom hash table for mixed-value blocks to be found anywhere in the
// compacted data or index so far.
class MixedBlocks {
public:
    bool init(int32_t maxLength, int32_t newBlockLength) {
        // We store actual data indexes + 1 to reserve 0 for empty entries.
        int32_t maxDataIndex = maxLength - newBlockLength + 1;
        int32_t newLength;
        if (maxDataIndex <= 0xfff) {  // 4k
        } else if (maxDataIndex <= 0x7fff) {  // 32k
        } else if (maxDataIndex <= 0x1ffff) {  // 128k
        } else {
            // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
        }
        if (newLength > capacity) {
            uprv_free(table);
            table = (uint32_t *)uprv_malloc(newLength * 4);
            if (table == nullptr) {
                return false;
            }
            capacity = newLength;
        }
        length = newLength;
        uprv_memset(table, 0, length * 4);

        blockLength = newBlockLength;
        return true;
    }

    template<typename UInt>
    void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
        int32_t start = prevDataLength - blockLength;
        if (start >= minStart) {
            ++start;  // Skip the last block that we added last time.
        } else {
            start = minStart;  // Begin with the first full block.
        }
        for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
            uint32_t hashCode = makeHashCode(data, start);
            addEntry(data, start, hashCode, start);
        }
    }

    template<typename UIntA, typename UIntB>
    int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
        uint32_t hashCode = makeHashCode(blockData, blockStart);
        int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
        if (entryIndex >= 0) {
            return (table[entryIndex] & mask) - 1;
        } else {
            return -1;
        }
    }

    int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
        uint32_t hashCode = makeHashCode(blockValue);
        int32_t entryIndex = findEntry(data, blockValue, hashCode);
        if (entryIndex >= 0) {
            return (table[entryIndex] & mask) - 1;
        } else {
            return -1;
        }
    }

private:
    template<typename UInt>
    uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
        int32_t blockLimit = blockStart + blockLength;
        uint32_t hashCode = blockData[blockStart++];
        do {
            hashCode = 37 * hashCode + blockData[blockStart++];
        } while (blockStart < blockLimit);
        return hashCode;
    }

    uint32_t makeHashCode(uint32_t blockValue) const {
        uint32_t hashCode = blockValue;
        for (int32_t i = 1; i < blockLength; ++i) {
            hashCode = 37 * hashCode + blockValue;
        }
        return hashCode;
    }

    template<typename UInt>
    void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
        U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
        int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
        if (entryIndex < 0) {
            table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
        }
    }

    template<typename UIntA, typename UIntB>
    int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
                      uint32_t hashCode) const {
        uint32_t shiftedHashCode = hashCode << shift;
        int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
        for (int32_t entryIndex = initialEntryIndex;;) {
            uint32_t entry = table[entryIndex];
            if (entry == 0) {
                return ~entryIndex;
            }
            if ((entry & ~mask) == shiftedHashCode) {
                int32_t dataIndex = (entry & mask) - 1;
                if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
                    return entryIndex;
                }
            }
            entryIndex = nextIndex(initialEntryIndex, entryIndex);
        }
    }

    int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
        uint32_t shiftedHashCode = hashCode << shift;
        int32_t initialEntryIndex = (hashCode % (length - 1)) + 1;  // 1..length-1
        for (int32_t entryIndex = initialEntryIndex;;) {
            uint32_t entry = table[entryIndex];
            if (entry == 0) {
                return ~entryIndex;
            }
            if ((entry & ~mask) == shiftedHashCode) {
                int32_t dataIndex = (entry & mask) - 1;
                if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
                    return entryIndex;
                }
            }
            entryIndex = nextIndex(initialEntryIndex, entryIndex);
        }
    }

    inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
        // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
        return (entryIndex + initialEntryIndex) % length;
    }

    // The length is a prime number, larger than the maximum data length.
    // The "shift" lower bits store a data index + 1.
    // The remaining upper bits store a partial hashCode of the block data values.
    uint32_t *table = nullptr;
    int32_t capacity = 0;
    int32_t length = 0;
    int32_t shift = 0;
    uint32_t mask = 0;

    int32_t blockLength = 0;
};
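
// Notes on the MixedBlocks table encoding above: each non-empty entry packs
// (partial hashCode << shift) | (dataIndex + 1), so an all-zero word means "empty".
// findEntry() starts probing at (hashCode % (length - 1)) + 1 and keeps adding that same
// step modulo length; with a prime table length the probe sequence visits every slot once
// before repeating, and a negative result (~entryIndex) tells addEntry() where to insert.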
int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
#ifdef UCPTRIE_DEBUG
    bool overflow = false;
#endif

    // ASCII data will be stored as a linear table, even if the following code
    // does not yet count it that way.
    int32_t newDataCapacity = ASCII_LIMIT;
    // Add room for a small data null block in case it would match the start of
    // a fast data block where dataNullOffset must not be set in that case.
    newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
    // Add room for special values (errorValue, highValue) and padding.
    newDataCapacity += 4;
    int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
    int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
    int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
    for (int32_t i = 0; i < iLimit; i += inc) {
        if (i == fastILimit) {
            blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
            inc = 1;
        }
        uint32_t value = index[i];
        if (flags[i] == MIXED) {
            // Really mixed?
            const uint32_t *p = data + value;
            value = *p;
            if (allValuesSameAs(p + 1, blockLength - 1, value)) {
                flags[i] = ALL_SAME;
                index[i] = value;
                // Fall through to ALL_SAME handling.
            } else {
                newDataCapacity += blockLength;
                continue;
            }
        } else {
            U_ASSERT(flags[i] == ALL_SAME);
            if (inc > 1) {
                // Do all of the fast-range data block's ALL_SAME parts have the same value?
                bool allSame = true;
                int32_t next_i = i + inc;
                for (int32_t j = i + 1; j < next_i; ++j) {
                    U_ASSERT(flags[j] == ALL_SAME);
                    if (index[j] != value) {
                        allSame = false;
                        break;
                    }
                }
                if (!allSame) {
                    // Turn it into a MIXED block.
                    if (getDataBlock(i) < 0) {
                        return -1;
                    }
                    newDataCapacity += blockLength;
                    continue;
                }
            }
        }
        // Is there another ALL_SAME block with the same value?
        int32_t other = allSameBlocks.findOrAdd(i, inc, value);
        if (other == AllSameBlocks::OVERFLOW) {
            // The fixed-size array overflowed. Slow check for a duplicate block.
#ifdef UCPTRIE_DEBUG
            if (!overflow) {
                puts("UCPTrie AllSameBlocks overflow");
                overflow = true;
            }
#endif
            int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
            for (int32_t j = 0;; j += jInc) {
                if (j == i) {
                    allSameBlocks.add(i, inc, value);
                    break;
                }
                if (j == fastILimit) {
                    jInc = 1;
                }
                if (flags[j] == ALL_SAME && index[j] == value) {
                    allSameBlocks.add(j, jInc + inc, value);
                    other = j;
                    break;
                    // We could keep counting blocks with the same value
                    // before we add the first one, which may improve compaction in rare cases,
                    // but it would make it slower.
                }
            }
        }
        if (other >= 0) {
            flags[i] = SAME_AS;
            index[i] = other;
        } else {
            // New unique same-value block.
            newDataCapacity += blockLength;
        }
    }
    return newDataCapacity;
}
#ifdef UCPTRIE_DEBUG
#   define DEBUG_DO(expr) expr
#else
#   define DEBUG_DO(expr)
#endif

#ifdef UCPTRIE_DEBUG
// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
int32_t appendValue(char s[], int32_t length, uint32_t value) {
    value ^= value >> 16;
    value ^= value >> 8;
    s[length] = (char)0xE2;
    s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
    s[length + 2] = (char)(0x80 + (value & 0x3F));
    return length + 3;
}

void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
                UChar32 start, int32_t overlap, uint32_t initialValue) {
    char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
    int32_t length = 0;
    int32_t i;
    for (i = 0; i < overlap; ++i) {
        length = appendValue(s, length, 0);  // Braille blank
    }
    for (; i < blockLength; ++i) {
        if (block != nullptr) {
            value = block[i];
        }
        if (value == initialValue) {
            value = 0x40;  // Braille lower left dot
        }
        length = appendValue(s, length, value);
    }
    s[length] = 0;
    if (start <= 0xffff) {
        printf(" %04lX %s|\n", (long)start, s);
    } else if (start <= 0xfffff) {
        printf(" %5lX %s|\n", (long)start, s);
    } else {
        printf(" %6lX %s|\n", (long)start, s);
    }
}
#endif
/**
 * Compacts a build-time trie.
 *
 * The compaction
 * - removes blocks that are identical with earlier ones
 * - overlaps each new non-duplicate block as much as possible with the previously-written one
 * - works with fast-range data blocks whose length is a multiple of that of
 *   higher-code-point data blocks
 *
 * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
 */
int32_t MutableCodePointTrie::compactData(
        int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
        int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
#ifdef UCPTRIE_DEBUG
    int32_t countSame=0, sumOverlaps=0;
    bool printData = dataLength == 29088 /* line.brk */ ||
        // dataLength == 30048 /* CanonIterData */ ||
        dataLength == 50400 /* zh.txt~stroke */;
#endif

    // The linear ASCII data has been copied into newData already.
    int32_t newDataLength = 0;
    for (int32_t i = 0; newDataLength < ASCII_LIMIT;
            newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
        index[i] = newDataLength;
#ifdef UCPTRIE_DEBUG
        if (printData) {
            printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
        }
#endif
    }

    int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
    if (!mixedBlocks.init(newDataCapacity, blockLength)) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    mixedBlocks.extend(newData, 0, 0, newDataLength);

    int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
    int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
    int32_t fastLength = 0;
    for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
        if (i == fastILimit) {
            blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
            inc = 1;
            fastLength = newDataLength;
            if (!mixedBlocks.init(newDataCapacity, blockLength)) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            mixedBlocks.extend(newData, 0, 0, newDataLength);
        }
        if (flags[i] == ALL_SAME) {
            uint32_t value = index[i];
            // Find an earlier part of the data array of length blockLength
            // that is filled with this value.
            int32_t n = mixedBlocks.findAllSameBlock(newData, value);
            // If we find a match, and the current block is the data null block,
            // and it is not a fast block but matches the start of a fast block,
            // then we need to continue looking.
            // This is because this small block is shorter than the fast block,
            // and not all of the rest of the fast block is filled with this value.
            // Otherwise trie.getRange() would detect that the fast block starts at
            // dataNullOffset and assume incorrectly that it is filled with the null value.
            while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
                    isStartOfSomeFastBlock(n, index, fastILimit)) {
                n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
            }
            if (n >= 0) {
                DEBUG_DO(++countSame);
                index[i] = n;
            } else {
                n = getAllSameOverlap(newData, newDataLength, value, blockLength);
                DEBUG_DO(sumOverlaps += n);
#ifdef UCPTRIE_DEBUG
                if (printData) {
                    printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
                }
#endif
                index[i] = newDataLength - n;
                int32_t prevDataLength = newDataLength;
                while (n < blockLength) {
                    newData[newDataLength++] = value;
                    ++n;
                }
                mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
            }
        } else if (flags[i] == MIXED) {
            const uint32_t *block = data + index[i];
            int32_t n = mixedBlocks.findBlock(newData, block, 0);
            if (n >= 0) {
                DEBUG_DO(++countSame);
                index[i] = n;
            } else {
                n = getOverlap(newData, newDataLength, block, 0, blockLength);
                DEBUG_DO(sumOverlaps += n);
#ifdef UCPTRIE_DEBUG
                if (printData) {
                    printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
                }
#endif
                index[i] = newDataLength - n;
                int32_t prevDataLength = newDataLength;
                while (n < blockLength) {
                    newData[newDataLength++] = block[n++];
                }
                mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
            }
        } else /* SAME_AS */ {
            uint32_t j = index[i];
            index[i] = index[j];
        }
    }

#ifdef UCPTRIE_DEBUG
    /* we saved some space */
    printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
           (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
#endif

    return newDataLength;
}

int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
                                           UErrorCode &errorCode) {
    int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
    if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
        // Only the linear fast index, no multi-stage index tables.
        index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
        return fastIndexLength;
    }

    // Condense the fast index table.
    // Also, does it contain an index-3 block with all dataNullOffset?
    uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH];  // fastIndexLength
    int32_t i3FirstNull = -1;
    for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
        uint32_t i3 = index[i];
        fastIndex[j] = (uint16_t)i3;
        if (i3 == (uint32_t)dataNullOffset) {
            if (i3FirstNull < 0) {
                i3FirstNull = j;
            } else if (index3NullOffset < 0 &&
                    (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
                index3NullOffset = i3FirstNull;
            }
        } else {
            i3FirstNull = -1;
        }

        // Set the index entries that compactData() skipped.
        // Needed when the multi-stage index covers the fast index range as well.
        int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
        while (++i < iNext) {
            i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
            index[i] = i3;
        }
    }

    if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);

    // Examine index-3 blocks. For each determine one of:
    // - same as the index-3 null block
    // - same as a fast-index block
    // - 16-bit indexes
    // - 18-bit indexes
    // We store this in the first flags entry for the index-3 block.
    //
    // Also determine an upper limit for the index-3 table length.
    int32_t index3Capacity = 0;
    i3FirstNull = index3NullOffset;
    bool hasLongI3Blocks = false;
    // If the fast index covers the whole BMP, then
    // the multi-stage index is only for supplementary code points.
    // Otherwise, the multi-stage index covers all of Unicode.
    int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
    int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
    for (int32_t i = iStart; i < iLimit;) {
        int32_t j = i;
        int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
        uint32_t oredI3 = 0;
        bool isNull = true;
        do {
            uint32_t i3 = index[j];
            oredI3 |= i3;
            if (i3 != (uint32_t)dataNullOffset) {
                isNull = false;
            }
        } while (++j < jLimit);
        if (isNull) {
            flags[i] = I3_NULL;
            if (i3FirstNull < 0) {
                if (oredI3 <= 0xffff) {
                    index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
                } else {
                    index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
                    hasLongI3Blocks = true;
                }
                i3FirstNull = 0;
            }
        } else {
            if (oredI3 <= 0xffff) {
                int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
                if (n >= 0) {
                    flags[i] = I3_BMP;
                    index[i] = n;
                } else {
                    flags[i] = I3_16;
                    index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
                }
            } else {
                flags[i] = I3_18;
                index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
                hasLongI3Blocks = true;
            }
        }
        i = j;
    }

    int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;

    // Length of the index-1 table, rounded up.
    int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;

    // Index table: Fast index, index-1, index-3, index-2.
    // +1 for possible index table padding.
    int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
    index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
    if (index16 == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    uprv_memcpy(index16, fastIndex, fastIndexLength * 2);

    if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    MixedBlocks longI3Blocks;
    if (hasLongI3Blocks) {
        if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
    }

    // Compact the index-3 table and write an uncompacted version of the index-2 table.
    uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2];  // index2Capacity
    int32_t i2Length = 0;
    i3FirstNull = index3NullOffset;
    int32_t index3Start = fastIndexLength + index1Length;
    int32_t indexLength = index3Start;
    for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
        int32_t i3;
        uint8_t f = flags[i];
        if (f == I3_NULL && i3FirstNull < 0) {
            // First index-3 null block. Write & overlap it like a normal block, then remember it.
            f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
            i3FirstNull = 0;
        }
        if (f == I3_NULL) {
            i3 = index3NullOffset;
        } else if (f == I3_BMP) {
            i3 = index[i];
        } else if (f == I3_16) {
            int32_t n = mixedBlocks.findBlock(index16, index, i);
            if (n >= 0) {
                i3 = n;
            } else {
                if (indexLength == index3Start) {
                    // No overlap at the boundary between the index-1 and index-3 tables.
                    n = 0;
                } else {
                    n = getOverlap(index16, indexLength,
                                   index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
                }
                i3 = indexLength - n;
                int32_t prevIndexLength = indexLength;
                while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
                    index16[indexLength++] = index[i + n++];
                }
                mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
                if (hasLongI3Blocks) {
                    longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
                }
            }
        } else {
            U_ASSERT(f == I3_18);
            U_ASSERT(hasLongI3Blocks);
            // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
            int32_t j = i;
            int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
            int32_t k = indexLength;
            do {
                ++k;
                uint32_t v = index[j++];
                uint32_t upperBits = (v & 0x30000) >> 2;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 4;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 6;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 8;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 10;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 12;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 14;
                index16[k++] = v;
                v = index[j++];
                upperBits |= (v & 0x30000) >> 16;
                index16[k++] = v;
                index16[k - 9] = upperBits;
            } while (j < jLimit);
            int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
            if (n >= 0) {
                i3 = n | 0x8000;
            } else {
                if (indexLength == index3Start) {
                    // No overlap at the boundary between the index-1 and index-3 tables.
                    n = 0;
                } else {
                    n = getOverlap(index16, indexLength,
                                   index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
                }
                i3 = (indexLength - n) | 0x8000;
                int32_t prevIndexLength = indexLength;
                if (n > 0) {
                    int32_t start = indexLength;
                    while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
                        index16[indexLength++] = index16[start + n++];
                    }
                } else {
                    indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
                }
                mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
                if (hasLongI3Blocks) {
                    longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
                }
            }
        }
        if (index3NullOffset < 0 && i3FirstNull >= 0) {
            index3NullOffset = i3;
        }
        // Set the index-2 table entry.
        index2[i2Length++] = i3;
    }
    U_ASSERT(i2Length == index2Capacity);
    U_ASSERT(indexLength <= index3Start + index3Capacity);

    if (index3NullOffset < 0) {
        index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
    }
    if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
        // The index-3 offsets exceed 15 bits, or
        // the last one cannot be distinguished from the no-null-block value.
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    // Compact the index-2 table and write the index-1 table.
    static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
                  "must re-init mixedBlocks");
    int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
    int32_t i1 = fastIndexLength;
    for (int32_t i = 0; i < i2Length; i += blockLength) {
        int32_t n;
        if ((i2Length - i) >= blockLength) {
            U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
            n = mixedBlocks.findBlock(index16, index2, i);
        } else {
            // highStart is inside the last index-2 block. Shorten it.
            blockLength = i2Length - i;
            n = findSameBlock(index16, index3Start, indexLength,
                              index2, i, blockLength);
        }
        int32_t i2;
        if (n >= 0) {
            i2 = n;
        } else {
            if (indexLength == index3Start) {
                // No overlap at the boundary between the index-1 and index-3/2 tables.
                n = 0;
            } else {
                n = getOverlap(index16, indexLength, index2, i, blockLength);
            }
            i2 = indexLength - n;
            int32_t prevIndexLength = indexLength;
            while (n < blockLength) {
                index16[indexLength++] = index2[i + n++];
            }
            mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
        }
        // Set the index-1 table entry.
        index16[i1++] = i2;
    }
    U_ASSERT(i1 == index3Start);
    U_ASSERT(indexLength <= index16Capacity);

#ifdef UCPTRIE_DEBUG
    /* we saved some space */
    printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
           (long)iLimit, (long)indexLength);
#endif

    return indexLength;
}

int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
    // Find the real highStart and round it up.
    U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
    highValue = get(MAX_UNICODE);
    int32_t realHighStart = findHighStart();
    realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
        ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
    if (realHighStart == UNICODE_LIMIT) {
        highValue = initialValue;
    }

#ifdef UCPTRIE_DEBUG
    printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
           (long)realHighStart, (long)highValue, (long)initialValue);
#endif

    // We always store indexes and data values for the fast range.
    // Pin highStart to the top of that range while building.
    UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
    if (realHighStart < fastLimit) {
        for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
            flags[i] = ALL_SAME;
            index[i] = highValue;
        }
        highStart = fastLimit;
    } else {
        highStart = realHighStart;
    }

    uint32_t asciiData[ASCII_LIMIT];
    for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
        asciiData[i] = get(i);
    }

    // First we look for which data blocks have the same value repeated over the whole block,
    // deduplicate such blocks, find a good null data block (for faster enumeration),
    // and get an upper bound for the necessary data array length.
    AllSameBlocks allSameBlocks;
    int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
    if (newDataCapacity < 0) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
    if (newData == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    uprv_memcpy(newData, asciiData, sizeof(asciiData));

    int32_t dataNullIndex = allSameBlocks.findMostUsed();

    MixedBlocks mixedBlocks;
    int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
                                        dataNullIndex, mixedBlocks, errorCode);
    if (U_FAILURE(errorCode)) { return 0; }
    U_ASSERT(newDataLength <= newDataCapacity);
    uprv_free(data);
    data = newData;
    dataCapacity = newDataCapacity;
    dataLength = newDataLength;
    if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
        // The offset of the last data block is too high to be stored in the index table.
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if (dataNullIndex >= 0) {
        dataNullOffset = index[dataNullIndex];
#ifdef UCPTRIE_DEBUG
        if (data[dataNullOffset] != initialValue) {
            printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
                   (long)initialValue, (long)data[dataNullOffset]);
        }
#endif
        initialValue = data[dataNullOffset];
    } else {
        dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
    }

    int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
    highStart = realHighStart;
    return indexLength;
}

UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return nullptr;
    }
    if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
            valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    // The mutable trie always stores 32-bit values.
    // When we build a UCPTrie for a smaller value width, we first mask off unused bits
    // before compacting the data.
    switch (valueWidth) {
    case UCPTRIE_VALUE_BITS_32:
        break;
    case UCPTRIE_VALUE_BITS_16:
        maskValues(0xffff);
        break;
    case UCPTRIE_VALUE_BITS_8:
        maskValues(0xff);
        break;
    default:
        break;
    }

    UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
    int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
    if (U_FAILURE(errorCode)) {
        clear();
        return nullptr;
    }

    // Ensure data table alignment: The index length must be even for uint32_t data.
    if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
        index16[indexLength++] = 0xffee;  // arbitrary value
    }

    // Make the total trie structure length a multiple of 4 bytes by padding the data table,
    // and store special values as the last two data values.
    int32_t length = indexLength * 2;
    if (valueWidth == UCPTRIE_VALUE_BITS_16) {
        if (((indexLength ^ dataLength) & 1) != 0) {
            data[dataLength++] = errorValue;
        }
        if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
            data[dataLength++] = highValue;
            data[dataLength++] = errorValue;
        }
        length += dataLength * 2;
    } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
        // 32-bit data words never need padding to a multiple of 4 bytes.
        if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
            if (data[dataLength - 1] != highValue) {
                data[dataLength++] = highValue;
            }
            data[dataLength++] = errorValue;
        }
        length += dataLength * 4;
    } else {
        int32_t and3 = (length + dataLength) & 3;
        if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
            // all set
        } else if(and3 == 3 && data[dataLength - 1] == highValue) {
            data[dataLength++] = errorValue;
        } else {
            while (and3 != 2) {
                data[dataLength++] = highValue;
                and3 = (and3 + 1) & 3;
            }
            data[dataLength++] = highValue;
            data[dataLength++] = errorValue;
        }
        length += dataLength;
    }

    // Calculate the total length of the UCPTrie as a single memory block.
    length += sizeof(UCPTrie);
    U_ASSERT((length & 3) == 0);

    uint8_t *bytes = (uint8_t *)uprv_malloc(length);
    if (bytes == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        clear();
        return nullptr;
    }
    UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
    uprv_memset(trie, 0, sizeof(UCPTrie));
    trie->indexLength = indexLength;
    trie->dataLength = dataLength;

    trie->highStart = highStart;
    // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
    // Runtime code needs to then test for the real highStart as well.
    trie->shifted12HighStart = (highStart + 0xfff) >> 12;
    trie->type = type;
    trie->valueWidth = valueWidth;

    trie->index3NullOffset = index3NullOffset;
    trie->dataNullOffset = dataNullOffset;
    trie->nullValue = initialValue;

    bytes += sizeof(UCPTrie);

    // Fill the index and data arrays.
    uint16_t *dest16 = (uint16_t *)bytes;
    trie->index = dest16;

    if (highStart <= fastLimit) {
        // Condense only the fast index from the mutable-trie index.
        for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
            *dest16++ = (uint16_t)index[i];  // dest16[j]
        }
    } else {
        uprv_memcpy(dest16, index16, indexLength * 2);
        dest16 += indexLength;
    }
    bytes += indexLength * 2;

    // Write the data array.
    const uint32_t *p = data;
    switch (valueWidth) {
    case UCPTRIE_VALUE_BITS_16:
        // Write 16-bit data values.
        trie->data.ptr16 = dest16;
        for (int32_t i = dataLength; i > 0; --i) {
            *dest16++ = (uint16_t)*p++;
        }
        break;
    case UCPTRIE_VALUE_BITS_32:
        // Write 32-bit data values.
        trie->data.ptr32 = (uint32_t *)bytes;
        uprv_memcpy(bytes, p, (size_t)dataLength * 4);
        break;
    case UCPTRIE_VALUE_BITS_8:
        // Write 8-bit data values.
        trie->data.ptr8 = bytes;
        for (int32_t i = dataLength; i > 0; --i) {
            *bytes++ = (uint8_t)*p++;
        }
        break;
    default:
        // Will not occur, valueWidth checked at the beginning.
        break;
    }

    clear();

#ifdef UCPTRIE_DEBUG
    ucptrie_printLengths(trie, "");
#endif
    return trie;
}
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    LocalPointer<MutableCodePointTrie> trie(
        new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
}

U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    if (other == nullptr) {
        return nullptr;
    }
    LocalPointer<MutableCodePointTrie> clone(
        new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
}

U_CAPI void U_EXPORT2
umutablecptrie_close(UMutableCPTrie *trie) {
    delete reinterpret_cast<MutableCodePointTrie *>(trie);
}

U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    if (map == nullptr) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
}

U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    if (trie == nullptr) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
}

U_CAPI uint32_t U_EXPORT2
umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
    return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
}

UChar32 getRange(const void *trie, UChar32 start,
                 UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
    return reinterpret_cast<const MutableCodePointTrie *>(trie)->
        getRange(start, filter, context, pValue);
}

U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
                        UCPMapRangeOption option, uint32_t surrogateValue,
                        UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
    return ucptrie_internalGetRange(getRange, trie, start,
                                    option, surrogateValue,
                                    filter, context, pValue);
}

U_CAPI void U_EXPORT2
umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return;
    }
    reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
}

U_CAPI void U_EXPORT2
umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
                        uint32_t value, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return;
    }
    reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
}

/* Compact and internally serialize the trie. */
U_CAPI UCPTrie * U_EXPORT2
umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
                              UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
}

#ifdef UCPTRIE_DEBUG
U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
    reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
}
#endif
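
/*
 * Minimal usage sketch (illustrative only, not part of this file): building an immutable
 * UCPTrie through the C API defined above. The value choices are arbitrary examples.
 *
 *     UErrorCode errorCode = U_ZERO_ERROR;
 *     UMutableCPTrie *builder = umutablecptrie_open(0, 0xffff, &errorCode);  // initialValue, errorValue
 *     umutablecptrie_set(builder, 0x41, 1, &errorCode);                      // U+0041 -> 1
 *     umutablecptrie_setRange(builder, 0x4E00, 0x9FFF, 2, &errorCode);       // CJK range -> 2
 *     UCPTrie *trie = umutablecptrie_buildImmutable(
 *         builder, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, &errorCode);
 *     umutablecptrie_close(builder);
 *     if (U_SUCCESS(errorCode)) {
 *         uint32_t v = ucptrie_get(trie, 0x41);  // == 1
 *         (void)v;
 *     }
 *     ucptrie_close(trie);
 */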