1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // ucptrie.cpp (modified from utrie2.cpp)
5 // created: 2017dec29 Markus W. Scherer
7 // #define UCPTRIE_DEBUG
12 #include "unicode/utypes.h"
13 #include "unicode/ucptrie.h"
14 #include "unicode/utf.h"
15 #include "unicode/utf8.h"
16 #include "unicode/utf16.h"
19 #include "ucptrie_impl.h"
21 U_CAPI UCPTrie
* U_EXPORT2
22 ucptrie_openFromBinary(UCPTrieType type
, UCPTrieValueWidth valueWidth
,
23 const void *data
, int32_t length
, int32_t *pActualLength
,
24 UErrorCode
*pErrorCode
) {
25 if (U_FAILURE(*pErrorCode
)) {
29 if (length
<= 0 || (U_POINTER_MASK_LSB(data
, 3) != 0) ||
30 type
< UCPTRIE_TYPE_ANY
|| UCPTRIE_TYPE_SMALL
< type
||
31 valueWidth
< UCPTRIE_VALUE_BITS_ANY
|| UCPTRIE_VALUE_BITS_8
< valueWidth
) {
32 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
36 // Enough data for a trie header?
37 if (length
< (int32_t)sizeof(UCPTrieHeader
)) {
38 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
42 // Check the signature.
43 const UCPTrieHeader
*header
= (const UCPTrieHeader
*)data
;
44 if (header
->signature
!= UCPTRIE_SIG
) {
45 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
49 int32_t options
= header
->options
;
50 int32_t typeInt
= (options
>> 6) & 3;
51 int32_t valueWidthInt
= options
& UCPTRIE_OPTIONS_VALUE_BITS_MASK
;
52 if (typeInt
> UCPTRIE_TYPE_SMALL
|| valueWidthInt
> UCPTRIE_VALUE_BITS_8
||
53 (options
& UCPTRIE_OPTIONS_RESERVED_MASK
) != 0) {
54 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
57 UCPTrieType actualType
= (UCPTrieType
)typeInt
;
58 UCPTrieValueWidth actualValueWidth
= (UCPTrieValueWidth
)valueWidthInt
;
63 valueWidth
= actualValueWidth
;
65 if (type
!= actualType
|| valueWidth
!= actualValueWidth
) {
66 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
70 // Get the length values and offsets.
72 uprv_memset(&tempTrie
, 0, sizeof(tempTrie
));
73 tempTrie
.indexLength
= header
->indexLength
;
75 ((options
& UCPTRIE_OPTIONS_DATA_LENGTH_MASK
) << 4) | header
->dataLength
;
76 tempTrie
.index3NullOffset
= header
->index3NullOffset
;
77 tempTrie
.dataNullOffset
=
78 ((options
& UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK
) << 8) | header
->dataNullOffset
;
80 tempTrie
.highStart
= header
->shiftedHighStart
<< UCPTRIE_SHIFT_2
;
81 tempTrie
.shifted12HighStart
= (tempTrie
.highStart
+ 0xfff) >> 12;
83 tempTrie
.valueWidth
= valueWidth
;
85 // Calculate the actual length.
86 int32_t actualLength
= (int32_t)sizeof(UCPTrieHeader
) + tempTrie
.indexLength
* 2;
87 if (valueWidth
== UCPTRIE_VALUE_BITS_16
) {
88 actualLength
+= tempTrie
.dataLength
* 2;
89 } else if (valueWidth
== UCPTRIE_VALUE_BITS_32
) {
90 actualLength
+= tempTrie
.dataLength
* 4;
92 actualLength
+= tempTrie
.dataLength
;
94 if (length
< actualLength
) {
95 *pErrorCode
= U_INVALID_FORMAT_ERROR
; // Not enough bytes.
100 UCPTrie
*trie
= (UCPTrie
*)uprv_malloc(sizeof(UCPTrie
));
101 if (trie
== nullptr) {
102 *pErrorCode
= U_MEMORY_ALLOCATION_ERROR
;
105 uprv_memcpy(trie
, &tempTrie
, sizeof(tempTrie
));
107 trie
->name
= "fromSerialized";
110 // Set the pointers to its index and data arrays.
111 const uint16_t *p16
= (const uint16_t *)(header
+ 1);
113 p16
+= trie
->indexLength
;
116 int32_t nullValueOffset
= trie
->dataNullOffset
;
117 if (nullValueOffset
>= trie
->dataLength
) {
118 nullValueOffset
= trie
->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
;
120 switch (valueWidth
) {
121 case UCPTRIE_VALUE_BITS_16
:
122 trie
->data
.ptr16
= p16
;
123 trie
->nullValue
= trie
->data
.ptr16
[nullValueOffset
];
125 case UCPTRIE_VALUE_BITS_32
:
126 trie
->data
.ptr32
= (const uint32_t *)p16
;
127 trie
->nullValue
= trie
->data
.ptr32
[nullValueOffset
];
129 case UCPTRIE_VALUE_BITS_8
:
130 trie
->data
.ptr8
= (const uint8_t *)p16
;
131 trie
->nullValue
= trie
->data
.ptr8
[nullValueOffset
];
134 // Unreachable because valueWidth was checked above.
135 *pErrorCode
= U_INVALID_FORMAT_ERROR
;
139 if (pActualLength
!= nullptr) {
140 *pActualLength
= actualLength
;
145 U_CAPI
void U_EXPORT2
146 ucptrie_close(UCPTrie
*trie
) {
150 U_CAPI UCPTrieType U_EXPORT2
151 ucptrie_getType(const UCPTrie
*trie
) {
152 return (UCPTrieType
)trie
->type
;
155 U_CAPI UCPTrieValueWidth U_EXPORT2
156 ucptrie_getValueWidth(const UCPTrie
*trie
) {
157 return (UCPTrieValueWidth
)trie
->valueWidth
;
160 U_CAPI
int32_t U_EXPORT2
161 ucptrie_internalSmallIndex(const UCPTrie
*trie
, UChar32 c
) {
162 int32_t i1
= c
>> UCPTRIE_SHIFT_1
;
163 if (trie
->type
== UCPTRIE_TYPE_FAST
) {
164 U_ASSERT(0xffff < c
&& c
< trie
->highStart
);
165 i1
+= UCPTRIE_BMP_INDEX_LENGTH
- UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH
;
167 U_ASSERT((uint32_t)c
< (uint32_t)trie
->highStart
&& trie
->highStart
> UCPTRIE_SMALL_LIMIT
);
168 i1
+= UCPTRIE_SMALL_INDEX_LENGTH
;
170 int32_t i3Block
= trie
->index
[
171 (int32_t)trie
->index
[i1
] + ((c
>> UCPTRIE_SHIFT_2
) & UCPTRIE_INDEX_2_MASK
)];
172 int32_t i3
= (c
>> UCPTRIE_SHIFT_3
) & UCPTRIE_INDEX_3_MASK
;
174 if ((i3Block
& 0x8000) == 0) {
176 dataBlock
= trie
->index
[i3Block
+ i3
];
178 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
179 i3Block
= (i3Block
& 0x7fff) + (i3
& ~7) + (i3
>> 3);
181 dataBlock
= ((int32_t)trie
->index
[i3Block
++] << (2 + (2 * i3
))) & 0x30000;
182 dataBlock
|= trie
->index
[i3Block
+ i3
];
184 return dataBlock
+ (c
& UCPTRIE_SMALL_DATA_MASK
);
187 U_CAPI
int32_t U_EXPORT2
188 ucptrie_internalSmallU8Index(const UCPTrie
*trie
, int32_t lt1
, uint8_t t2
, uint8_t t3
) {
189 UChar32 c
= (lt1
<< 12) | (t2
<< 6) | t3
;
190 if (c
>= trie
->highStart
) {
191 // Possible because the UTF-8 macro compares with shifted12HighStart which may be higher.
192 return trie
->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
;
194 return ucptrie_internalSmallIndex(trie
, c
);
197 U_CAPI
int32_t U_EXPORT2
198 ucptrie_internalU8PrevIndex(const UCPTrie
*trie
, UChar32 c
,
199 const uint8_t *start
, const uint8_t *src
) {
201 // Support 64-bit pointers by avoiding cast of arbitrary difference.
202 if ((src
- start
) <= 7) {
203 i
= length
= (int32_t)(src
- start
);
208 c
= utf8_prevCharSafeBody(start
, 0, &i
, c
, -1);
209 i
= length
- i
; // Number of bytes read backward from src.
210 int32_t idx
= _UCPTRIE_CP_INDEX(trie
, 0xffff, c
);
211 return (idx
<< 3) | i
;
216 inline uint32_t getValue(UCPTrieData data
, UCPTrieValueWidth valueWidth
, int32_t dataIndex
) {
217 switch (valueWidth
) {
218 case UCPTRIE_VALUE_BITS_16
:
219 return data
.ptr16
[dataIndex
];
220 case UCPTRIE_VALUE_BITS_32
:
221 return data
.ptr32
[dataIndex
];
222 case UCPTRIE_VALUE_BITS_8
:
223 return data
.ptr8
[dataIndex
];
225 // Unreachable if the trie is properly initialized.
232 U_CAPI
uint32_t U_EXPORT2
233 ucptrie_get(const UCPTrie
*trie
, UChar32 c
) {
235 if ((uint32_t)c
<= 0x7f) {
239 UChar32 fastMax
= trie
->type
== UCPTRIE_TYPE_FAST
? 0xffff : UCPTRIE_SMALL_MAX
;
240 dataIndex
= _UCPTRIE_CP_INDEX(trie
, fastMax
, c
);
242 return getValue(trie
->data
, (UCPTrieValueWidth
)trie
->valueWidth
, dataIndex
);
247 constexpr int32_t MAX_UNICODE
= 0x10ffff;
249 inline uint32_t maybeFilterValue(uint32_t value
, uint32_t trieNullValue
, uint32_t nullValue
,
250 UCPMapValueFilter
*filter
, const void *context
) {
251 if (value
== trieNullValue
) {
253 } else if (filter
!= nullptr) {
254 value
= filter(context
, value
);
259 UChar32
getRange(const void *t
, UChar32 start
,
260 UCPMapValueFilter
*filter
, const void *context
, uint32_t *pValue
) {
261 if ((uint32_t)start
> MAX_UNICODE
) {
264 const UCPTrie
*trie
= reinterpret_cast<const UCPTrie
*>(t
);
265 UCPTrieValueWidth valueWidth
= (UCPTrieValueWidth
)trie
->valueWidth
;
266 if (start
>= trie
->highStart
) {
267 if (pValue
!= nullptr) {
268 int32_t di
= trie
->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
;
269 uint32_t value
= getValue(trie
->data
, valueWidth
, di
);
270 if (filter
!= nullptr) { value
= filter(context
, value
); }
276 uint32_t nullValue
= trie
->nullValue
;
277 if (filter
!= nullptr) { nullValue
= filter(context
, nullValue
); }
278 const uint16_t *index
= trie
->index
;
280 int32_t prevI3Block
= -1;
281 int32_t prevBlock
= -1;
283 uint32_t trieValue
, value
;
284 bool haveValue
= false;
288 int32_t i3BlockLength
;
289 int32_t dataBlockLength
;
290 if (c
<= 0xffff && (trie
->type
== UCPTRIE_TYPE_FAST
|| c
<= UCPTRIE_SMALL_MAX
)) {
292 i3
= c
>> UCPTRIE_FAST_SHIFT
;
293 i3BlockLength
= trie
->type
== UCPTRIE_TYPE_FAST
?
294 UCPTRIE_BMP_INDEX_LENGTH
: UCPTRIE_SMALL_INDEX_LENGTH
;
295 dataBlockLength
= UCPTRIE_FAST_DATA_BLOCK_LENGTH
;
297 // Use the multi-stage index.
298 int32_t i1
= c
>> UCPTRIE_SHIFT_1
;
299 if (trie
->type
== UCPTRIE_TYPE_FAST
) {
300 U_ASSERT(0xffff < c
&& c
< trie
->highStart
);
301 i1
+= UCPTRIE_BMP_INDEX_LENGTH
- UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH
;
303 U_ASSERT(c
< trie
->highStart
&& trie
->highStart
> UCPTRIE_SMALL_LIMIT
);
304 i1
+= UCPTRIE_SMALL_INDEX_LENGTH
;
306 i3Block
= trie
->index
[
307 (int32_t)trie
->index
[i1
] + ((c
>> UCPTRIE_SHIFT_2
) & UCPTRIE_INDEX_2_MASK
)];
308 if (i3Block
== prevI3Block
&& (c
- start
) >= UCPTRIE_CP_PER_INDEX_2_ENTRY
) {
309 // The index-3 block is the same as the previous one, and filled with value.
310 U_ASSERT((c
& (UCPTRIE_CP_PER_INDEX_2_ENTRY
- 1)) == 0);
311 c
+= UCPTRIE_CP_PER_INDEX_2_ENTRY
;
314 prevI3Block
= i3Block
;
315 if (i3Block
== trie
->index3NullOffset
) {
316 // This is the index-3 null block.
318 if (nullValue
!= value
) {
322 trieValue
= trie
->nullValue
;
324 if (pValue
!= nullptr) { *pValue
= nullValue
; }
327 prevBlock
= trie
->dataNullOffset
;
328 c
= (c
+ UCPTRIE_CP_PER_INDEX_2_ENTRY
) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY
- 1);
331 i3
= (c
>> UCPTRIE_SHIFT_3
) & UCPTRIE_INDEX_3_MASK
;
332 i3BlockLength
= UCPTRIE_INDEX_3_BLOCK_LENGTH
;
333 dataBlockLength
= UCPTRIE_SMALL_DATA_BLOCK_LENGTH
;
335 // Enumerate data blocks for one index-3 block.
338 if ((i3Block
& 0x8000) == 0) {
339 block
= index
[i3Block
+ i3
];
341 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
342 int32_t group
= (i3Block
& 0x7fff) + (i3
& ~7) + (i3
>> 3);
344 block
= ((int32_t)index
[group
++] << (2 + (2 * gi
))) & 0x30000;
345 block
|= index
[group
+ gi
];
347 if (block
== prevBlock
&& (c
- start
) >= dataBlockLength
) {
348 // The block is the same as the previous one, and filled with value.
349 U_ASSERT((c
& (dataBlockLength
- 1)) == 0);
350 c
+= dataBlockLength
;
352 int32_t dataMask
= dataBlockLength
- 1;
354 if (block
== trie
->dataNullOffset
) {
355 // This is the data null block.
357 if (nullValue
!= value
) {
361 trieValue
= trie
->nullValue
;
363 if (pValue
!= nullptr) { *pValue
= nullValue
; }
366 c
= (c
+ dataBlockLength
) & ~dataMask
;
368 int32_t di
= block
+ (c
& dataMask
);
369 uint32_t trieValue2
= getValue(trie
->data
, valueWidth
, di
);
371 if (trieValue2
!= trieValue
) {
372 if (filter
== nullptr ||
373 maybeFilterValue(trieValue2
, trie
->nullValue
, nullValue
,
374 filter
, context
) != value
) {
377 trieValue
= trieValue2
; // may or may not help
380 trieValue
= trieValue2
;
381 value
= maybeFilterValue(trieValue2
, trie
->nullValue
, nullValue
,
383 if (pValue
!= nullptr) { *pValue
= value
; }
386 while ((++c
& dataMask
) != 0) {
387 trieValue2
= getValue(trie
->data
, valueWidth
, ++di
);
388 if (trieValue2
!= trieValue
) {
389 if (filter
== nullptr ||
390 maybeFilterValue(trieValue2
, trie
->nullValue
, nullValue
,
391 filter
, context
) != value
) {
394 trieValue
= trieValue2
; // may or may not help
399 } while (++i3
< i3BlockLength
);
400 } while (c
< trie
->highStart
);
402 int32_t di
= trie
->dataLength
- UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET
;
403 uint32_t highValue
= getValue(trie
->data
, valueWidth
, di
);
404 if (maybeFilterValue(highValue
, trie
->nullValue
, nullValue
,
405 filter
, context
) != value
) {
415 ucptrie_internalGetRange(UCPTrieGetRange
*getRange
,
416 const void *trie
, UChar32 start
,
417 UCPMapRangeOption option
, uint32_t surrogateValue
,
418 UCPMapValueFilter
*filter
, const void *context
, uint32_t *pValue
) {
419 if (option
== UCPMAP_RANGE_NORMAL
) {
420 return getRange(trie
, start
, filter
, context
, pValue
);
423 if (pValue
== nullptr) {
424 // We need to examine the range value even if the caller does not want it.
427 UChar32 surrEnd
= option
== UCPMAP_RANGE_FIXED_ALL_SURROGATES
? 0xdfff : 0xdbff;
428 UChar32 end
= getRange(trie
, start
, filter
, context
, pValue
);
429 if (end
< 0xd7ff || start
> surrEnd
) {
432 // The range overlaps with surrogates, or ends just before the first one.
433 if (*pValue
== surrogateValue
) {
434 if (end
>= surrEnd
) {
435 // Surrogates followed by a non-surrogateValue range,
436 // or surrogates are part of a larger surrogateValue range.
440 if (start
<= 0xd7ff) {
441 return 0xd7ff; // Non-surrogateValue range ends before surrogateValue surrogates.
443 // Start is a surrogate with a non-surrogateValue code *unit* value.
444 // Return a surrogateValue code *point* range.
445 *pValue
= surrogateValue
;
447 return surrEnd
; // Surrogate range ends before non-surrogateValue rest of range.
450 // See if the surrogateValue surrogate range can be merged with
451 // an immediately following range.
453 UChar32 end2
= getRange(trie
, surrEnd
+ 1, filter
, context
, &value2
);
454 if (value2
== surrogateValue
) {
460 U_CAPI UChar32 U_EXPORT2
461 ucptrie_getRange(const UCPTrie
*trie
, UChar32 start
,
462 UCPMapRangeOption option
, uint32_t surrogateValue
,
463 UCPMapValueFilter
*filter
, const void *context
, uint32_t *pValue
) {
464 return ucptrie_internalGetRange(getRange
, trie
, start
,
465 option
, surrogateValue
,
466 filter
, context
, pValue
);
469 U_CAPI
int32_t U_EXPORT2
470 ucptrie_toBinary(const UCPTrie
*trie
,
471 void *data
, int32_t capacity
,
472 UErrorCode
*pErrorCode
) {
473 if (U_FAILURE(*pErrorCode
)) {
477 UCPTrieType type
= (UCPTrieType
)trie
->type
;
478 UCPTrieValueWidth valueWidth
= (UCPTrieValueWidth
)trie
->valueWidth
;
479 if (type
< UCPTRIE_TYPE_FAST
|| UCPTRIE_TYPE_SMALL
< type
||
480 valueWidth
< UCPTRIE_VALUE_BITS_16
|| UCPTRIE_VALUE_BITS_8
< valueWidth
||
482 (capacity
> 0 && (data
== nullptr || (U_POINTER_MASK_LSB(data
, 3) != 0)))) {
483 *pErrorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
487 int32_t length
= (int32_t)sizeof(UCPTrieHeader
) + trie
->indexLength
* 2;
488 switch (valueWidth
) {
489 case UCPTRIE_VALUE_BITS_16
:
490 length
+= trie
->dataLength
* 2;
492 case UCPTRIE_VALUE_BITS_32
:
493 length
+= trie
->dataLength
* 4;
495 case UCPTRIE_VALUE_BITS_8
:
496 length
+= trie
->dataLength
;
502 if (capacity
< length
) {
503 *pErrorCode
= U_BUFFER_OVERFLOW_ERROR
;
507 char *bytes
= (char *)data
;
508 UCPTrieHeader
*header
= (UCPTrieHeader
*)bytes
;
509 header
->signature
= UCPTRIE_SIG
; // "Tri3"
510 header
->options
= (uint16_t)(
511 ((trie
->dataLength
& 0xf0000) >> 4) |
512 ((trie
->dataNullOffset
& 0xf0000) >> 8) |
515 header
->indexLength
= (uint16_t)trie
->indexLength
;
516 header
->dataLength
= (uint16_t)trie
->dataLength
;
517 header
->index3NullOffset
= trie
->index3NullOffset
;
518 header
->dataNullOffset
= (uint16_t)trie
->dataNullOffset
;
519 header
->shiftedHighStart
= trie
->highStart
>> UCPTRIE_SHIFT_2
;
520 bytes
+= sizeof(UCPTrieHeader
);
522 uprv_memcpy(bytes
, trie
->index
, trie
->indexLength
* 2);
523 bytes
+= trie
->indexLength
* 2;
525 switch (valueWidth
) {
526 case UCPTRIE_VALUE_BITS_16
:
527 uprv_memcpy(bytes
, trie
->data
.ptr16
, trie
->dataLength
* 2);
529 case UCPTRIE_VALUE_BITS_32
:
530 uprv_memcpy(bytes
, trie
->data
.ptr32
, trie
->dataLength
* 4);
532 case UCPTRIE_VALUE_BITS_8
:
533 uprv_memcpy(bytes
, trie
->data
.ptr8
, trie
->dataLength
);
545 long countNull(const UCPTrie
*trie
) {
546 uint32_t nullValue
=trie
->nullValue
;
547 int32_t length
=trie
->dataLength
;
549 switch (trie
->valueWidth
) {
550 case UCPTRIE_VALUE_BITS_16
:
551 for(int32_t i
=0; i
<length
; ++i
) {
552 if(trie
->data
.ptr16
[i
]==nullValue
) { ++count
; }
555 case UCPTRIE_VALUE_BITS_32
:
556 for(int32_t i
=0; i
<length
; ++i
) {
557 if(trie
->data
.ptr32
[i
]==nullValue
) { ++count
; }
560 case UCPTRIE_VALUE_BITS_8
:
561 for(int32_t i
=0; i
<length
; ++i
) {
562 if(trie
->data
.ptr8
[i
]==nullValue
) { ++count
; }
573 ucptrie_printLengths(const UCPTrie
*trie
, const char *which
) {
574 long indexLength
=trie
->indexLength
;
575 long dataLength
=(long)trie
->dataLength
;
576 long totalLength
=(long)sizeof(UCPTrieHeader
)+indexLength
*2+
577 dataLength
*(trie
->valueWidth
==UCPTRIE_VALUE_BITS_16
? 2 :
578 trie
->valueWidth
==UCPTRIE_VALUE_BITS_32
? 4 : 1);
579 printf("**UCPTrieLengths(%s %s)** index:%6ld data:%6ld countNull:%6ld serialized:%6ld\n",
580 which
, trie
->name
, indexLength
, dataLength
, countNull(trie
), totalLength
);
587 // Initially, this is the same as UCPTrie. This may well change.
589 U_CAPI
uint32_t U_EXPORT2
590 ucpmap_get(const UCPMap
*map
, UChar32 c
) {
591 return ucptrie_get(reinterpret_cast<const UCPTrie
*>(map
), c
);
594 U_CAPI UChar32 U_EXPORT2
595 ucpmap_getRange(const UCPMap
*map
, UChar32 start
,
596 UCPMapRangeOption option
, uint32_t surrogateValue
,
597 UCPMapValueFilter
*filter
, const void *context
, uint32_t *pValue
) {
598 return ucptrie_getRange(reinterpret_cast<const UCPTrie
*>(map
), start
,
599 option
, surrogateValue
,
600 filter
, context
, pValue
);