1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2001-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: utrie2.cpp
12 * tab size: 8 (not used)
15 * created on: 2008aug16 (starting from a copy of utrie.c)
16 * created by: Markus W. Scherer
18 * This is a common implementation of a Unicode trie.
19 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20 * Unicode code points (0..0x10ffff).
21 * This is the second common version of a Unicode trie (hence the name UTrie2).
22 * See utrie2.h for a comparison.
24 * This file contains only the runtime and enumeration code, for read-only access.
25 * See utrie2_builder.c for the builder code.
27 #include "unicode/utypes.h"
29 #include "unicode/umutablecptrie.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
36 #include "utrie2_impl.h"
39 /* Public UTrie2 API implementation ----------------------------------------- */
42 get32(const UNewTrie2
*trie
, UChar32 c
, UBool fromLSCP
) {
45 if(c
>=trie
->highStart
&& (!U_IS_LEAD(c
) || fromLSCP
)) {
46 return trie
->data
[trie
->dataLength
-UTRIE2_DATA_GRANULARITY
];
49 if(U_IS_LEAD(c
) && fromLSCP
) {
50 i2
=(UTRIE2_LSCP_INDEX_2_OFFSET
-(0xd800>>UTRIE2_SHIFT_2
))+
53 i2
=trie
->index1
[c
>>UTRIE2_SHIFT_1
]+
54 ((c
>>UTRIE2_SHIFT_2
)&UTRIE2_INDEX_2_MASK
);
56 block
=trie
->index2
[i2
];
57 return trie
->data
[block
+(c
&UTRIE2_DATA_MASK
)];
60 U_CAPI
uint32_t U_EXPORT2
61 utrie2_get32(const UTrie2
*trie
, UChar32 c
) {
62 if(trie
->data16
!=NULL
) {
63 return UTRIE2_GET16(trie
, c
);
64 } else if(trie
->data32
!=NULL
) {
65 return UTRIE2_GET32(trie
, c
);
66 } else if((uint32_t)c
>0x10ffff) {
67 return trie
->errorValue
;
69 return get32(trie
->newTrie
, c
, TRUE
);
73 U_CAPI
uint32_t U_EXPORT2
74 utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2
*trie
, UChar32 c
) {
76 return trie
->errorValue
;
78 if(trie
->data16
!=NULL
) {
79 return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie
, c
);
80 } else if(trie
->data32
!=NULL
) {
81 return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie
, c
);
83 return get32(trie
->newTrie
, c
, FALSE
);
88 u8Index(const UTrie2
*trie
, UChar32 c
, int32_t i
) {
90 _UTRIE2_INDEX_FROM_CP(
92 trie
->data32
==NULL
? trie
->indexLength
: 0,
97 U_CAPI
int32_t U_EXPORT2
98 utrie2_internalU8NextIndex(const UTrie2
*trie
, UChar32 c
,
99 const uint8_t *src
, const uint8_t *limit
) {
102 /* support 64-bit pointers by avoiding cast of arbitrary difference */
104 length
=(int32_t)(limit
-src
);
108 c
=utf8_nextCharSafeBody(src
, &i
, length
, c
, -1);
109 return u8Index(trie
, c
, i
);
112 U_CAPI
int32_t U_EXPORT2
113 utrie2_internalU8PrevIndex(const UTrie2
*trie
, UChar32 c
,
114 const uint8_t *start
, const uint8_t *src
) {
116 /* support 64-bit pointers by avoiding cast of arbitrary difference */
118 i
=length
=(int32_t)(src
-start
);
123 c
=utf8_prevCharSafeBody(start
, 0, &i
, c
, -1);
124 i
=length
-i
; /* number of bytes read backward from src */
125 return u8Index(trie
, c
, i
);
128 U_CAPI UTrie2
* U_EXPORT2
129 utrie2_openFromSerialized(UTrie2ValueBits valueBits
,
130 const void *data
, int32_t length
, int32_t *pActualLength
,
131 UErrorCode
*pErrorCode
) {
132 const UTrie2Header
*header
;
134 int32_t actualLength
;
139 if(U_FAILURE(*pErrorCode
)) {
143 if( length
<=0 || (U_POINTER_MASK_LSB(data
, 3)!=0) ||
144 valueBits
<0 || UTRIE2_COUNT_VALUE_BITS
<=valueBits
146 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
150 /* enough data for a trie header? */
151 if(length
<(int32_t)sizeof(UTrie2Header
)) {
152 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
156 /* check the signature */
157 header
=(const UTrie2Header
*)data
;
158 if(header
->signature
!=UTRIE2_SIG
) {
159 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
163 /* get the options */
164 if(valueBits
!=(UTrie2ValueBits
)(header
->options
&UTRIE2_OPTIONS_VALUE_BITS_MASK
)) {
165 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
169 /* get the length values and offsets */
170 uprv_memset(&tempTrie
, 0, sizeof(tempTrie
));
171 tempTrie
.indexLength
=header
->indexLength
;
172 tempTrie
.dataLength
=header
->shiftedDataLength
<<UTRIE2_INDEX_SHIFT
;
173 tempTrie
.index2NullOffset
=header
->index2NullOffset
;
174 tempTrie
.dataNullOffset
=header
->dataNullOffset
;
176 tempTrie
.highStart
=header
->shiftedHighStart
<<UTRIE2_SHIFT_1
;
177 tempTrie
.highValueIndex
=tempTrie
.dataLength
-UTRIE2_DATA_GRANULARITY
;
178 if(valueBits
==UTRIE2_16_VALUE_BITS
) {
179 tempTrie
.highValueIndex
+=tempTrie
.indexLength
;
182 /* calculate the actual length */
183 actualLength
=(int32_t)sizeof(UTrie2Header
)+tempTrie
.indexLength
*2;
184 if(valueBits
==UTRIE2_16_VALUE_BITS
) {
185 actualLength
+=tempTrie
.dataLength
*2;
187 actualLength
+=tempTrie
.dataLength
*4;
189 if(length
<actualLength
) {
190 *pErrorCode
=U_INVALID_FORMAT_ERROR
; /* not enough bytes */
194 /* allocate the trie */
195 trie
=(UTrie2
*)uprv_malloc(sizeof(UTrie2
));
197 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
200 uprv_memcpy(trie
, &tempTrie
, sizeof(tempTrie
));
201 trie
->memory
=(uint32_t *)data
;
202 trie
->length
=actualLength
;
203 trie
->isMemoryOwned
=FALSE
;
205 trie
->name
="fromSerialized";
208 /* set the pointers to its index and data arrays */
209 p16
=(const uint16_t *)(header
+1);
211 p16
+=trie
->indexLength
;
215 case UTRIE2_16_VALUE_BITS
:
218 trie
->initialValue
=trie
->index
[trie
->dataNullOffset
];
219 trie
->errorValue
=trie
->data16
[UTRIE2_BAD_UTF8_DATA_OFFSET
];
221 case UTRIE2_32_VALUE_BITS
:
223 trie
->data32
=(const uint32_t *)p16
;
224 trie
->initialValue
=trie
->data32
[trie
->dataNullOffset
];
225 trie
->errorValue
=trie
->data32
[UTRIE2_BAD_UTF8_DATA_OFFSET
];
228 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
232 if(pActualLength
!=NULL
) {
233 *pActualLength
=actualLength
;
238 U_CAPI UTrie2
* U_EXPORT2
239 utrie2_openDummy(UTrie2ValueBits valueBits
,
240 uint32_t initialValue
, uint32_t errorValue
,
241 UErrorCode
*pErrorCode
) {
243 UTrie2Header
*header
;
246 int32_t indexLength
, dataLength
, length
, i
;
247 int32_t dataMove
; /* >0 if the data is moved to the end of the index array */
249 if(U_FAILURE(*pErrorCode
)) {
253 if(valueBits
<0 || UTRIE2_COUNT_VALUE_BITS
<=valueBits
) {
254 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
258 /* calculate the total length of the dummy trie data */
259 indexLength
=UTRIE2_INDEX_1_OFFSET
;
260 dataLength
=UTRIE2_DATA_START_OFFSET
+UTRIE2_DATA_GRANULARITY
;
261 length
=(int32_t)sizeof(UTrie2Header
)+indexLength
*2;
262 if(valueBits
==UTRIE2_16_VALUE_BITS
) {
263 length
+=dataLength
*2;
265 length
+=dataLength
*4;
268 /* allocate the trie */
269 trie
=(UTrie2
*)uprv_malloc(sizeof(UTrie2
));
271 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
274 uprv_memset(trie
, 0, sizeof(UTrie2
));
275 trie
->memory
=uprv_malloc(length
);
276 if(trie
->memory
==NULL
) {
278 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
282 trie
->isMemoryOwned
=TRUE
;
284 /* set the UTrie2 fields */
285 if(valueBits
==UTRIE2_16_VALUE_BITS
) {
286 dataMove
=indexLength
;
291 trie
->indexLength
=indexLength
;
292 trie
->dataLength
=dataLength
;
293 trie
->index2NullOffset
=UTRIE2_INDEX_2_OFFSET
;
294 trie
->dataNullOffset
=(uint16_t)dataMove
;
295 trie
->initialValue
=initialValue
;
296 trie
->errorValue
=errorValue
;
298 trie
->highValueIndex
=dataMove
+UTRIE2_DATA_START_OFFSET
;
303 /* set the header fields */
304 header
=(UTrie2Header
*)trie
->memory
;
306 header
->signature
=UTRIE2_SIG
; /* "Tri2" */
307 header
->options
=(uint16_t)valueBits
;
309 header
->indexLength
=(uint16_t)indexLength
;
310 header
->shiftedDataLength
=(uint16_t)(dataLength
>>UTRIE2_INDEX_SHIFT
);
311 header
->index2NullOffset
=(uint16_t)UTRIE2_INDEX_2_OFFSET
;
312 header
->dataNullOffset
=(uint16_t)dataMove
;
313 header
->shiftedHighStart
=0;
315 /* fill the index and data arrays */
316 dest16
=(uint16_t *)(header
+1);
319 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT */
320 for(i
=0; i
<UTRIE2_INDEX_2_BMP_LENGTH
; ++i
) {
321 *dest16
++=(uint16_t)(dataMove
>>UTRIE2_INDEX_SHIFT
); /* null data block */
324 /* write UTF-8 2-byte index-2 values, not right-shifted */
325 for(i
=0; i
<(0xc2-0xc0); ++i
) { /* C0..C1 */
326 *dest16
++=(uint16_t)(dataMove
+UTRIE2_BAD_UTF8_DATA_OFFSET
);
328 for(; i
<(0xe0-0xc0); ++i
) { /* C2..DF */
329 *dest16
++=(uint16_t)dataMove
;
332 /* write the 16/32-bit data array */
334 case UTRIE2_16_VALUE_BITS
:
335 /* write 16-bit data values */
338 for(i
=0; i
<0x80; ++i
) {
339 *dest16
++=(uint16_t)initialValue
;
342 *dest16
++=(uint16_t)errorValue
;
344 /* highValue and reserved values */
345 for(i
=0; i
<UTRIE2_DATA_GRANULARITY
; ++i
) {
346 *dest16
++=(uint16_t)initialValue
;
349 case UTRIE2_32_VALUE_BITS
:
350 /* write 32-bit data values */
351 p
=(uint32_t *)dest16
;
354 for(i
=0; i
<0x80; ++i
) {
360 /* highValue and reserved values */
361 for(i
=0; i
<UTRIE2_DATA_GRANULARITY
; ++i
) {
366 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
373 U_CAPI
void U_EXPORT2
374 utrie2_close(UTrie2
*trie
) {
376 if(trie
->isMemoryOwned
) {
377 uprv_free(trie
->memory
);
379 if(trie
->newTrie
!=NULL
) {
380 uprv_free(trie
->newTrie
->data
);
382 umutablecptrie_close(trie
->newTrie
->t3
);
384 uprv_free(trie
->newTrie
);
390 U_CAPI UBool U_EXPORT2
391 utrie2_isFrozen(const UTrie2
*trie
) {
392 return (UBool
)(trie
->newTrie
==NULL
);
395 U_CAPI
int32_t U_EXPORT2
396 utrie2_serialize(const UTrie2
*trie
,
397 void *data
, int32_t capacity
,
398 UErrorCode
*pErrorCode
) {
400 if(U_FAILURE(*pErrorCode
)) {
404 if( trie
==NULL
|| trie
->memory
==NULL
|| trie
->newTrie
!=NULL
||
405 capacity
<0 || (capacity
>0 && (data
==NULL
|| (U_POINTER_MASK_LSB(data
, 3)!=0)))
407 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
411 if(capacity
>=trie
->length
) {
412 uprv_memcpy(data
, trie
->memory
, trie
->length
);
414 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
419 /* enumeration -------------------------------------------------------------- */
421 #define MIN_VALUE(a, b) ((a)<(b) ? (a) : (b))
423 /* default UTrie2EnumValue() returns the input value itself */
424 static uint32_t U_CALLCONV
425 enumSameValue(const void * /*context*/, uint32_t value
) {
430 * Enumerate all ranges of code points with the same relevant values.
431 * The values are transformed from the raw trie entries by the enumValue function.
433 * Currently requires start<limit and both start and limit must be multiples
434 * of UTRIE2_DATA_BLOCK_LENGTH.
437 * - Skip a whole block if we know that it is filled with a single value,
438 * and it is the same as we visited just before.
439 * - Handle the null block specially because we know a priori that it is filled
440 * with a single value.
443 enumEitherTrie(const UTrie2
*trie
,
444 UChar32 start
, UChar32 limit
,
445 UTrie2EnumValue
*enumValue
, UTrie2EnumRange
*enumRange
, const void *context
) {
446 const uint32_t *data32
;
449 uint32_t value
, prevValue
, initialValue
;
450 UChar32 c
, prev
, highStart
;
451 int32_t j
, i2Block
, prevI2Block
, index2NullOffset
, block
, prevBlock
, nullBlock
;
453 if(enumRange
==NULL
) {
456 if(enumValue
==NULL
) {
457 enumValue
=enumSameValue
;
460 if(trie
->newTrie
==NULL
) {
463 U_ASSERT(idx
!=NULL
); /* the following code assumes trie->newTrie is not NULL when idx is NULL */
466 index2NullOffset
=trie
->index2NullOffset
;
467 nullBlock
=trie
->dataNullOffset
;
469 /* unfrozen, mutable trie */
471 data32
=trie
->newTrie
->data
;
472 U_ASSERT(data32
!=NULL
); /* the following code assumes idx is not NULL when data32 is NULL */
474 index2NullOffset
=trie
->newTrie
->index2NullOffset
;
475 nullBlock
=trie
->newTrie
->dataNullOffset
;
478 highStart
=trie
->highStart
;
480 /* get the enumeration value that corresponds to an initial-value trie data entry */
481 initialValue
=enumValue(context
, trie
->initialValue
);
483 /* set variables for previous range */
489 /* enumerate index-2 blocks */
490 for(c
=start
; c
<limit
&& c
<highStart
;) {
491 /* Code point limit for iterating inside this i2Block. */
492 UChar32 tempLimit
=c
+UTRIE2_CP_PER_INDEX_1_ENTRY
;
493 if(limit
<tempLimit
) {
497 if(!U_IS_SURROGATE(c
)) {
498 i2Block
=c
>>UTRIE2_SHIFT_2
;
499 } else if(U_IS_SURROGATE_LEAD(c
)) {
501 * Enumerate values for lead surrogate code points, not code units:
502 * This special block has half the normal length.
504 i2Block
=UTRIE2_LSCP_INDEX_2_OFFSET
;
505 tempLimit
=MIN_VALUE(0xdc00, limit
);
508 * Switch back to the normal part of the index-2 table.
509 * Enumerate the second half of the surrogates block.
511 i2Block
=0xd800>>UTRIE2_SHIFT_2
;
512 tempLimit
=MIN_VALUE(0xe000, limit
);
515 /* supplementary code points */
517 i2Block
=idx
[(UTRIE2_INDEX_1_OFFSET
-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH
)+
518 (c
>>UTRIE2_SHIFT_1
)];
520 i2Block
=trie
->newTrie
->index1
[c
>>UTRIE2_SHIFT_1
];
522 if(i2Block
==prevI2Block
&& (c
-prev
)>=UTRIE2_CP_PER_INDEX_1_ENTRY
) {
524 * The index-2 block is the same as the previous one, and filled with prevValue.
525 * Only possible for supplementary code points because the linear-BMP index-2
526 * table creates unique i2Block values.
528 c
+=UTRIE2_CP_PER_INDEX_1_ENTRY
;
533 if(i2Block
==index2NullOffset
) {
534 /* this is the null index-2 block */
535 if(prevValue
!=initialValue
) {
536 if(prev
<c
&& !enumRange(context
, prev
, c
-1, prevValue
)) {
541 prevValue
=initialValue
;
543 c
+=UTRIE2_CP_PER_INDEX_1_ENTRY
;
545 /* enumerate data blocks for one index-2 block */
547 i2
=(c
>>UTRIE2_SHIFT_2
)&UTRIE2_INDEX_2_MASK
;
548 if((c
>>UTRIE2_SHIFT_1
)==(tempLimit
>>UTRIE2_SHIFT_1
)) {
549 i2Limit
=(tempLimit
>>UTRIE2_SHIFT_2
)&UTRIE2_INDEX_2_MASK
;
551 i2Limit
=UTRIE2_INDEX_2_BLOCK_LENGTH
;
553 for(; i2
<i2Limit
; ++i2
) {
555 block
=(int32_t)idx
[i2Block
+i2
]<<UTRIE2_INDEX_SHIFT
;
557 block
=trie
->newTrie
->index2
[i2Block
+i2
];
559 if(block
==prevBlock
&& (c
-prev
)>=UTRIE2_DATA_BLOCK_LENGTH
) {
560 /* the block is the same as the previous one, and filled with prevValue */
561 c
+=UTRIE2_DATA_BLOCK_LENGTH
;
565 if(block
==nullBlock
) {
566 /* this is the null data block */
567 if(prevValue
!=initialValue
) {
568 if(prev
<c
&& !enumRange(context
, prev
, c
-1, prevValue
)) {
572 prevValue
=initialValue
;
574 c
+=UTRIE2_DATA_BLOCK_LENGTH
;
576 for(j
=0; j
<UTRIE2_DATA_BLOCK_LENGTH
; ++j
) {
577 value
=enumValue(context
, data32
!=NULL
? data32
[block
+j
] : idx
[block
+j
]);
578 if(value
!=prevValue
) {
579 if(prev
<c
&& !enumRange(context
, prev
, c
-1, prevValue
)) {
593 c
=limit
; /* could be higher if in the index2NullOffset */
595 /* c==highStart<limit */
600 data32
[trie
->highValueIndex
] :
601 idx
[trie
->highValueIndex
];
603 highValue
=trie
->newTrie
->data
[trie
->newTrie
->dataLength
-UTRIE2_DATA_GRANULARITY
];
605 value
=enumValue(context
, highValue
);
606 if(value
!=prevValue
) {
607 if(prev
<c
&& !enumRange(context
, prev
, c
-1, prevValue
)) {
616 /* deliver last range */
617 enumRange(context
, prev
, c
-1, prevValue
);
620 U_CAPI
void U_EXPORT2
621 utrie2_enum(const UTrie2
*trie
,
622 UTrie2EnumValue
*enumValue
, UTrie2EnumRange
*enumRange
, const void *context
) {
623 enumEitherTrie(trie
, 0, 0x110000, enumValue
, enumRange
, context
);
626 U_CAPI
void U_EXPORT2
627 utrie2_enumForLeadSurrogate(const UTrie2
*trie
, UChar32 lead
,
628 UTrie2EnumValue
*enumValue
, UTrie2EnumRange
*enumRange
,
629 const void *context
) {
630 if(!U16_IS_LEAD(lead
)) {
633 lead
=(lead
-0xd7c0)<<10; /* start code point */
634 enumEitherTrie(trie
, lead
, lead
+0x400, enumValue
, enumRange
, context
);
637 /* C++ convenience wrappers ------------------------------------------------- */
641 uint16_t BackwardUTrie2StringIterator::previous16() {
642 codePointLimit
=codePointStart
;
643 if(start
>=codePointStart
) {
644 codePoint
=U_SENTINEL
;
645 return static_cast<uint16_t>(trie
->errorValue
);
648 UTRIE2_U16_PREV16(trie
, start
, codePointStart
, codePoint
, result
);
652 uint16_t ForwardUTrie2StringIterator::next16() {
653 codePointStart
=codePointLimit
;
654 if(codePointLimit
==limit
) {
655 codePoint
=U_SENTINEL
;
656 return static_cast<uint16_t>(trie
->errorValue
);
659 UTRIE2_U16_NEXT16(trie
, codePointLimit
, limit
, codePoint
, result
);