2 ******************************************************************************
4 * Copyright (C) 2001-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001oct20
14 * created by: Markus W. Scherer
16 * This is a common implementation of a "folded" trie.
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 * Unicode code points (0..0x10ffff).
25 #include "unicode/utypes.h"
31 #define ABS(x) ((x)>=0 ? (x) : -(x))
33 /* Building a trie ----------------------------------------------------------*/
35 U_CAPI UNewTrie
* U_EXPORT2
36 utrie_open(UNewTrie
*fillIn
,
37 uint32_t *aliasData
, int32_t maxDataLength
,
38 uint32_t initialValue
, uint32_t leadUnitValue
,
43 if( maxDataLength
<UTRIE_DATA_BLOCK_LENGTH
||
44 (latin1Linear
&& maxDataLength
<1024)
52 trie
=(UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
57 uprv_memset(trie
, 0, sizeof(UNewTrie
));
58 trie
->isAllocated
= (UBool
)(fillIn
==NULL
);
62 trie
->isDataAllocated
=FALSE
;
64 trie
->data
=(uint32_t *)uprv_malloc(maxDataLength
*4);
65 if(trie
->data
==NULL
) {
69 trie
->isDataAllocated
=TRUE
;
72 /* preallocate and reset the first data block (block index 0) */
73 j
=UTRIE_DATA_BLOCK_LENGTH
;
76 /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
77 /* made sure above that maxDataLength>=1024 */
79 /* set indexes to point to consecutive data blocks */
82 /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
84 j
+=UTRIE_DATA_BLOCK_LENGTH
;
85 } while(i
<(256>>UTRIE_SHIFT
));
88 /* reset the initially allocated blocks to the initial value */
91 trie
->data
[--j
]=initialValue
;
94 trie
->leadUnitValue
=leadUnitValue
;
95 trie
->indexLength
=UTRIE_MAX_INDEX_LENGTH
;
96 trie
->dataCapacity
=maxDataLength
;
97 trie
->isLatin1Linear
=latin1Linear
;
98 trie
->isCompacted
=FALSE
;
102 U_CAPI UNewTrie
* U_EXPORT2
103 utrie_clone(UNewTrie
*fillIn
, const UNewTrie
*other
, uint32_t *aliasData
, int32_t aliasDataCapacity
) {
105 UBool isDataAllocated
;
107 /* do not clone if other is not valid or already compacted */
108 if(other
==NULL
|| other
->data
==NULL
|| other
->isCompacted
) {
113 if(aliasData
!=NULL
&& aliasDataCapacity
>=other
->dataCapacity
) {
114 isDataAllocated
=FALSE
;
116 aliasDataCapacity
=other
->dataCapacity
;
117 aliasData
=(uint32_t *)uprv_malloc(other
->dataCapacity
*4);
118 if(aliasData
==NULL
) {
121 isDataAllocated
=TRUE
;
124 trie
=utrie_open(fillIn
, aliasData
, aliasDataCapacity
,
125 other
->data
[0], other
->leadUnitValue
,
126 other
->isLatin1Linear
);
128 uprv_free(aliasData
);
130 uprv_memcpy(trie
->index
, other
->index
, sizeof(trie
->index
));
131 uprv_memcpy(trie
->data
, other
->data
, other
->dataLength
*4);
132 trie
->dataLength
=other
->dataLength
;
133 trie
->isDataAllocated
=isDataAllocated
;
139 U_CAPI
void U_EXPORT2
140 utrie_close(UNewTrie
*trie
) {
142 if(trie
->isDataAllocated
) {
143 uprv_free(trie
->data
);
146 if(trie
->isAllocated
) {
152 U_CAPI
uint32_t * U_EXPORT2
153 utrie_getData(UNewTrie
*trie
, int32_t *pLength
) {
154 if(trie
==NULL
|| pLength
==NULL
) {
158 *pLength
=trie
->dataLength
;
163 utrie_allocDataBlock(UNewTrie
*trie
) {
164 int32_t newBlock
, newTop
;
166 newBlock
=trie
->dataLength
;
167 newTop
=newBlock
+UTRIE_DATA_BLOCK_LENGTH
;
168 if(newTop
>trie
->dataCapacity
) {
169 /* out of memory in the data array */
172 trie
->dataLength
=newTop
;
177 * No error checking for illegal arguments.
179 * @return -1 if no new data block available (out of memory in data array)
183 utrie_getDataBlock(UNewTrie
*trie
, UChar32 c
) {
184 int32_t indexValue
, newBlock
;
187 indexValue
=trie
->index
[c
];
192 /* allocate a new data block */
193 newBlock
=utrie_allocDataBlock(trie
);
195 /* out of memory in the data array */
198 trie
->index
[c
]=newBlock
;
200 /* copy-on-write for a block from a setRange() */
201 uprv_memcpy(trie
->data
+newBlock
, trie
->data
-indexValue
, 4*UTRIE_DATA_BLOCK_LENGTH
);
206 * @return TRUE if the value was successfully set
208 U_CAPI UBool U_EXPORT2
209 utrie_set32(UNewTrie
*trie
, UChar32 c
, uint32_t value
) {
212 /* valid, uncompacted trie and valid c? */
213 if(trie
==NULL
|| trie
->isCompacted
|| (uint32_t)c
>0x10ffff) {
217 block
=utrie_getDataBlock(trie
, c
);
222 trie
->data
[block
+(c
&UTRIE_MASK
)]=value
;
226 U_CAPI
uint32_t U_EXPORT2
227 utrie_get32(UNewTrie
*trie
, UChar32 c
, UBool
*pInBlockZero
) {
230 /* valid, uncompacted trie and valid c? */
231 if(trie
==NULL
|| trie
->isCompacted
|| (uint32_t)c
>0x10ffff) {
232 if(pInBlockZero
!=NULL
) {
238 block
=trie
->index
[c
>>UTRIE_SHIFT
];
239 if(pInBlockZero
!=NULL
) {
240 *pInBlockZero
= (UBool
)(block
==0);
243 return trie
->data
[ABS(block
)+(c
&UTRIE_MASK
)];
250 utrie_fillBlock(uint32_t *block
, UChar32 start
, UChar32 limit
,
251 uint32_t value
, uint32_t initialValue
, UBool overwrite
) {
257 while(block
<pLimit
) {
261 while(block
<pLimit
) {
262 if(*block
==initialValue
) {
270 U_CAPI UBool U_EXPORT2
271 utrie_setRange32(UNewTrie
*trie
, UChar32 start
, UChar32 limit
, uint32_t value
, UBool overwrite
) {
273 * repeat value in [start..limit[
274 * mark index values for repeat-data blocks by setting bit 31 of the index values
275 * fill around existing values if any, if(overwrite)
277 uint32_t initialValue
;
278 int32_t block
, rest
, repeatBlock
;
280 /* valid, uncompacted trie and valid indexes? */
281 if( trie
==NULL
|| trie
->isCompacted
||
282 (uint32_t)start
>0x10ffff || (uint32_t)limit
>0x110000 || start
>limit
287 return TRUE
; /* nothing to do */
290 initialValue
=trie
->data
[0];
291 if(start
&UTRIE_MASK
) {
294 /* set partial block at [start..following block boundary[ */
295 block
=utrie_getDataBlock(trie
, start
);
300 nextStart
=(start
+UTRIE_DATA_BLOCK_LENGTH
)&~UTRIE_MASK
;
301 if(nextStart
<=limit
) {
302 utrie_fillBlock(trie
->data
+block
, start
&UTRIE_MASK
, UTRIE_DATA_BLOCK_LENGTH
,
303 value
, initialValue
, overwrite
);
306 utrie_fillBlock(trie
->data
+block
, start
&UTRIE_MASK
, limit
&UTRIE_MASK
,
307 value
, initialValue
, overwrite
);
312 /* number of positions in the last, partial block */
313 rest
=limit
&UTRIE_MASK
;
315 /* round down limit to a block boundary */
318 /* iterate over all-value blocks */
319 if(value
==initialValue
) {
325 /* get index value */
326 block
=trie
->index
[start
>>UTRIE_SHIFT
];
328 /* already allocated, fill in value */
329 utrie_fillBlock(trie
->data
+block
, 0, UTRIE_DATA_BLOCK_LENGTH
, value
, initialValue
, overwrite
);
330 } else if(trie
->data
[-block
]!=value
&& (block
==0 || overwrite
)) {
331 /* set the repeatBlock instead of the current block 0 or range block */
333 trie
->index
[start
>>UTRIE_SHIFT
]=-repeatBlock
;
335 /* create and set and fill the repeatBlock */
336 repeatBlock
=utrie_getDataBlock(trie
, start
);
341 /* set the negative block number to indicate that it is a repeat block */
342 trie
->index
[start
>>UTRIE_SHIFT
]=-repeatBlock
;
343 utrie_fillBlock(trie
->data
+repeatBlock
, 0, UTRIE_DATA_BLOCK_LENGTH
, value
, initialValue
, TRUE
);
347 start
+=UTRIE_DATA_BLOCK_LENGTH
;
351 /* set partial block at [last block boundary..limit[ */
352 block
=utrie_getDataBlock(trie
, start
);
357 utrie_fillBlock(trie
->data
+block
, 0, rest
, value
, initialValue
, overwrite
);
364 _findSameIndexBlock(const int32_t *index
, int32_t indexLength
,
365 int32_t otherBlock
) {
368 for(block
=UTRIE_BMP_INDEX_LENGTH
; block
<indexLength
; block
+=UTRIE_SURROGATE_BLOCK_COUNT
) {
369 for(i
=0; i
<UTRIE_SURROGATE_BLOCK_COUNT
; ++i
) {
370 if(index
[block
+i
]!=index
[otherBlock
+i
]) {
374 if(i
==UTRIE_SURROGATE_BLOCK_COUNT
) {
382 * Fold the normalization data for supplementary code points into
383 * a compact area on top of the BMP-part of the trie index,
384 * with the lead surrogates indexing this compact area.
386 * Duplicate the index values for lead surrogates:
387 * From inside the BMP area, where some may be overridden with folded values,
388 * to just after the BMP area, where they can be retrieved for
389 * code point lookups.
392 utrie_fold(UNewTrie
*trie
, UNewTrieGetFoldedValue
*getFoldedValue
, UErrorCode
*pErrorCode
) {
393 int32_t leadIndexes
[UTRIE_SURROGATE_BLOCK_COUNT
];
397 int32_t indexLength
, block
;
401 /* copy the lead surrogate indexes into a temporary array */
402 uprv_memcpy(leadIndexes
, index
+(0xd800>>UTRIE_SHIFT
), 4*UTRIE_SURROGATE_BLOCK_COUNT
);
405 * set all values for lead surrogate code *units* to leadUnitValue
406 * so that, by default, runtime lookups will find no data for associated
407 * supplementary code points, unless there is data for such code points
408 * which will result in a non-zero folding value below that is set for
409 * the respective lead units
411 * the above saved the indexes for surrogate code *points*
412 * fill the indexes with simplified code from utrie_setRange32()
414 if(trie
->leadUnitValue
==trie
->data
[0]) {
415 block
=0; /* leadUnitValue==initialValue, use all-initial-value block */
417 /* create and fill the repeatBlock */
418 block
=utrie_allocDataBlock(trie
);
420 /* data table overflow */
421 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
424 utrie_fillBlock(trie
->data
+block
, 0, UTRIE_DATA_BLOCK_LENGTH
, trie
->leadUnitValue
, trie
->data
[0], TRUE
);
425 block
=-block
; /* negative block number to indicate that it is a repeat block */
427 for(c
=(0xd800>>UTRIE_SHIFT
); c
<(0xdc00>>UTRIE_SHIFT
); ++c
) {
428 trie
->index
[c
]=block
;
432 * Fold significant index values into the area just after the BMP indexes.
433 * In case the first lead surrogate has significant data,
434 * its index block must be used first (in which case the folding is a no-op).
435 * Later all folded index blocks are moved up one to insert the copied
436 * lead surrogate indexes.
438 indexLength
=UTRIE_BMP_INDEX_LENGTH
;
440 /* search for any index (stage 1) entries for supplementary code points */
441 for(c
=0x10000; c
<0x110000;) {
442 if(index
[c
>>UTRIE_SHIFT
]!=0) {
443 /* there is data, treat the full block for a lead surrogate */
447 printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c
>>10)));
450 /* is there an identical index block? */
451 block
=_findSameIndexBlock(index
, indexLength
, c
>>UTRIE_SHIFT
);
454 * get a folded value for [c..c+0x400[ and,
455 * if different from the value for the lead surrogate code point,
456 * set it for the lead surrogate code unit
458 value
=getFoldedValue(trie
, c
, block
+UTRIE_SURROGATE_BLOCK_COUNT
);
459 if(value
!=utrie_get32(trie
, U16_LEAD(c
), NULL
)) {
460 if(!utrie_set32(trie
, U16_LEAD(c
), value
)) {
461 /* data table overflow */
462 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
466 /* if we did not find an identical index block... */
467 if(block
==indexLength
) {
468 /* move the actual index (stage 1) entries from the supplementary position to the new one */
469 uprv_memmove(index
+indexLength
,
470 index
+(c
>>UTRIE_SHIFT
),
471 4*UTRIE_SURROGATE_BLOCK_COUNT
);
472 indexLength
+=UTRIE_SURROGATE_BLOCK_COUNT
;
477 c
+=UTRIE_DATA_BLOCK_LENGTH
;
482 * index array overflow?
483 * This is to guarantee that a folding offset is of the form
484 * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
485 * If the index is too large, then n>=1024 and more than 10 bits are necessary.
487 * In fact, it can only ever become n==1024 with completely unfoldable data and
488 * the additional block of duplicated values for lead surrogates.
490 if(indexLength
>=UTRIE_MAX_INDEX_LENGTH
) {
491 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
496 * make space for the lead surrogate index block and
497 * insert it between the BMP indexes and the folded ones
499 uprv_memmove(index
+UTRIE_BMP_INDEX_LENGTH
+UTRIE_SURROGATE_BLOCK_COUNT
,
500 index
+UTRIE_BMP_INDEX_LENGTH
,
501 4*(indexLength
-UTRIE_BMP_INDEX_LENGTH
));
502 uprv_memcpy(index
+UTRIE_BMP_INDEX_LENGTH
,
504 4*UTRIE_SURROGATE_BLOCK_COUNT
);
505 indexLength
+=UTRIE_SURROGATE_BLOCK_COUNT
;
508 printf("trie index count: BMP %ld all Unicode %ld folded %ld\n",
509 UTRIE_BMP_INDEX_LENGTH
, (long)UTRIE_MAX_INDEX_LENGTH
, indexLength
);
512 trie
->indexLength
=indexLength
;
516 * Set a value in the trie index map to indicate which data block
517 * is referenced and which one is not.
518 * utrie_compact() will remove data blocks that are not used at all.
521 * - -1 if it is not used
524 _findUnusedBlocks(UNewTrie
*trie
) {
527 /* fill the entire map with "not used" */
528 uprv_memset(trie
->map
, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH
>>UTRIE_SHIFT
)*4);
530 /* mark each block that _is_ used with 0 */
531 for(i
=0; i
<trie
->indexLength
; ++i
) {
532 trie
->map
[ABS(trie
->index
[i
])>>UTRIE_SHIFT
]=0;
535 /* never move the all-initial-value block 0 */
540 _findSameDataBlock(const uint32_t *data
, int32_t dataLength
,
541 int32_t otherBlock
, int32_t step
) {
544 /* ensure that we do not even partially get past dataLength */
545 dataLength
-=UTRIE_DATA_BLOCK_LENGTH
;
547 for(block
=0; block
<=dataLength
; block
+=step
) {
548 for(i
=0; i
<UTRIE_DATA_BLOCK_LENGTH
; ++i
) {
549 if(data
[block
+i
]!=data
[otherBlock
+i
]) {
553 if(i
==UTRIE_DATA_BLOCK_LENGTH
) {
561 * Compact a folded build-time trie.
564 * - removes blocks that are identical with earlier ones
565 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
566 * - moves blocks in steps of the data granularity
569 * - try to move and overlap blocks that are not already adjacent
570 * - try to move and overlap blocks that overlap with multiple values in the overlap region
573 utrie_compact(UNewTrie
*trie
, UBool overlap
, UErrorCode
*pErrorCode
) {
575 int32_t i
, start
, prevEnd
, newStart
, overlapStart
;
577 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
581 /* valid, uncompacted trie? */
583 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
586 if(trie
->isCompacted
) {
587 return; /* nothing left to do */
592 /* initialize the index map with "block is used/unused" flags */
593 _findUnusedBlocks(trie
);
595 /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
596 if(trie
->isLatin1Linear
&& UTRIE_SHIFT
<=8) {
597 overlapStart
=UTRIE_DATA_BLOCK_LENGTH
+256;
599 overlapStart
=UTRIE_DATA_BLOCK_LENGTH
;
602 newStart
=UTRIE_DATA_BLOCK_LENGTH
;
604 for(start
=newStart
; start
<trie
->dataLength
;) {
606 * start: index of first entry of current block
607 * prevEnd: index to last entry of previous block
608 * newStart: index where the current block is to be moved
611 /* skip blocks that are not used */
612 if(trie
->map
[start
>>UTRIE_SHIFT
]<0) {
613 /* advance start to the next block */
614 start
+=UTRIE_DATA_BLOCK_LENGTH
;
616 /* leave prevEnd and newStart with the previous block! */
620 /* search for an identical block */
621 if( start
>=overlapStart
&&
622 (i
=_findSameDataBlock(trie
->data
, newStart
, start
,
623 overlap
? UTRIE_DATA_GRANULARITY
: UTRIE_DATA_BLOCK_LENGTH
))
626 /* found an identical block, set the other block's index value for the current block */
627 trie
->map
[start
>>UTRIE_SHIFT
]=i
;
629 /* advance start to the next block */
630 start
+=UTRIE_DATA_BLOCK_LENGTH
;
632 /* leave prevEnd and newStart with the previous block! */
636 /* see if the beginning of this block can be overlapped with the end of the previous block */
637 /* x: first value in the current block */
639 if(x
==trie
->data
[prevEnd
] && overlap
&& start
>=overlapStart
) {
640 /* overlap by at least one */
641 for(i
=1; i
<UTRIE_DATA_BLOCK_LENGTH
&& x
==trie
->data
[start
+i
] && x
==trie
->data
[prevEnd
-i
]; ++i
) {}
643 /* overlap by i, rounded down for the data block granularity */
644 i
&=~(UTRIE_DATA_GRANULARITY
-1);
651 trie
->map
[start
>>UTRIE_SHIFT
]=newStart
-i
;
653 /* move the non-overlapping indexes to their new positions */
655 for(i
=UTRIE_DATA_BLOCK_LENGTH
-i
; i
>0; --i
) {
656 trie
->data
[newStart
++]=trie
->data
[start
++];
658 } else if(newStart
<start
) {
659 /* no overlap, just move the indexes to their new positions */
660 trie
->map
[start
>>UTRIE_SHIFT
]=newStart
;
661 for(i
=UTRIE_DATA_BLOCK_LENGTH
; i
>0; --i
) {
662 trie
->data
[newStart
++]=trie
->data
[start
++];
664 } else /* no overlap && newStart==start */ {
665 trie
->map
[start
>>UTRIE_SHIFT
]=start
;
666 newStart
+=UTRIE_DATA_BLOCK_LENGTH
;
673 /* now adjust the index (stage 1) table */
674 for(i
=0; i
<trie
->indexLength
; ++i
) {
675 trie
->index
[i
]=trie
->map
[ABS(trie
->index
[i
])>>UTRIE_SHIFT
];
679 /* we saved some space */
680 printf("compacting trie: count of 32-bit words %lu->%lu\n",
681 (long)trie
->dataLength
, (long)newStart
);
684 trie
->dataLength
=newStart
;
687 /* serialization ------------------------------------------------------------ */
690 * Trie data structure in serialized form:
692 * UTrieHeader header;
693 * uint16_t index[header.indexLength];
694 * uint16_t data[header.dataLength];
697 /** "Trie" in big-endian US-ASCII (0x54726965) */
702 * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
703 * 8 0=16-bit data, 1=32-bit data
704 * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT
705 * 3..0 UTRIE_SHIFT // 1..9
709 /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
712 /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
716 typedef struct UTrieHeader UTrieHeader
;
719 * Constants for use with UTrieHeader.options.
722 /** Mask to get the UTRIE_SHIFT value from options. */
723 UTRIE_OPTIONS_SHIFT_MASK
=0xf,
725 /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
726 UTRIE_OPTIONS_INDEX_SHIFT
=4,
728 /** If set, then the data (stage 2) array is 32 bits wide. */
729 UTRIE_OPTIONS_DATA_IS_32_BIT
=0x100,
732 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
733 * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
735 UTRIE_OPTIONS_LATIN1_IS_LINEAR
=0x200
739 * Default function for the folding value:
740 * Just store the offset (16 bits) if there is any non-initial-value entry.
742 * The offset parameter is never 0.
743 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
744 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
745 * which fits into 16-bit trie values;
746 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
748 * Theoretically, it would be safer for all possible UTRIE_SHIFT including
749 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
750 * which would always result in a value of 0x40..0x43f
751 * (start/end 1k blocks of supplementary Unicode code points).
752 * However, this would be uglier, and would not work for some existing
753 * binary data file formats.
755 * Also, we do not plan to change UTRIE_SHIFT because it would change binary
756 * data file formats, and we would probably not make it smaller because of
757 * the then even larger BMP index length even for empty tries.
759 static uint32_t U_CALLCONV
760 defaultGetFoldedValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
761 uint32_t value
, initialValue
;
765 initialValue
=trie
->data
[0];
768 value
=utrie_get32(trie
, start
, &inBlockZero
);
770 start
+=UTRIE_DATA_BLOCK_LENGTH
;
771 } else if(value
!=initialValue
) {
772 return (uint32_t)offset
;
780 U_CAPI
int32_t U_EXPORT2
781 utrie_serialize(UNewTrie
*trie
, void *dt
, int32_t capacity
,
782 UNewTrieGetFoldedValue
*getFoldedValue
,
783 UBool reduceTo16Bits
,
784 UErrorCode
*pErrorCode
) {
789 uint8_t* data
= NULL
;
792 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
796 if(trie
==NULL
|| capacity
<0 || (capacity
>0 && dt
==NULL
)) {
797 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
800 if(getFoldedValue
==NULL
) {
801 getFoldedValue
=defaultGetFoldedValue
;
805 /* fold and compact if necessary, also checks that indexLength is within limits */
806 if(!trie
->isCompacted
) {
807 /* compact once without overlap to improve folding */
808 utrie_compact(trie
, FALSE
, pErrorCode
);
810 /* fold the supplementary part of the index array */
811 utrie_fold(trie
, getFoldedValue
, pErrorCode
);
813 /* compact again with overlap for minimum data array length */
814 utrie_compact(trie
, TRUE
, pErrorCode
);
816 trie
->isCompacted
=TRUE
;
817 if(U_FAILURE(*pErrorCode
)) {
822 /* is dataLength within limits? */
823 if( (reduceTo16Bits
? (trie
->dataLength
+trie
->indexLength
) : trie
->dataLength
) >= UTRIE_MAX_DATA_LENGTH
) {
824 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
827 length
=sizeof(UTrieHeader
)+2*trie
->indexLength
;
829 length
+=2*trie
->dataLength
;
831 length
+=4*trie
->dataLength
;
834 if(length
>capacity
) {
835 return length
; /* preflighting */
838 /* set the header fields */
839 header
=(UTrieHeader
*)data
;
840 data
+=sizeof(UTrieHeader
);
842 header
->signature
=0x54726965; /* "Trie" */
843 header
->options
=UTRIE_SHIFT
| (UTRIE_INDEX_SHIFT
<<UTRIE_OPTIONS_INDEX_SHIFT
);
845 if(!reduceTo16Bits
) {
846 header
->options
|=UTRIE_OPTIONS_DATA_IS_32_BIT
;
848 if(trie
->isLatin1Linear
) {
849 header
->options
|=UTRIE_OPTIONS_LATIN1_IS_LINEAR
;
852 header
->indexLength
=trie
->indexLength
;
853 header
->dataLength
=trie
->dataLength
;
855 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
857 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
858 p
=(uint32_t *)trie
->index
;
859 dest16
=(uint16_t *)data
;
860 for(i
=trie
->indexLength
; i
>0; --i
) {
861 *dest16
++=(uint16_t)((*p
++ + trie
->indexLength
)>>UTRIE_INDEX_SHIFT
);
864 /* write 16-bit data values */
866 for(i
=trie
->dataLength
; i
>0; --i
) {
867 *dest16
++=(uint16_t)*p
++;
870 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
871 p
=(uint32_t *)trie
->index
;
872 dest16
=(uint16_t *)data
;
873 for(i
=trie
->indexLength
; i
>0; --i
) {
874 *dest16
++=(uint16_t)(*p
++ >> UTRIE_INDEX_SHIFT
);
877 /* write 32-bit data values */
878 uprv_memcpy(dest16
, trie
->data
, 4*trie
->dataLength
);
884 /* inverse to defaultGetFoldedValue() */
885 static int32_t U_CALLCONV
886 defaultGetFoldingOffset(uint32_t data
) {
887 return (int32_t)data
;
890 U_CAPI
int32_t U_EXPORT2
891 utrie_unserialize(UTrie
*trie
, const void *data
, int32_t length
, UErrorCode
*pErrorCode
) {
896 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
900 /* enough data for a trie header? */
901 if(length
<sizeof(UTrieHeader
)) {
902 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
906 /* check the signature */
907 header
=(UTrieHeader
*)data
;
908 if(header
->signature
!=0x54726965) {
909 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
913 /* get the options and check the shift values */
914 options
=header
->options
;
915 if( (options
&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_SHIFT
||
916 ((options
>>UTRIE_OPTIONS_INDEX_SHIFT
)&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_INDEX_SHIFT
918 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
921 trie
->isLatin1Linear
= (UBool
)((options
&UTRIE_OPTIONS_LATIN1_IS_LINEAR
)!=0);
923 /* get the length values */
924 trie
->indexLength
=header
->indexLength
;
925 trie
->dataLength
=header
->dataLength
;
927 length
-=(int32_t)sizeof(UTrieHeader
);
929 /* enough data for the index? */
930 if(length
<2*trie
->indexLength
) {
931 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
934 p16
=(uint16_t *)(header
+1);
936 p16
+=trie
->indexLength
;
937 length
-=2*trie
->indexLength
;
940 if(options
&UTRIE_OPTIONS_DATA_IS_32_BIT
) {
941 if(length
<4*trie
->dataLength
) {
942 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
945 trie
->data32
=(const uint32_t *)p16
;
946 trie
->initialValue
=trie
->data32
[0];
947 length
=(int32_t)sizeof(UTrieHeader
)+2*trie
->indexLength
+4*trie
->dataLength
;
949 if(length
<2*trie
->dataLength
) {
950 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
954 /* the "data16" data is used via the index pointer */
956 trie
->initialValue
=trie
->index
[trie
->indexLength
];
957 length
=(int32_t)sizeof(UTrieHeader
)+2*trie
->indexLength
+2*trie
->dataLength
;
960 trie
->getFoldingOffset
=defaultGetFoldingOffset
;
965 /* swapping ----------------------------------------------------------------- */
967 U_CAPI
int32_t U_EXPORT2
968 utrie_swap(const UDataSwapper
*ds
,
969 const void *inData
, int32_t length
, void *outData
,
970 UErrorCode
*pErrorCode
) {
971 const UTrieHeader
*inTrie
;
976 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
979 if(ds
==NULL
|| inData
==NULL
|| (length
>=0 && outData
==NULL
)) {
980 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
984 /* setup and swapping */
985 if(length
>=0 && length
<sizeof(UTrieHeader
)) {
986 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
990 inTrie
=(const UTrieHeader
*)inData
;
991 trie
.signature
=ds
->readUInt32(inTrie
->signature
);
992 trie
.options
=ds
->readUInt32(inTrie
->options
);
993 trie
.indexLength
=udata_readInt32(ds
, inTrie
->indexLength
);
994 trie
.dataLength
=udata_readInt32(ds
, inTrie
->dataLength
);
996 if( trie
.signature
!=0x54726965 ||
997 (trie
.options
&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_SHIFT
||
998 ((trie
.options
>>UTRIE_OPTIONS_INDEX_SHIFT
)&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_INDEX_SHIFT
||
999 trie
.indexLength
<UTRIE_BMP_INDEX_LENGTH
||
1000 (trie
.indexLength
&(UTRIE_SURROGATE_BLOCK_COUNT
-1))!=0 ||
1001 trie
.dataLength
<UTRIE_DATA_BLOCK_LENGTH
||
1002 (trie
.dataLength
&(UTRIE_DATA_GRANULARITY
-1))!=0 ||
1003 ((trie
.options
&UTRIE_OPTIONS_LATIN1_IS_LINEAR
)!=0 && trie
.dataLength
<(UTRIE_DATA_BLOCK_LENGTH
+0x100))
1005 *pErrorCode
=U_INVALID_FORMAT_ERROR
; /* not a UTrie */
1009 dataIs32
=(UBool
)((trie
.options
&UTRIE_OPTIONS_DATA_IS_32_BIT
)!=0);
1010 size
=sizeof(UTrieHeader
)+trie
.indexLength
*2+trie
.dataLength
*(dataIs32
?4:2);
1013 UTrieHeader
*outTrie
;
1016 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1020 outTrie
=(UTrieHeader
*)outData
;
1022 /* swap the header */
1023 ds
->swapArray32(ds
, inTrie
, sizeof(UTrieHeader
), outTrie
, pErrorCode
);
1025 /* swap the index and the data */
1027 ds
->swapArray16(ds
, inTrie
+1, trie
.indexLength
*2, outTrie
+1, pErrorCode
);
1028 ds
->swapArray32(ds
, (const uint16_t *)(inTrie
+1)+trie
.indexLength
, trie
.dataLength
*4,
1029 (uint16_t *)(outTrie
+1)+trie
.indexLength
, pErrorCode
);
1031 ds
->swapArray16(ds
, inTrie
+1, (trie
.indexLength
+trie
.dataLength
)*2, outTrie
+1, pErrorCode
);
1038 /* enumeration -------------------------------------------------------------- */
1040 /* default UTrieEnumValue() returns the input value itself */
1041 static uint32_t U_CALLCONV
1042 enumSameValue(const void *context
, uint32_t value
) {
1047 * Enumerate all ranges of code points with the same relevant values.
1048 * The values are transformed from the raw trie entries by the enumValue function.
1050 U_CAPI
void U_EXPORT2
1051 utrie_enum(const UTrie
*trie
,
1052 UTrieEnumValue
*enumValue
, UTrieEnumRange
*enumRange
, const void *context
) {
1053 const uint32_t *data32
;
1054 const uint16_t *index
;
1056 uint32_t value
, prevValue
, initialValue
;
1058 int32_t l
, i
, j
, block
, prevBlock
, offset
;
1060 /* check arguments */
1061 if(trie
==NULL
|| trie
->index
==NULL
|| enumRange
==NULL
) {
1064 if(enumValue
==NULL
) {
1065 enumValue
=enumSameValue
;
1069 data32
=trie
->data32
;
1071 /* get the enumeration value that corresponds to an initial-value trie data entry */
1072 initialValue
=enumValue(context
, trie
->initialValue
);
1074 /* set variables for previous range */
1077 prevValue
=initialValue
;
1079 /* enumerate BMP - the main loop enumerates data blocks */
1080 for(i
=0, c
=0; c
<=0xffff; ++i
) {
1082 /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1083 i
=UTRIE_BMP_INDEX_LENGTH
;
1084 } else if(c
==0xdc00) {
1085 /* go back to regular BMP code points */
1089 block
=index
[i
]<<UTRIE_INDEX_SHIFT
;
1090 if(block
==prevBlock
) {
1091 /* the block is the same as the previous one, and filled with value */
1092 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1093 } else if(block
==0) {
1094 /* this is the all-initial-value block */
1095 if(prevValue
!=initialValue
) {
1097 if(!enumRange(context
, prev
, c
, prevValue
)) {
1103 prevValue
=initialValue
;
1105 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1108 for(j
=0; j
<UTRIE_DATA_BLOCK_LENGTH
; ++j
) {
1109 value
=enumValue(context
, data32
!=NULL
? data32
[block
+j
] : index
[block
+j
]);
1110 if(value
!=prevValue
) {
1112 if(!enumRange(context
, prev
, c
, prevValue
)) {
1127 /* enumerate supplementary code points */
1128 for(l
=0xd800; l
<0xdc00;) {
1129 /* lead surrogate access */
1130 offset
=index
[l
>>UTRIE_SHIFT
]<<UTRIE_INDEX_SHIFT
;
1131 if(offset
==(data32
!=NULL
? 0 : trie
->indexLength
)) {
1132 /* no entries for a whole block of lead surrogates */
1133 if(prevValue
!=initialValue
) {
1135 if(!enumRange(context
, prev
, c
, prevValue
)) {
1141 prevValue
=initialValue
;
1144 l
+=UTRIE_DATA_BLOCK_LENGTH
;
1145 c
+=UTRIE_DATA_BLOCK_LENGTH
<<10;
1149 value
= data32
!=NULL
? data32
[offset
+(l
&UTRIE_MASK
)] : index
[offset
+(l
&UTRIE_MASK
)];
1151 /* enumerate trail surrogates for this lead surrogate */
1152 offset
=trie
->getFoldingOffset(value
);
1154 /* no data for this lead surrogate */
1155 if(prevValue
!=initialValue
) {
1157 if(!enumRange(context
, prev
, c
, prevValue
)) {
1163 prevValue
=initialValue
;
1166 /* nothing else to do for the supplementary code points for this lead surrogate */
1169 /* enumerate code points for this lead surrogate */
1171 offset
+=UTRIE_SURROGATE_BLOCK_COUNT
;
1173 /* copy of most of the body of the BMP loop */
1174 block
=index
[i
]<<UTRIE_INDEX_SHIFT
;
1175 if(block
==prevBlock
) {
1176 /* the block is the same as the previous one, and filled with value */
1177 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1178 } else if(block
==0) {
1179 /* this is the all-initial-value block */
1180 if(prevValue
!=initialValue
) {
1182 if(!enumRange(context
, prev
, c
, prevValue
)) {
1188 prevValue
=initialValue
;
1190 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1193 for(j
=0; j
<UTRIE_DATA_BLOCK_LENGTH
; ++j
) {
1194 value
=enumValue(context
, data32
!=NULL
? data32
[block
+j
] : index
[block
+j
]);
1195 if(value
!=prevValue
) {
1197 if(!enumRange(context
, prev
, c
, prevValue
)) {
1210 } while(++i
<offset
);
1216 /* deliver last range */
1217 enumRange(context
, prev
, c
, prevValue
);