2 ******************************************************************************
4 * Copyright (C) 2001-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2001oct20
14 * created by: Markus W. Scherer
16 * This is a common implementation of a "folded" trie.
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 * Unicode code points (0..0x10ffff).
25 #include "unicode/utypes.h"
29 /* miscellaneous ------------------------------------------------------------ */
32 #define ABS(x) ((x)>=0 ? (x) : -(x))
35 equal_uint32(const uint32_t *s
, const uint32_t *t
, int32_t length
) {
36 while(length
>0 && *s
==*t
) {
41 return (UBool
)(length
==0);
44 /* Building a trie ----------------------------------------------------------*/
46 U_CAPI UNewTrie
* U_EXPORT2
47 utrie_open(UNewTrie
*fillIn
,
48 uint32_t *aliasData
, int32_t maxDataLength
,
49 uint32_t initialValue
, uint32_t leadUnitValue
,
54 if( maxDataLength
<UTRIE_DATA_BLOCK_LENGTH
||
55 (latin1Linear
&& maxDataLength
<1024)
63 trie
=(UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
68 uprv_memset(trie
, 0, sizeof(UNewTrie
));
69 trie
->isAllocated
= (UBool
)(fillIn
==NULL
);
73 trie
->isDataAllocated
=FALSE
;
75 trie
->data
=(uint32_t *)uprv_malloc(maxDataLength
*4);
76 if(trie
->data
==NULL
) {
80 trie
->isDataAllocated
=TRUE
;
83 /* preallocate and reset the first data block (block index 0) */
84 j
=UTRIE_DATA_BLOCK_LENGTH
;
87 /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
88 /* made sure above that maxDataLength>=1024 */
90 /* set indexes to point to consecutive data blocks */
93 /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
95 j
+=UTRIE_DATA_BLOCK_LENGTH
;
96 } while(i
<(256>>UTRIE_SHIFT
));
99 /* reset the initially allocated blocks to the initial value */
102 trie
->data
[--j
]=initialValue
;
105 trie
->leadUnitValue
=leadUnitValue
;
106 trie
->indexLength
=UTRIE_MAX_INDEX_LENGTH
;
107 trie
->dataCapacity
=maxDataLength
;
108 trie
->isLatin1Linear
=latin1Linear
;
109 trie
->isCompacted
=FALSE
;
113 U_CAPI UNewTrie
* U_EXPORT2
114 utrie_clone(UNewTrie
*fillIn
, const UNewTrie
*other
, uint32_t *aliasData
, int32_t aliasDataCapacity
) {
116 UBool isDataAllocated
;
118 /* do not clone if other is not valid or already compacted */
119 if(other
==NULL
|| other
->data
==NULL
|| other
->isCompacted
) {
124 if(aliasData
!=NULL
&& aliasDataCapacity
>=other
->dataCapacity
) {
125 isDataAllocated
=FALSE
;
127 aliasDataCapacity
=other
->dataCapacity
;
128 aliasData
=(uint32_t *)uprv_malloc(other
->dataCapacity
*4);
129 if(aliasData
==NULL
) {
132 isDataAllocated
=TRUE
;
135 trie
=utrie_open(fillIn
, aliasData
, aliasDataCapacity
,
136 other
->data
[0], other
->leadUnitValue
,
137 other
->isLatin1Linear
);
139 uprv_free(aliasData
);
141 uprv_memcpy(trie
->index
, other
->index
, sizeof(trie
->index
));
142 uprv_memcpy(trie
->data
, other
->data
, other
->dataLength
*4);
143 trie
->dataLength
=other
->dataLength
;
144 trie
->isDataAllocated
=isDataAllocated
;
150 U_CAPI
void U_EXPORT2
151 utrie_close(UNewTrie
*trie
) {
153 if(trie
->isDataAllocated
) {
154 uprv_free(trie
->data
);
157 if(trie
->isAllocated
) {
163 U_CAPI
uint32_t * U_EXPORT2
164 utrie_getData(UNewTrie
*trie
, int32_t *pLength
) {
165 if(trie
==NULL
|| pLength
==NULL
) {
169 *pLength
=trie
->dataLength
;
174 utrie_allocDataBlock(UNewTrie
*trie
) {
175 int32_t newBlock
, newTop
;
177 newBlock
=trie
->dataLength
;
178 newTop
=newBlock
+UTRIE_DATA_BLOCK_LENGTH
;
179 if(newTop
>trie
->dataCapacity
) {
180 /* out of memory in the data array */
183 trie
->dataLength
=newTop
;
188 * No error checking for illegal arguments.
190 * @return -1 if no new data block available (out of memory in data array)
194 utrie_getDataBlock(UNewTrie
*trie
, UChar32 c
) {
195 int32_t indexValue
, newBlock
;
198 indexValue
=trie
->index
[c
];
203 /* allocate a new data block */
204 newBlock
=utrie_allocDataBlock(trie
);
206 /* out of memory in the data array */
209 trie
->index
[c
]=newBlock
;
211 /* copy-on-write for a block from a setRange() */
212 uprv_memcpy(trie
->data
+newBlock
, trie
->data
-indexValue
, 4*UTRIE_DATA_BLOCK_LENGTH
);
217 * @return TRUE if the value was successfully set
219 U_CAPI UBool U_EXPORT2
220 utrie_set32(UNewTrie
*trie
, UChar32 c
, uint32_t value
) {
223 /* valid, uncompacted trie and valid c? */
224 if(trie
==NULL
|| trie
->isCompacted
|| (uint32_t)c
>0x10ffff) {
228 block
=utrie_getDataBlock(trie
, c
);
233 trie
->data
[block
+(c
&UTRIE_MASK
)]=value
;
237 U_CAPI
uint32_t U_EXPORT2
238 utrie_get32(UNewTrie
*trie
, UChar32 c
, UBool
*pInBlockZero
) {
241 /* valid, uncompacted trie and valid c? */
242 if(trie
==NULL
|| trie
->isCompacted
|| (uint32_t)c
>0x10ffff) {
243 if(pInBlockZero
!=NULL
) {
249 block
=trie
->index
[c
>>UTRIE_SHIFT
];
250 if(pInBlockZero
!=NULL
) {
251 *pInBlockZero
= (UBool
)(block
==0);
254 return trie
->data
[ABS(block
)+(c
&UTRIE_MASK
)];
261 utrie_fillBlock(uint32_t *block
, UChar32 start
, UChar32 limit
,
262 uint32_t value
, uint32_t initialValue
, UBool overwrite
) {
268 while(block
<pLimit
) {
272 while(block
<pLimit
) {
273 if(*block
==initialValue
) {
281 U_CAPI UBool U_EXPORT2
282 utrie_setRange32(UNewTrie
*trie
, UChar32 start
, UChar32 limit
, uint32_t value
, UBool overwrite
) {
284 * repeat value in [start..limit[
285 * mark index values for repeat-data blocks by setting bit 31 of the index values
286 * fill around existing values if any, if(overwrite)
288 uint32_t initialValue
;
289 int32_t block
, rest
, repeatBlock
;
291 /* valid, uncompacted trie and valid indexes? */
292 if( trie
==NULL
|| trie
->isCompacted
||
293 (uint32_t)start
>0x10ffff || (uint32_t)limit
>0x110000 || start
>limit
298 return TRUE
; /* nothing to do */
301 initialValue
=trie
->data
[0];
302 if(start
&UTRIE_MASK
) {
305 /* set partial block at [start..following block boundary[ */
306 block
=utrie_getDataBlock(trie
, start
);
311 nextStart
=(start
+UTRIE_DATA_BLOCK_LENGTH
)&~UTRIE_MASK
;
312 if(nextStart
<=limit
) {
313 utrie_fillBlock(trie
->data
+block
, start
&UTRIE_MASK
, UTRIE_DATA_BLOCK_LENGTH
,
314 value
, initialValue
, overwrite
);
317 utrie_fillBlock(trie
->data
+block
, start
&UTRIE_MASK
, limit
&UTRIE_MASK
,
318 value
, initialValue
, overwrite
);
323 /* number of positions in the last, partial block */
324 rest
=limit
&UTRIE_MASK
;
326 /* round down limit to a block boundary */
329 /* iterate over all-value blocks */
330 if(value
==initialValue
) {
336 /* get index value */
337 block
=trie
->index
[start
>>UTRIE_SHIFT
];
339 /* already allocated, fill in value */
340 utrie_fillBlock(trie
->data
+block
, 0, UTRIE_DATA_BLOCK_LENGTH
, value
, initialValue
, overwrite
);
341 } else if(trie
->data
[-block
]!=value
&& (block
==0 || overwrite
)) {
342 /* set the repeatBlock instead of the current block 0 or range block */
344 trie
->index
[start
>>UTRIE_SHIFT
]=-repeatBlock
;
346 /* create and set and fill the repeatBlock */
347 repeatBlock
=utrie_getDataBlock(trie
, start
);
352 /* set the negative block number to indicate that it is a repeat block */
353 trie
->index
[start
>>UTRIE_SHIFT
]=-repeatBlock
;
354 utrie_fillBlock(trie
->data
+repeatBlock
, 0, UTRIE_DATA_BLOCK_LENGTH
, value
, initialValue
, TRUE
);
358 start
+=UTRIE_DATA_BLOCK_LENGTH
;
362 /* set partial block at [last block boundary..limit[ */
363 block
=utrie_getDataBlock(trie
, start
);
368 utrie_fillBlock(trie
->data
+block
, 0, rest
, value
, initialValue
, overwrite
);
375 _findSameIndexBlock(const int32_t *index
, int32_t indexLength
,
376 int32_t otherBlock
) {
379 for(block
=UTRIE_BMP_INDEX_LENGTH
; block
<indexLength
; block
+=UTRIE_SURROGATE_BLOCK_COUNT
) {
380 for(i
=0; i
<UTRIE_SURROGATE_BLOCK_COUNT
; ++i
) {
381 if(index
[block
+i
]!=index
[otherBlock
+i
]) {
385 if(i
==UTRIE_SURROGATE_BLOCK_COUNT
) {
393 * Fold the normalization data for supplementary code points into
394 * a compact area on top of the BMP-part of the trie index,
395 * with the lead surrogates indexing this compact area.
397 * Duplicate the index values for lead surrogates:
398 * From inside the BMP area, where some may be overridden with folded values,
399 * to just after the BMP area, where they can be retrieved for
400 * code point lookups.
403 utrie_fold(UNewTrie
*trie
, UNewTrieGetFoldedValue
*getFoldedValue
, UErrorCode
*pErrorCode
) {
404 int32_t leadIndexes
[UTRIE_SURROGATE_BLOCK_COUNT
];
408 int32_t indexLength
, block
;
412 /* copy the lead surrogate indexes into a temporary array */
413 uprv_memcpy(leadIndexes
, index
+(0xd800>>UTRIE_SHIFT
), 4*UTRIE_SURROGATE_BLOCK_COUNT
);
416 * set all values for lead surrogate code *units* to leadUnitValue
417 * so that, by default, runtime lookups will find no data for associated
418 * supplementary code points, unless there is data for such code points
419 * which will result in a non-zero folding value below that is set for
420 * the respective lead units
422 * the above saved the indexes for surrogate code *points*
423 * fill the indexes with simplified code from utrie_setRange32()
425 if(trie
->leadUnitValue
==trie
->data
[0]) {
426 block
=0; /* leadUnitValue==initialValue, use all-initial-value block */
428 /* create and fill the repeatBlock */
429 block
=utrie_allocDataBlock(trie
);
431 /* data table overflow */
432 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
435 utrie_fillBlock(trie
->data
+block
, 0, UTRIE_DATA_BLOCK_LENGTH
, trie
->leadUnitValue
, trie
->data
[0], TRUE
);
436 block
=-block
; /* negative block number to indicate that it is a repeat block */
438 for(c
=(0xd800>>UTRIE_SHIFT
); c
<(0xdc00>>UTRIE_SHIFT
); ++c
) {
439 trie
->index
[c
]=block
;
443 * Fold significant index values into the area just after the BMP indexes.
444 * In case the first lead surrogate has significant data,
445 * its index block must be used first (in which case the folding is a no-op).
446 * Later all folded index blocks are moved up one to insert the copied
447 * lead surrogate indexes.
449 indexLength
=UTRIE_BMP_INDEX_LENGTH
;
451 /* search for any index (stage 1) entries for supplementary code points */
452 for(c
=0x10000; c
<0x110000;) {
453 if(index
[c
>>UTRIE_SHIFT
]!=0) {
454 /* there is data, treat the full block for a lead surrogate */
458 printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c
>>10)));
461 /* is there an identical index block? */
462 block
=_findSameIndexBlock(index
, indexLength
, c
>>UTRIE_SHIFT
);
465 * get a folded value for [c..c+0x400[ and,
466 * if different from the value for the lead surrogate code point,
467 * set it for the lead surrogate code unit
469 value
=getFoldedValue(trie
, c
, block
+UTRIE_SURROGATE_BLOCK_COUNT
);
470 if(value
!=utrie_get32(trie
, U16_LEAD(c
), NULL
)) {
471 if(!utrie_set32(trie
, U16_LEAD(c
), value
)) {
472 /* data table overflow */
473 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
477 /* if we did not find an identical index block... */
478 if(block
==indexLength
) {
479 /* move the actual index (stage 1) entries from the supplementary position to the new one */
480 uprv_memmove(index
+indexLength
,
481 index
+(c
>>UTRIE_SHIFT
),
482 4*UTRIE_SURROGATE_BLOCK_COUNT
);
483 indexLength
+=UTRIE_SURROGATE_BLOCK_COUNT
;
488 c
+=UTRIE_DATA_BLOCK_LENGTH
;
493 * index array overflow?
494 * This is to guarantee that a folding offset is of the form
495 * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
496 * If the index is too large, then n>=1024 and more than 10 bits are necessary.
498 * In fact, it can only ever become n==1024 with completely unfoldable data and
499 * the additional block of duplicated values for lead surrogates.
501 if(indexLength
>=UTRIE_MAX_INDEX_LENGTH
) {
502 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
507 * make space for the lead surrogate index block and
508 * insert it between the BMP indexes and the folded ones
510 uprv_memmove(index
+UTRIE_BMP_INDEX_LENGTH
+UTRIE_SURROGATE_BLOCK_COUNT
,
511 index
+UTRIE_BMP_INDEX_LENGTH
,
512 4*(indexLength
-UTRIE_BMP_INDEX_LENGTH
));
513 uprv_memcpy(index
+UTRIE_BMP_INDEX_LENGTH
,
515 4*UTRIE_SURROGATE_BLOCK_COUNT
);
516 indexLength
+=UTRIE_SURROGATE_BLOCK_COUNT
;
519 printf("trie index count: BMP %ld all Unicode %ld folded %ld\n",
520 UTRIE_BMP_INDEX_LENGTH
, (long)UTRIE_MAX_INDEX_LENGTH
, indexLength
);
523 trie
->indexLength
=indexLength
;
527 * Set a value in the trie index map to indicate which data block
528 * is referenced and which one is not.
529 * utrie_compact() will remove data blocks that are not used at all.
532 * - -1 if it is not used
535 _findUnusedBlocks(UNewTrie
*trie
) {
538 /* fill the entire map with "not used" */
539 uprv_memset(trie
->map
, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH
>>UTRIE_SHIFT
)*4);
541 /* mark each block that _is_ used with 0 */
542 for(i
=0; i
<trie
->indexLength
; ++i
) {
543 trie
->map
[ABS(trie
->index
[i
])>>UTRIE_SHIFT
]=0;
546 /* never move the all-initial-value block 0 */
551 _findSameDataBlock(const uint32_t *data
, int32_t dataLength
,
552 int32_t otherBlock
, int32_t step
) {
555 /* ensure that we do not even partially get past dataLength */
556 dataLength
-=UTRIE_DATA_BLOCK_LENGTH
;
558 for(block
=0; block
<=dataLength
; block
+=step
) {
559 if(equal_uint32(data
+block
, data
+otherBlock
, UTRIE_DATA_BLOCK_LENGTH
)) {
567 * Compact a folded build-time trie.
570 * - removes blocks that are identical with earlier ones
571 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
572 * - moves blocks in steps of the data granularity
573 * - moves and overlaps blocks that overlap with multiple values in the overlap region
576 * - try to move and overlap blocks that are not already adjacent
579 utrie_compact(UNewTrie
*trie
, UBool overlap
, UErrorCode
*pErrorCode
) {
580 int32_t i
, start
, newStart
, overlapStart
;
582 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
586 /* valid, uncompacted trie? */
588 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
591 if(trie
->isCompacted
) {
592 return; /* nothing left to do */
597 /* initialize the index map with "block is used/unused" flags */
598 _findUnusedBlocks(trie
);
600 /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
601 if(trie
->isLatin1Linear
&& UTRIE_SHIFT
<=8) {
602 overlapStart
=UTRIE_DATA_BLOCK_LENGTH
+256;
604 overlapStart
=UTRIE_DATA_BLOCK_LENGTH
;
607 newStart
=UTRIE_DATA_BLOCK_LENGTH
;
608 for(start
=newStart
; start
<trie
->dataLength
;) {
610 * start: index of first entry of current block
611 * newStart: index where the current block is to be moved
612 * (right after current end of already-compacted data)
615 /* skip blocks that are not used */
616 if(trie
->map
[start
>>UTRIE_SHIFT
]<0) {
617 /* advance start to the next block */
618 start
+=UTRIE_DATA_BLOCK_LENGTH
;
620 /* leave newStart with the previous block! */
624 /* search for an identical block */
625 if( start
>=overlapStart
&&
626 (i
=_findSameDataBlock(trie
->data
, newStart
, start
,
627 overlap
? UTRIE_DATA_GRANULARITY
: UTRIE_DATA_BLOCK_LENGTH
))
630 /* found an identical block, set the other block's index value for the current block */
631 trie
->map
[start
>>UTRIE_SHIFT
]=i
;
633 /* advance start to the next block */
634 start
+=UTRIE_DATA_BLOCK_LENGTH
;
636 /* leave newStart with the previous block! */
640 /* see if the beginning of this block can be overlapped with the end of the previous block */
641 if(overlap
&& start
>=overlapStart
) {
642 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
643 for(i
=UTRIE_DATA_BLOCK_LENGTH
-UTRIE_DATA_GRANULARITY
;
644 i
>0 && !equal_uint32(trie
->data
+(newStart
-i
), trie
->data
+start
, i
);
645 i
-=UTRIE_DATA_GRANULARITY
) {}
652 trie
->map
[start
>>UTRIE_SHIFT
]=newStart
-i
;
654 /* move the non-overlapping indexes to their new positions */
656 for(i
=UTRIE_DATA_BLOCK_LENGTH
-i
; i
>0; --i
) {
657 trie
->data
[newStart
++]=trie
->data
[start
++];
659 } else if(newStart
<start
) {
660 /* no overlap, just move the indexes to their new positions */
661 trie
->map
[start
>>UTRIE_SHIFT
]=newStart
;
662 for(i
=UTRIE_DATA_BLOCK_LENGTH
; i
>0; --i
) {
663 trie
->data
[newStart
++]=trie
->data
[start
++];
665 } else /* no overlap && newStart==start */ {
666 trie
->map
[start
>>UTRIE_SHIFT
]=start
;
667 newStart
+=UTRIE_DATA_BLOCK_LENGTH
;
672 /* now adjust the index (stage 1) table */
673 for(i
=0; i
<trie
->indexLength
; ++i
) {
674 trie
->index
[i
]=trie
->map
[ABS(trie
->index
[i
])>>UTRIE_SHIFT
];
678 /* we saved some space */
679 printf("compacting trie: count of 32-bit words %lu->%lu\n",
680 (long)trie
->dataLength
, (long)newStart
);
683 trie
->dataLength
=newStart
;
686 /* serialization ------------------------------------------------------------ */
689 * Default function for the folding value:
690 * Just store the offset (16 bits) if there is any non-initial-value entry.
692 * The offset parameter is never 0.
693 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
694 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
695 * which fits into 16-bit trie values;
696 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
698 * Theoretically, it would be safer for all possible UTRIE_SHIFT including
699 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
700 * which would always result in a value of 0x40..0x43f
701 * (start/end 1k blocks of supplementary Unicode code points).
702 * However, this would be uglier, and would not work for some existing
703 * binary data file formats.
705 * Also, we do not plan to change UTRIE_SHIFT because it would change binary
706 * data file formats, and we would probably not make it smaller because of
707 * the then even larger BMP index length even for empty tries.
709 static uint32_t U_CALLCONV
710 defaultGetFoldedValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
711 uint32_t value
, initialValue
;
715 initialValue
=trie
->data
[0];
718 value
=utrie_get32(trie
, start
, &inBlockZero
);
720 start
+=UTRIE_DATA_BLOCK_LENGTH
;
721 } else if(value
!=initialValue
) {
722 return (uint32_t)offset
;
730 U_CAPI
int32_t U_EXPORT2
731 utrie_serialize(UNewTrie
*trie
, void *dt
, int32_t capacity
,
732 UNewTrieGetFoldedValue
*getFoldedValue
,
733 UBool reduceTo16Bits
,
734 UErrorCode
*pErrorCode
) {
739 uint8_t* data
= NULL
;
742 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
746 if(trie
==NULL
|| capacity
<0 || (capacity
>0 && dt
==NULL
)) {
747 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
750 if(getFoldedValue
==NULL
) {
751 getFoldedValue
=defaultGetFoldedValue
;
755 /* fold and compact if necessary, also checks that indexLength is within limits */
756 if(!trie
->isCompacted
) {
757 /* compact once without overlap to improve folding */
758 utrie_compact(trie
, FALSE
, pErrorCode
);
760 /* fold the supplementary part of the index array */
761 utrie_fold(trie
, getFoldedValue
, pErrorCode
);
763 /* compact again with overlap for minimum data array length */
764 utrie_compact(trie
, TRUE
, pErrorCode
);
766 trie
->isCompacted
=TRUE
;
767 if(U_FAILURE(*pErrorCode
)) {
772 /* is dataLength within limits? */
773 if( (reduceTo16Bits
? (trie
->dataLength
+trie
->indexLength
) : trie
->dataLength
) >= UTRIE_MAX_DATA_LENGTH
) {
774 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
777 length
=sizeof(UTrieHeader
)+2*trie
->indexLength
;
779 length
+=2*trie
->dataLength
;
781 length
+=4*trie
->dataLength
;
784 if(length
>capacity
) {
785 return length
; /* preflighting */
788 /* set the header fields */
789 header
=(UTrieHeader
*)data
;
790 data
+=sizeof(UTrieHeader
);
792 header
->signature
=0x54726965; /* "Trie" */
793 header
->options
=UTRIE_SHIFT
| (UTRIE_INDEX_SHIFT
<<UTRIE_OPTIONS_INDEX_SHIFT
);
795 if(!reduceTo16Bits
) {
796 header
->options
|=UTRIE_OPTIONS_DATA_IS_32_BIT
;
798 if(trie
->isLatin1Linear
) {
799 header
->options
|=UTRIE_OPTIONS_LATIN1_IS_LINEAR
;
802 header
->indexLength
=trie
->indexLength
;
803 header
->dataLength
=trie
->dataLength
;
805 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
807 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
808 p
=(uint32_t *)trie
->index
;
809 dest16
=(uint16_t *)data
;
810 for(i
=trie
->indexLength
; i
>0; --i
) {
811 *dest16
++=(uint16_t)((*p
++ + trie
->indexLength
)>>UTRIE_INDEX_SHIFT
);
814 /* write 16-bit data values */
816 for(i
=trie
->dataLength
; i
>0; --i
) {
817 *dest16
++=(uint16_t)*p
++;
820 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
821 p
=(uint32_t *)trie
->index
;
822 dest16
=(uint16_t *)data
;
823 for(i
=trie
->indexLength
; i
>0; --i
) {
824 *dest16
++=(uint16_t)(*p
++ >> UTRIE_INDEX_SHIFT
);
827 /* write 32-bit data values */
828 uprv_memcpy(dest16
, trie
->data
, 4*trie
->dataLength
);
834 /* inverse to defaultGetFoldedValue() */
835 U_CAPI
int32_t U_EXPORT2
836 utrie_defaultGetFoldingOffset(uint32_t data
) {
837 return (int32_t)data
;
840 U_CAPI
int32_t U_EXPORT2
841 utrie_unserialize(UTrie
*trie
, const void *data
, int32_t length
, UErrorCode
*pErrorCode
) {
842 const UTrieHeader
*header
;
846 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
850 /* enough data for a trie header? */
851 if(length
<sizeof(UTrieHeader
)) {
852 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
856 /* check the signature */
857 header
=(const UTrieHeader
*)data
;
858 if(header
->signature
!=0x54726965) {
859 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
863 /* get the options and check the shift values */
864 options
=header
->options
;
865 if( (options
&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_SHIFT
||
866 ((options
>>UTRIE_OPTIONS_INDEX_SHIFT
)&UTRIE_OPTIONS_SHIFT_MASK
)!=UTRIE_INDEX_SHIFT
868 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
871 trie
->isLatin1Linear
= (UBool
)((options
&UTRIE_OPTIONS_LATIN1_IS_LINEAR
)!=0);
873 /* get the length values */
874 trie
->indexLength
=header
->indexLength
;
875 trie
->dataLength
=header
->dataLength
;
877 length
-=(int32_t)sizeof(UTrieHeader
);
879 /* enough data for the index? */
880 if(length
<2*trie
->indexLength
) {
881 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
884 p16
=(const uint16_t *)(header
+1);
886 p16
+=trie
->indexLength
;
887 length
-=2*trie
->indexLength
;
890 if(options
&UTRIE_OPTIONS_DATA_IS_32_BIT
) {
891 if(length
<4*trie
->dataLength
) {
892 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
895 trie
->data32
=(const uint32_t *)p16
;
896 trie
->initialValue
=trie
->data32
[0];
897 length
=(int32_t)sizeof(UTrieHeader
)+2*trie
->indexLength
+4*trie
->dataLength
;
899 if(length
<2*trie
->dataLength
) {
900 *pErrorCode
=U_INVALID_FORMAT_ERROR
;
904 /* the "data16" data is used via the index pointer */
906 trie
->initialValue
=trie
->index
[trie
->indexLength
];
907 length
=(int32_t)sizeof(UTrieHeader
)+2*trie
->indexLength
+2*trie
->dataLength
;
910 trie
->getFoldingOffset
=utrie_defaultGetFoldingOffset
;
915 U_CAPI
int32_t U_EXPORT2
916 utrie_unserializeDummy(UTrie
*trie
,
917 void *data
, int32_t length
,
918 uint32_t initialValue
, uint32_t leadUnitValue
,
920 UErrorCode
*pErrorCode
) {
922 int32_t actualLength
, latin1Length
, i
, limit
;
925 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
929 /* calculate the actual size of the dummy trie data */
931 /* max(Latin-1, block 0) */
932 latin1Length
= UTRIE_SHIFT
<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH
;
934 trie
->indexLength
=UTRIE_BMP_INDEX_LENGTH
+UTRIE_SURROGATE_BLOCK_COUNT
;
935 trie
->dataLength
=latin1Length
;
936 if(leadUnitValue
!=initialValue
) {
937 trie
->dataLength
+=UTRIE_DATA_BLOCK_LENGTH
;
940 actualLength
=trie
->indexLength
*2;
942 actualLength
+=trie
->dataLength
*2;
944 actualLength
+=trie
->dataLength
*4;
947 /* enough space for the dummy trie? */
948 if(length
<actualLength
) {
949 *pErrorCode
=U_BUFFER_OVERFLOW_ERROR
;
953 trie
->isLatin1Linear
=TRUE
;
954 trie
->initialValue
=initialValue
;
956 /* fill the index and data arrays */
957 p16
=(uint16_t *)data
;
961 /* indexes to block 0 */
962 block
=(uint16_t)(trie
->indexLength
>>UTRIE_INDEX_SHIFT
);
963 limit
=trie
->indexLength
;
964 for(i
=0; i
<limit
; ++i
) {
968 if(leadUnitValue
!=initialValue
) {
969 /* indexes for lead surrogate code units to the block after Latin-1 */
970 block
+=(uint16_t)(latin1Length
>>UTRIE_INDEX_SHIFT
);
971 i
=0xd800>>UTRIE_SHIFT
;
972 limit
=0xdc00>>UTRIE_SHIFT
;
973 for(; i
<limit
; ++i
) {
981 p16
+=trie
->indexLength
;
982 for(i
=0; i
<latin1Length
; ++i
) {
983 p16
[i
]=(uint16_t)initialValue
;
986 /* data for lead surrogate code units */
987 if(leadUnitValue
!=initialValue
) {
988 limit
=latin1Length
+UTRIE_DATA_BLOCK_LENGTH
;
989 for(/* i=latin1Length */; i
<limit
; ++i
) {
990 p16
[i
]=(uint16_t)leadUnitValue
;
996 /* indexes to block 0 */
997 uprv_memset(p16
, 0, trie
->indexLength
*2);
999 if(leadUnitValue
!=initialValue
) {
1000 /* indexes for lead surrogate code units to the block after Latin-1 */
1001 block
=(uint16_t)(latin1Length
>>UTRIE_INDEX_SHIFT
);
1002 i
=0xd800>>UTRIE_SHIFT
;
1003 limit
=0xdc00>>UTRIE_SHIFT
;
1004 for(; i
<limit
; ++i
) {
1009 trie
->data32
=p32
=(uint32_t *)(p16
+trie
->indexLength
);
1012 for(i
=0; i
<latin1Length
; ++i
) {
1013 p32
[i
]=initialValue
;
1016 /* data for lead surrogate code units */
1017 if(leadUnitValue
!=initialValue
) {
1018 limit
=latin1Length
+UTRIE_DATA_BLOCK_LENGTH
;
1019 for(/* i=latin1Length */; i
<limit
; ++i
) {
1020 p32
[i
]=leadUnitValue
;
1025 trie
->getFoldingOffset
=utrie_defaultGetFoldingOffset
;
1027 return actualLength
;
1030 /* enumeration -------------------------------------------------------------- */
1032 /* default UTrieEnumValue() returns the input value itself */
1033 static uint32_t U_CALLCONV
1034 enumSameValue(const void *context
, uint32_t value
) {
1039 * Enumerate all ranges of code points with the same relevant values.
1040 * The values are transformed from the raw trie entries by the enumValue function.
1042 U_CAPI
void U_EXPORT2
1043 utrie_enum(const UTrie
*trie
,
1044 UTrieEnumValue
*enumValue
, UTrieEnumRange
*enumRange
, const void *context
) {
1045 const uint32_t *data32
;
1046 const uint16_t *index
;
1048 uint32_t value
, prevValue
, initialValue
;
1050 int32_t l
, i
, j
, block
, prevBlock
, offset
;
1052 /* check arguments */
1053 if(trie
==NULL
|| trie
->index
==NULL
|| enumRange
==NULL
) {
1056 if(enumValue
==NULL
) {
1057 enumValue
=enumSameValue
;
1061 data32
=trie
->data32
;
1063 /* get the enumeration value that corresponds to an initial-value trie data entry */
1064 initialValue
=enumValue(context
, trie
->initialValue
);
1066 /* set variables for previous range */
1069 prevValue
=initialValue
;
1071 /* enumerate BMP - the main loop enumerates data blocks */
1072 for(i
=0, c
=0; c
<=0xffff; ++i
) {
1074 /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1075 i
=UTRIE_BMP_INDEX_LENGTH
;
1076 } else if(c
==0xdc00) {
1077 /* go back to regular BMP code points */
1081 block
=index
[i
]<<UTRIE_INDEX_SHIFT
;
1082 if(block
==prevBlock
) {
1083 /* the block is the same as the previous one, and filled with value */
1084 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1085 } else if(block
==0) {
1086 /* this is the all-initial-value block */
1087 if(prevValue
!=initialValue
) {
1089 if(!enumRange(context
, prev
, c
, prevValue
)) {
1095 prevValue
=initialValue
;
1097 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1100 for(j
=0; j
<UTRIE_DATA_BLOCK_LENGTH
; ++j
) {
1101 value
=enumValue(context
, data32
!=NULL
? data32
[block
+j
] : index
[block
+j
]);
1102 if(value
!=prevValue
) {
1104 if(!enumRange(context
, prev
, c
, prevValue
)) {
1119 /* enumerate supplementary code points */
1120 for(l
=0xd800; l
<0xdc00;) {
1121 /* lead surrogate access */
1122 offset
=index
[l
>>UTRIE_SHIFT
]<<UTRIE_INDEX_SHIFT
;
1123 if(offset
==(data32
!=NULL
? 0 : trie
->indexLength
)) {
1124 /* no entries for a whole block of lead surrogates */
1125 if(prevValue
!=initialValue
) {
1127 if(!enumRange(context
, prev
, c
, prevValue
)) {
1133 prevValue
=initialValue
;
1136 l
+=UTRIE_DATA_BLOCK_LENGTH
;
1137 c
+=UTRIE_DATA_BLOCK_LENGTH
<<10;
1141 value
= data32
!=NULL
? data32
[offset
+(l
&UTRIE_MASK
)] : index
[offset
+(l
&UTRIE_MASK
)];
1143 /* enumerate trail surrogates for this lead surrogate */
1144 offset
=trie
->getFoldingOffset(value
);
1146 /* no data for this lead surrogate */
1147 if(prevValue
!=initialValue
) {
1149 if(!enumRange(context
, prev
, c
, prevValue
)) {
1155 prevValue
=initialValue
;
1158 /* nothing else to do for the supplementary code points for this lead surrogate */
1161 /* enumerate code points for this lead surrogate */
1163 offset
+=UTRIE_SURROGATE_BLOCK_COUNT
;
1165 /* copy of most of the body of the BMP loop */
1166 block
=index
[i
]<<UTRIE_INDEX_SHIFT
;
1167 if(block
==prevBlock
) {
1168 /* the block is the same as the previous one, and filled with value */
1169 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1170 } else if(block
==0) {
1171 /* this is the all-initial-value block */
1172 if(prevValue
!=initialValue
) {
1174 if(!enumRange(context
, prev
, c
, prevValue
)) {
1180 prevValue
=initialValue
;
1182 c
+=UTRIE_DATA_BLOCK_LENGTH
;
1185 for(j
=0; j
<UTRIE_DATA_BLOCK_LENGTH
; ++j
) {
1186 value
=enumValue(context
, data32
!=NULL
? data32
[block
+j
] : index
[block
+j
]);
1187 if(value
!=prevValue
) {
1189 if(!enumRange(context
, prev
, c
, prevValue
)) {
1202 } while(++i
<offset
);
1208 /* deliver last range */
1209 enumRange(context
, prev
, c
, prevValue
);