]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/utrie.c
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / utrie.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: utrie.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001oct20
14 * created by: Markus W. Scherer
15 *
16 * This is a common implementation of a "folded" trie.
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 * Unicode code points (0..0x10ffff).
19 */
20
21 #ifdef UTRIE_DEBUG
22 # include <stdio.h>
23 #endif
24
25 #include "unicode/utypes.h"
26 #include "udataswp.h"
27 #include "cmemory.h"
28 #include "utrie.h"
29
30 #undef ABS
31 #define ABS(x) ((x)>=0 ? (x) : -(x))
32
33 /* Building a trie ----------------------------------------------------------*/
34
35 U_CAPI UNewTrie * U_EXPORT2
36 utrie_open(UNewTrie *fillIn,
37 uint32_t *aliasData, int32_t maxDataLength,
38 uint32_t initialValue, uint32_t leadUnitValue,
39 UBool latin1Linear) {
40 UNewTrie *trie;
41 int32_t i, j;
42
43 if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
44 (latin1Linear && maxDataLength<1024)
45 ) {
46 return NULL;
47 }
48
49 if(fillIn!=NULL) {
50 trie=fillIn;
51 } else {
52 trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
53 if(trie==NULL) {
54 return NULL;
55 }
56 }
57 uprv_memset(trie, 0, sizeof(UNewTrie));
58 trie->isAllocated= (UBool)(fillIn==NULL);
59
60 if(aliasData!=NULL) {
61 trie->data=aliasData;
62 trie->isDataAllocated=FALSE;
63 } else {
64 trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
65 if(trie->data==NULL) {
66 uprv_free(trie);
67 return NULL;
68 }
69 trie->isDataAllocated=TRUE;
70 }
71
72 /* preallocate and reset the first data block (block index 0) */
73 j=UTRIE_DATA_BLOCK_LENGTH;
74
75 if(latin1Linear) {
76 /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
77 /* made sure above that maxDataLength>=1024 */
78
79 /* set indexes to point to consecutive data blocks */
80 i=0;
81 do {
82 /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
83 trie->index[i++]=j;
84 j+=UTRIE_DATA_BLOCK_LENGTH;
85 } while(i<(256>>UTRIE_SHIFT));
86 }
87
88 /* reset the initially allocated blocks to the initial value */
89 trie->dataLength=j;
90 while(j>0) {
91 trie->data[--j]=initialValue;
92 }
93
94 trie->leadUnitValue=leadUnitValue;
95 trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
96 trie->dataCapacity=maxDataLength;
97 trie->isLatin1Linear=latin1Linear;
98 trie->isCompacted=FALSE;
99 return trie;
100 }
101
102 U_CAPI UNewTrie * U_EXPORT2
103 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
104 UNewTrie *trie;
105 UBool isDataAllocated;
106
107 /* do not clone if other is not valid or already compacted */
108 if(other==NULL || other->data==NULL || other->isCompacted) {
109 return NULL;
110 }
111
112 /* clone data */
113 if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
114 isDataAllocated=FALSE;
115 } else {
116 aliasDataCapacity=other->dataCapacity;
117 aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
118 if(aliasData==NULL) {
119 return NULL;
120 }
121 isDataAllocated=TRUE;
122 }
123
124 trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
125 other->data[0], other->leadUnitValue,
126 other->isLatin1Linear);
127 if(trie==NULL) {
128 uprv_free(aliasData);
129 } else {
130 uprv_memcpy(trie->index, other->index, sizeof(trie->index));
131 uprv_memcpy(trie->data, other->data, other->dataLength*4);
132 trie->dataLength=other->dataLength;
133 trie->isDataAllocated=isDataAllocated;
134 }
135
136 return trie;
137 }
138
139 U_CAPI void U_EXPORT2
140 utrie_close(UNewTrie *trie) {
141 if(trie!=NULL) {
142 if(trie->isDataAllocated) {
143 uprv_free(trie->data);
144 trie->data=NULL;
145 }
146 if(trie->isAllocated) {
147 uprv_free(trie);
148 }
149 }
150 }
151
152 U_CAPI uint32_t * U_EXPORT2
153 utrie_getData(UNewTrie *trie, int32_t *pLength) {
154 if(trie==NULL || pLength==NULL) {
155 return NULL;
156 }
157
158 *pLength=trie->dataLength;
159 return trie->data;
160 }
161
162 static int32_t
163 utrie_allocDataBlock(UNewTrie *trie) {
164 int32_t newBlock, newTop;
165
166 newBlock=trie->dataLength;
167 newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
168 if(newTop>trie->dataCapacity) {
169 /* out of memory in the data array */
170 return -1;
171 }
172 trie->dataLength=newTop;
173 return newBlock;
174 }
175
176 /**
177 * No error checking for illegal arguments.
178 *
179 * @return -1 if no new data block available (out of memory in data array)
180 * @internal
181 */
182 static int32_t
183 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
184 int32_t indexValue, newBlock;
185
186 c>>=UTRIE_SHIFT;
187 indexValue=trie->index[c];
188 if(indexValue>0) {
189 return indexValue;
190 }
191
192 /* allocate a new data block */
193 newBlock=utrie_allocDataBlock(trie);
194 if(newBlock<0) {
195 /* out of memory in the data array */
196 return -1;
197 }
198 trie->index[c]=newBlock;
199
200 /* copy-on-write for a block from a setRange() */
201 uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
202 return newBlock;
203 }
204
205 /**
206 * @return TRUE if the value was successfully set
207 */
208 U_CAPI UBool U_EXPORT2
209 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
210 int32_t block;
211
212 /* valid, uncompacted trie and valid c? */
213 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
214 return FALSE;
215 }
216
217 block=utrie_getDataBlock(trie, c);
218 if(block<0) {
219 return FALSE;
220 }
221
222 trie->data[block+(c&UTRIE_MASK)]=value;
223 return TRUE;
224 }
225
226 U_CAPI uint32_t U_EXPORT2
227 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
228 int32_t block;
229
230 /* valid, uncompacted trie and valid c? */
231 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
232 if(pInBlockZero!=NULL) {
233 *pInBlockZero=TRUE;
234 }
235 return 0;
236 }
237
238 block=trie->index[c>>UTRIE_SHIFT];
239 if(pInBlockZero!=NULL) {
240 *pInBlockZero= (UBool)(block==0);
241 }
242
243 return trie->data[ABS(block)+(c&UTRIE_MASK)];
244 }
245
246 /**
247 * @internal
248 */
249 static void
250 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
251 uint32_t value, uint32_t initialValue, UBool overwrite) {
252 uint32_t *pLimit;
253
254 pLimit=block+limit;
255 block+=start;
256 if(overwrite) {
257 while(block<pLimit) {
258 *block++=value;
259 }
260 } else {
261 while(block<pLimit) {
262 if(*block==initialValue) {
263 *block=value;
264 }
265 ++block;
266 }
267 }
268 }
269
270 U_CAPI UBool U_EXPORT2
271 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
272 /*
273 * repeat value in [start..limit[
274 * mark index values for repeat-data blocks by setting bit 31 of the index values
275 * fill around existing values if any, if(overwrite)
276 */
277 uint32_t initialValue;
278 int32_t block, rest, repeatBlock;
279
280 /* valid, uncompacted trie and valid indexes? */
281 if( trie==NULL || trie->isCompacted ||
282 (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
283 ) {
284 return FALSE;
285 }
286 if(start==limit) {
287 return TRUE; /* nothing to do */
288 }
289
290 initialValue=trie->data[0];
291 if(start&UTRIE_MASK) {
292 UChar32 nextStart;
293
294 /* set partial block at [start..following block boundary[ */
295 block=utrie_getDataBlock(trie, start);
296 if(block<0) {
297 return FALSE;
298 }
299
300 nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
301 if(nextStart<=limit) {
302 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
303 value, initialValue, overwrite);
304 start=nextStart;
305 } else {
306 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
307 value, initialValue, overwrite);
308 return TRUE;
309 }
310 }
311
312 /* number of positions in the last, partial block */
313 rest=limit&UTRIE_MASK;
314
315 /* round down limit to a block boundary */
316 limit&=~UTRIE_MASK;
317
318 /* iterate over all-value blocks */
319 if(value==initialValue) {
320 repeatBlock=0;
321 } else {
322 repeatBlock=-1;
323 }
324 while(start<limit) {
325 /* get index value */
326 block=trie->index[start>>UTRIE_SHIFT];
327 if(block>0) {
328 /* already allocated, fill in value */
329 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
330 } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
331 /* set the repeatBlock instead of the current block 0 or range block */
332 if(repeatBlock>=0) {
333 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
334 } else {
335 /* create and set and fill the repeatBlock */
336 repeatBlock=utrie_getDataBlock(trie, start);
337 if(repeatBlock<0) {
338 return FALSE;
339 }
340
341 /* set the negative block number to indicate that it is a repeat block */
342 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
343 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
344 }
345 }
346
347 start+=UTRIE_DATA_BLOCK_LENGTH;
348 }
349
350 if(rest>0) {
351 /* set partial block at [last block boundary..limit[ */
352 block=utrie_getDataBlock(trie, start);
353 if(block<0) {
354 return FALSE;
355 }
356
357 utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
358 }
359
360 return TRUE;
361 }
362
363 static int32_t
364 _findSameIndexBlock(const int32_t *index, int32_t indexLength,
365 int32_t otherBlock) {
366 int32_t block, i;
367
368 for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
369 for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
370 if(index[block+i]!=index[otherBlock+i]) {
371 break;
372 }
373 }
374 if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
375 return block;
376 }
377 }
378 return indexLength;
379 }
380
381 /*
382 * Fold the normalization data for supplementary code points into
383 * a compact area on top of the BMP-part of the trie index,
384 * with the lead surrogates indexing this compact area.
385 *
386 * Duplicate the index values for lead surrogates:
387 * From inside the BMP area, where some may be overridden with folded values,
388 * to just after the BMP area, where they can be retrieved for
389 * code point lookups.
390 */
391 static void
392 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
393 int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
394 int32_t *index;
395 uint32_t value;
396 UChar32 c;
397 int32_t indexLength, block;
398
399 index=trie->index;
400
401 /* copy the lead surrogate indexes into a temporary array */
402 uprv_memcpy(leadIndexes, index+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
403
404 /*
405 * set all values for lead surrogate code *units* to leadUnitValue
406 * so that, by default, runtime lookups will find no data for associated
407 * supplementary code points, unless there is data for such code points
408 * which will result in a non-zero folding value below that is set for
409 * the respective lead units
410 *
411 * the above saved the indexes for surrogate code *points*
412 * fill the indexes with simplified code from utrie_setRange32()
413 */
414 if(trie->leadUnitValue==trie->data[0]) {
415 block=0; /* leadUnitValue==initialValue, use all-initial-value block */
416 } else {
417 /* create and fill the repeatBlock */
418 block=utrie_allocDataBlock(trie);
419 if(block<0) {
420 /* data table overflow */
421 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
422 return;
423 }
424 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
425 block=-block; /* negative block number to indicate that it is a repeat block */
426 }
427 for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
428 trie->index[c]=block;
429 }
430
431 /*
432 * Fold significant index values into the area just after the BMP indexes.
433 * In case the first lead surrogate has significant data,
434 * its index block must be used first (in which case the folding is a no-op).
435 * Later all folded index blocks are moved up one to insert the copied
436 * lead surrogate indexes.
437 */
438 indexLength=UTRIE_BMP_INDEX_LENGTH;
439
440 /* search for any index (stage 1) entries for supplementary code points */
441 for(c=0x10000; c<0x110000;) {
442 if(index[c>>UTRIE_SHIFT]!=0) {
443 /* there is data, treat the full block for a lead surrogate */
444 c&=~0x3ff;
445
446 #ifdef UTRIE_DEBUG
447 printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10)));
448 #endif
449
450 /* is there an identical index block? */
451 block=_findSameIndexBlock(index, indexLength, c>>UTRIE_SHIFT);
452
453 /*
454 * get a folded value for [c..c+0x400[ and,
455 * if different from the value for the lead surrogate code point,
456 * set it for the lead surrogate code unit
457 */
458 value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
459 if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
460 if(!utrie_set32(trie, U16_LEAD(c), value)) {
461 /* data table overflow */
462 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
463 return;
464 }
465
466 /* if we did not find an identical index block... */
467 if(block==indexLength) {
468 /* move the actual index (stage 1) entries from the supplementary position to the new one */
469 uprv_memmove(index+indexLength,
470 index+(c>>UTRIE_SHIFT),
471 4*UTRIE_SURROGATE_BLOCK_COUNT);
472 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
473 }
474 }
475 c+=0x400;
476 } else {
477 c+=UTRIE_DATA_BLOCK_LENGTH;
478 }
479 }
480
481 /*
482 * index array overflow?
483 * This is to guarantee that a folding offset is of the form
484 * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
485 * If the index is too large, then n>=1024 and more than 10 bits are necessary.
486 *
487 * In fact, it can only ever become n==1024 with completely unfoldable data and
488 * the additional block of duplicated values for lead surrogates.
489 */
490 if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
491 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
492 return;
493 }
494
495 /*
496 * make space for the lead surrogate index block and
497 * insert it between the BMP indexes and the folded ones
498 */
499 uprv_memmove(index+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
500 index+UTRIE_BMP_INDEX_LENGTH,
501 4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
502 uprv_memcpy(index+UTRIE_BMP_INDEX_LENGTH,
503 leadIndexes,
504 4*UTRIE_SURROGATE_BLOCK_COUNT);
505 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
506
507 #ifdef UTRIE_DEBUG
508 printf("trie index count: BMP %ld all Unicode %ld folded %ld\n",
509 UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
510 #endif
511
512 trie->indexLength=indexLength;
513 }
514
515 /*
516 * Set a value in the trie index map to indicate which data block
517 * is referenced and which one is not.
518 * utrie_compact() will remove data blocks that are not used at all.
519 * Set
520 * - 0 if it is used
521 * - -1 if it is not used
522 */
523 static void
524 _findUnusedBlocks(UNewTrie *trie) {
525 int32_t i;
526
527 /* fill the entire map with "not used" */
528 uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
529
530 /* mark each block that _is_ used with 0 */
531 for(i=0; i<trie->indexLength; ++i) {
532 trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
533 }
534
535 /* never move the all-initial-value block 0 */
536 trie->map[0]=0;
537 }
538
539 static int32_t
540 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
541 int32_t otherBlock, int32_t step) {
542 int32_t block, i;
543
544 /* ensure that we do not even partially get past dataLength */
545 dataLength-=UTRIE_DATA_BLOCK_LENGTH;
546
547 for(block=0; block<=dataLength; block+=step) {
548 for(i=0; i<UTRIE_DATA_BLOCK_LENGTH; ++i) {
549 if(data[block+i]!=data[otherBlock+i]) {
550 break;
551 }
552 }
553 if(i==UTRIE_DATA_BLOCK_LENGTH) {
554 return block;
555 }
556 }
557 return -1;
558 }
559
560 /*
561 * Compact a folded build-time trie.
562 *
563 * The compaction
564 * - removes blocks that are identical with earlier ones
565 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
566 * - moves blocks in steps of the data granularity
567 *
568 * It does not
569 * - try to move and overlap blocks that are not already adjacent
570 * - try to move and overlap blocks that overlap with multiple values in the overlap region
571 */
572 static void
573 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
574 uint32_t x;
575 int32_t i, start, prevEnd, newStart, overlapStart;
576
577 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
578 return;
579 }
580
581 /* valid, uncompacted trie? */
582 if(trie==NULL) {
583 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
584 return;
585 }
586 if(trie->isCompacted) {
587 return; /* nothing left to do */
588 }
589
590 /* compaction */
591
592 /* initialize the index map with "block is used/unused" flags */
593 _findUnusedBlocks(trie);
594
595 /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
596 if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
597 overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
598 } else {
599 overlapStart=UTRIE_DATA_BLOCK_LENGTH;
600 }
601
602 newStart=UTRIE_DATA_BLOCK_LENGTH;
603 prevEnd=newStart-1;
604 for(start=newStart; start<trie->dataLength;) {
605 /*
606 * start: index of first entry of current block
607 * prevEnd: index to last entry of previous block
608 * newStart: index where the current block is to be moved
609 */
610
611 /* skip blocks that are not used */
612 if(trie->map[start>>UTRIE_SHIFT]<0) {
613 /* advance start to the next block */
614 start+=UTRIE_DATA_BLOCK_LENGTH;
615
616 /* leave prevEnd and newStart with the previous block! */
617 continue;
618 }
619
620 /* search for an identical block */
621 if( start>=overlapStart &&
622 (i=_findSameDataBlock(trie->data, newStart, start,
623 overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
624 >=0
625 ) {
626 /* found an identical block, set the other block's index value for the current block */
627 trie->map[start>>UTRIE_SHIFT]=i;
628
629 /* advance start to the next block */
630 start+=UTRIE_DATA_BLOCK_LENGTH;
631
632 /* leave prevEnd and newStart with the previous block! */
633 continue;
634 }
635
636 /* see if the beginning of this block can be overlapped with the end of the previous block */
637 /* x: first value in the current block */
638 x=trie->data[start];
639 if(x==trie->data[prevEnd] && overlap && start>=overlapStart) {
640 /* overlap by at least one */
641 for(i=1; i<UTRIE_DATA_BLOCK_LENGTH && x==trie->data[start+i] && x==trie->data[prevEnd-i]; ++i) {}
642
643 /* overlap by i, rounded down for the data block granularity */
644 i&=~(UTRIE_DATA_GRANULARITY-1);
645 } else {
646 i=0;
647 }
648
649 if(i>0) {
650 /* some overlap */
651 trie->map[start>>UTRIE_SHIFT]=newStart-i;
652
653 /* move the non-overlapping indexes to their new positions */
654 start+=i;
655 for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
656 trie->data[newStart++]=trie->data[start++];
657 }
658 } else if(newStart<start) {
659 /* no overlap, just move the indexes to their new positions */
660 trie->map[start>>UTRIE_SHIFT]=newStart;
661 for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
662 trie->data[newStart++]=trie->data[start++];
663 }
664 } else /* no overlap && newStart==start */ {
665 trie->map[start>>UTRIE_SHIFT]=start;
666 newStart+=UTRIE_DATA_BLOCK_LENGTH;
667 start=newStart;
668 }
669
670 prevEnd=newStart-1;
671 }
672
673 /* now adjust the index (stage 1) table */
674 for(i=0; i<trie->indexLength; ++i) {
675 trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
676 }
677
678 #ifdef UTRIE_DEBUG
679 /* we saved some space */
680 printf("compacting trie: count of 32-bit words %lu->%lu\n",
681 (long)trie->dataLength, (long)newStart);
682 #endif
683
684 trie->dataLength=newStart;
685 }
686
687 /* serialization ------------------------------------------------------------ */
688
689 /**
690 * Trie data structure in serialized form:
691 *
692 * UTrieHeader header;
693 * uint16_t index[header.indexLength];
694 * uint16_t data[header.dataLength];
695 */
696 struct UTrieHeader {
697 /** "Trie" in big-endian US-ASCII (0x54726965) */
698 uint32_t signature;
699
700 /**
701 * options bit field:
702 * 9 1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
703 * 8 0=16-bit data, 1=32-bit data
704 * 7..4 UTRIE_INDEX_SHIFT // 0..UTRIE_SHIFT
705 * 3..0 UTRIE_SHIFT // 1..9
706 */
707 uint32_t options;
708
709 /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
710 int32_t indexLength;
711
712 /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
713 int32_t dataLength;
714 };
715
716 typedef struct UTrieHeader UTrieHeader;
717
718 /**
719 * Constants for use with UTrieHeader.options.
720 */
721 enum {
722 /** Mask to get the UTRIE_SHIFT value from options. */
723 UTRIE_OPTIONS_SHIFT_MASK=0xf,
724
725 /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
726 UTRIE_OPTIONS_INDEX_SHIFT=4,
727
728 /** If set, then the data (stage 2) array is 32 bits wide. */
729 UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
730
731 /**
732 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
733 * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
734 */
735 UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
736 };
737
738 /*
739 * Default function for the folding value:
740 * Just store the offset (16 bits) if there is any non-initial-value entry.
741 *
742 * The offset parameter is never 0.
743 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
744 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
745 * which fits into 16-bit trie values;
746 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
747 *
748 * Theoretically, it would be safer for all possible UTRIE_SHIFT including
749 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
750 * which would always result in a value of 0x40..0x43f
751 * (start/end 1k blocks of supplementary Unicode code points).
752 * However, this would be uglier, and would not work for some existing
753 * binary data file formats.
754 *
755 * Also, we do not plan to change UTRIE_SHIFT because it would change binary
756 * data file formats, and we would probably not make it smaller because of
757 * the then even larger BMP index length even for empty tries.
758 */
759 static uint32_t U_CALLCONV
760 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
761 uint32_t value, initialValue;
762 UChar32 limit;
763 UBool inBlockZero;
764
765 initialValue=trie->data[0];
766 limit=start+0x400;
767 while(start<limit) {
768 value=utrie_get32(trie, start, &inBlockZero);
769 if(inBlockZero) {
770 start+=UTRIE_DATA_BLOCK_LENGTH;
771 } else if(value!=initialValue) {
772 return (uint32_t)offset;
773 } else {
774 ++start;
775 }
776 }
777 return 0;
778 }
779
780 U_CAPI int32_t U_EXPORT2
781 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
782 UNewTrieGetFoldedValue *getFoldedValue,
783 UBool reduceTo16Bits,
784 UErrorCode *pErrorCode) {
785 UTrieHeader *header;
786 uint32_t *p;
787 uint16_t *dest16;
788 int32_t i, length;
789 uint8_t* data = NULL;
790
791 /* argument check */
792 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
793 return 0;
794 }
795
796 if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
797 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
798 return 0;
799 }
800 if(getFoldedValue==NULL) {
801 getFoldedValue=defaultGetFoldedValue;
802 }
803
804 data = (uint8_t*)dt;
805 /* fold and compact if necessary, also checks that indexLength is within limits */
806 if(!trie->isCompacted) {
807 /* compact once without overlap to improve folding */
808 utrie_compact(trie, FALSE, pErrorCode);
809
810 /* fold the supplementary part of the index array */
811 utrie_fold(trie, getFoldedValue, pErrorCode);
812
813 /* compact again with overlap for minimum data array length */
814 utrie_compact(trie, TRUE, pErrorCode);
815
816 trie->isCompacted=TRUE;
817 if(U_FAILURE(*pErrorCode)) {
818 return 0;
819 }
820 }
821
822 /* is dataLength within limits? */
823 if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
824 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
825 }
826
827 length=sizeof(UTrieHeader)+2*trie->indexLength;
828 if(reduceTo16Bits) {
829 length+=2*trie->dataLength;
830 } else {
831 length+=4*trie->dataLength;
832 }
833
834 if(length>capacity) {
835 return length; /* preflighting */
836 }
837
838 /* set the header fields */
839 header=(UTrieHeader *)data;
840 data+=sizeof(UTrieHeader);
841
842 header->signature=0x54726965; /* "Trie" */
843 header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
844
845 if(!reduceTo16Bits) {
846 header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
847 }
848 if(trie->isLatin1Linear) {
849 header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
850 }
851
852 header->indexLength=trie->indexLength;
853 header->dataLength=trie->dataLength;
854
855 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
856 if(reduceTo16Bits) {
857 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
858 p=(uint32_t *)trie->index;
859 dest16=(uint16_t *)data;
860 for(i=trie->indexLength; i>0; --i) {
861 *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
862 }
863
864 /* write 16-bit data values */
865 p=trie->data;
866 for(i=trie->dataLength; i>0; --i) {
867 *dest16++=(uint16_t)*p++;
868 }
869 } else {
870 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
871 p=(uint32_t *)trie->index;
872 dest16=(uint16_t *)data;
873 for(i=trie->indexLength; i>0; --i) {
874 *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
875 }
876
877 /* write 32-bit data values */
878 uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
879 }
880
881 return length;
882 }
883
884 /* inverse to defaultGetFoldedValue() */
885 static int32_t U_CALLCONV
886 defaultGetFoldingOffset(uint32_t data) {
887 return (int32_t)data;
888 }
889
890 U_CAPI int32_t U_EXPORT2
891 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
892 UTrieHeader *header;
893 uint16_t *p16;
894 uint32_t options;
895
896 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
897 return -1;
898 }
899
900 /* enough data for a trie header? */
901 if(length<sizeof(UTrieHeader)) {
902 *pErrorCode=U_INVALID_FORMAT_ERROR;
903 return -1;
904 }
905
906 /* check the signature */
907 header=(UTrieHeader *)data;
908 if(header->signature!=0x54726965) {
909 *pErrorCode=U_INVALID_FORMAT_ERROR;
910 return -1;
911 }
912
913 /* get the options and check the shift values */
914 options=header->options;
915 if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
916 ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
917 ) {
918 *pErrorCode=U_INVALID_FORMAT_ERROR;
919 return -1;
920 }
921 trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
922
923 /* get the length values */
924 trie->indexLength=header->indexLength;
925 trie->dataLength=header->dataLength;
926
927 length-=(int32_t)sizeof(UTrieHeader);
928
929 /* enough data for the index? */
930 if(length<2*trie->indexLength) {
931 *pErrorCode=U_INVALID_FORMAT_ERROR;
932 return -1;
933 }
934 p16=(uint16_t *)(header+1);
935 trie->index=p16;
936 p16+=trie->indexLength;
937 length-=2*trie->indexLength;
938
939 /* get the data */
940 if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
941 if(length<4*trie->dataLength) {
942 *pErrorCode=U_INVALID_FORMAT_ERROR;
943 return -1;
944 }
945 trie->data32=(const uint32_t *)p16;
946 trie->initialValue=trie->data32[0];
947 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
948 } else {
949 if(length<2*trie->dataLength) {
950 *pErrorCode=U_INVALID_FORMAT_ERROR;
951 return -1;
952 }
953
954 /* the "data16" data is used via the index pointer */
955 trie->data32=NULL;
956 trie->initialValue=trie->index[trie->indexLength];
957 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
958 }
959
960 trie->getFoldingOffset=defaultGetFoldingOffset;
961
962 return length;
963 }
964
965 /* swapping ----------------------------------------------------------------- */
966
967 U_CAPI int32_t U_EXPORT2
968 utrie_swap(const UDataSwapper *ds,
969 const void *inData, int32_t length, void *outData,
970 UErrorCode *pErrorCode) {
971 const UTrieHeader *inTrie;
972 UTrieHeader trie;
973 int32_t size;
974 UBool dataIs32;
975
976 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
977 return 0;
978 }
979 if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) {
980 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
981 return 0;
982 }
983
984 /* setup and swapping */
985 if(length>=0 && length<sizeof(UTrieHeader)) {
986 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
987 return 0;
988 }
989
990 inTrie=(const UTrieHeader *)inData;
991 trie.signature=ds->readUInt32(inTrie->signature);
992 trie.options=ds->readUInt32(inTrie->options);
993 trie.indexLength=udata_readInt32(ds, inTrie->indexLength);
994 trie.dataLength=udata_readInt32(ds, inTrie->dataLength);
995
996 if( trie.signature!=0x54726965 ||
997 (trie.options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
998 ((trie.options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT ||
999 trie.indexLength<UTRIE_BMP_INDEX_LENGTH ||
1000 (trie.indexLength&(UTRIE_SURROGATE_BLOCK_COUNT-1))!=0 ||
1001 trie.dataLength<UTRIE_DATA_BLOCK_LENGTH ||
1002 (trie.dataLength&(UTRIE_DATA_GRANULARITY-1))!=0 ||
1003 ((trie.options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0 && trie.dataLength<(UTRIE_DATA_BLOCK_LENGTH+0x100))
1004 ) {
1005 *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */
1006 return 0;
1007 }
1008
1009 dataIs32=(UBool)((trie.options&UTRIE_OPTIONS_DATA_IS_32_BIT)!=0);
1010 size=sizeof(UTrieHeader)+trie.indexLength*2+trie.dataLength*(dataIs32?4:2);
1011
1012 if(length>=0) {
1013 UTrieHeader *outTrie;
1014
1015 if(length<size) {
1016 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1017 return 0;
1018 }
1019
1020 outTrie=(UTrieHeader *)outData;
1021
1022 /* swap the header */
1023 ds->swapArray32(ds, inTrie, sizeof(UTrieHeader), outTrie, pErrorCode);
1024
1025 /* swap the index and the data */
1026 if(dataIs32) {
1027 ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode);
1028 ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, trie.dataLength*4,
1029 (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode);
1030 } else {
1031 ds->swapArray16(ds, inTrie+1, (trie.indexLength+trie.dataLength)*2, outTrie+1, pErrorCode);
1032 }
1033 }
1034
1035 return size;
1036 }
1037
1038 /* enumeration -------------------------------------------------------------- */
1039
1040 /* default UTrieEnumValue() returns the input value itself */
1041 static uint32_t U_CALLCONV
1042 enumSameValue(const void *context, uint32_t value) {
1043 return value;
1044 }
1045
1046 /**
1047 * Enumerate all ranges of code points with the same relevant values.
1048 * The values are transformed from the raw trie entries by the enumValue function.
1049 */
1050 U_CAPI void U_EXPORT2
1051 utrie_enum(const UTrie *trie,
1052 UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
1053 const uint32_t *data32;
1054 const uint16_t *index;
1055
1056 uint32_t value, prevValue, initialValue;
1057 UChar32 c, prev;
1058 int32_t l, i, j, block, prevBlock, offset;
1059
1060 /* check arguments */
1061 if(trie==NULL || trie->index==NULL || enumRange==NULL) {
1062 return;
1063 }
1064 if(enumValue==NULL) {
1065 enumValue=enumSameValue;
1066 }
1067
1068 index=trie->index;
1069 data32=trie->data32;
1070
1071 /* get the enumeration value that corresponds to an initial-value trie data entry */
1072 initialValue=enumValue(context, trie->initialValue);
1073
1074 /* set variables for previous range */
1075 prevBlock=0;
1076 prev=0;
1077 prevValue=initialValue;
1078
1079 /* enumerate BMP - the main loop enumerates data blocks */
1080 for(i=0, c=0; c<=0xffff; ++i) {
1081 if(c==0xd800) {
1082 /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1083 i=UTRIE_BMP_INDEX_LENGTH;
1084 } else if(c==0xdc00) {
1085 /* go back to regular BMP code points */
1086 i=c>>UTRIE_SHIFT;
1087 }
1088
1089 block=index[i]<<UTRIE_INDEX_SHIFT;
1090 if(block==prevBlock) {
1091 /* the block is the same as the previous one, and filled with value */
1092 c+=UTRIE_DATA_BLOCK_LENGTH;
1093 } else if(block==0) {
1094 /* this is the all-initial-value block */
1095 if(prevValue!=initialValue) {
1096 if(prev<c) {
1097 if(!enumRange(context, prev, c, prevValue)) {
1098 return;
1099 }
1100 }
1101 prevBlock=0;
1102 prev=c;
1103 prevValue=initialValue;
1104 }
1105 c+=UTRIE_DATA_BLOCK_LENGTH;
1106 } else {
1107 prevBlock=block;
1108 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1109 value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1110 if(value!=prevValue) {
1111 if(prev<c) {
1112 if(!enumRange(context, prev, c, prevValue)) {
1113 return;
1114 }
1115 }
1116 if(j>0) {
1117 prevBlock=-1;
1118 }
1119 prev=c;
1120 prevValue=value;
1121 }
1122 ++c;
1123 }
1124 }
1125 }
1126
1127 /* enumerate supplementary code points */
1128 for(l=0xd800; l<0xdc00;) {
1129 /* lead surrogate access */
1130 offset=index[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
1131 if(offset==(data32!=NULL ? 0 : trie->indexLength)) {
1132 /* no entries for a whole block of lead surrogates */
1133 if(prevValue!=initialValue) {
1134 if(prev<c) {
1135 if(!enumRange(context, prev, c, prevValue)) {
1136 return;
1137 }
1138 }
1139 prevBlock=0;
1140 prev=c;
1141 prevValue=initialValue;
1142 }
1143
1144 l+=UTRIE_DATA_BLOCK_LENGTH;
1145 c+=UTRIE_DATA_BLOCK_LENGTH<<10;
1146 continue;
1147 }
1148
1149 value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : index[offset+(l&UTRIE_MASK)];
1150
1151 /* enumerate trail surrogates for this lead surrogate */
1152 offset=trie->getFoldingOffset(value);
1153 if(offset<=0) {
1154 /* no data for this lead surrogate */
1155 if(prevValue!=initialValue) {
1156 if(prev<c) {
1157 if(!enumRange(context, prev, c, prevValue)) {
1158 return;
1159 }
1160 }
1161 prevBlock=0;
1162 prev=c;
1163 prevValue=initialValue;
1164 }
1165
1166 /* nothing else to do for the supplementary code points for this lead surrogate */
1167 c+=0x400;
1168 } else {
1169 /* enumerate code points for this lead surrogate */
1170 i=offset;
1171 offset+=UTRIE_SURROGATE_BLOCK_COUNT;
1172 do {
1173 /* copy of most of the body of the BMP loop */
1174 block=index[i]<<UTRIE_INDEX_SHIFT;
1175 if(block==prevBlock) {
1176 /* the block is the same as the previous one, and filled with value */
1177 c+=UTRIE_DATA_BLOCK_LENGTH;
1178 } else if(block==0) {
1179 /* this is the all-initial-value block */
1180 if(prevValue!=initialValue) {
1181 if(prev<c) {
1182 if(!enumRange(context, prev, c, prevValue)) {
1183 return;
1184 }
1185 }
1186 prevBlock=0;
1187 prev=c;
1188 prevValue=initialValue;
1189 }
1190 c+=UTRIE_DATA_BLOCK_LENGTH;
1191 } else {
1192 prevBlock=block;
1193 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1194 value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1195 if(value!=prevValue) {
1196 if(prev<c) {
1197 if(!enumRange(context, prev, c, prevValue)) {
1198 return;
1199 }
1200 }
1201 if(j>0) {
1202 prevBlock=-1;
1203 }
1204 prev=c;
1205 prevValue=value;
1206 }
1207 ++c;
1208 }
1209 }
1210 } while(++i<offset);
1211 }
1212
1213 ++l;
1214 }
1215
1216 /* deliver last range */
1217 enumRange(context, prev, c, prevValue);
1218 }