]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
729e4ab9 A |
3 | /* |
4 | ****************************************************************************** | |
5 | * | |
57a6839d | 6 | * Copyright (C) 2001-2014, International Business Machines |
729e4ab9 A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ****************************************************************************** | |
4388f060 | 10 | * file name: utrie2_builder.cpp |
f3c0d7a5 | 11 | * encoding: UTF-8 |
729e4ab9 A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2008sep26 (split off from utrie2.c) | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * This is a common implementation of a Unicode trie. | |
19 | * It is a kind of compressed, serializable table of 16- or 32-bit values associated with | |
20 | * Unicode code points (0..0x10ffff). | |
21 | * This is the second common version of a Unicode trie (hence the name UTrie2). | |
22 | * See utrie2.h for a comparison. | |
23 | * | |
24 | * This file contains only the builder code. | |
25 | * See utrie2.c for the runtime and enumeration code. | |
26 | */ | |
27 | #ifdef UTRIE2_DEBUG | |
28 | # include <stdio.h> | |
29 | #endif | |
30 | ||
31 | #include "unicode/utypes.h" | |
32 | #include "cmemory.h" | |
33 | #include "utrie2.h" | |
34 | #include "utrie2_impl.h" | |
35 | ||
36 | #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */ | |
37 | ||
729e4ab9 A |
38 | /* Implementation notes ----------------------------------------------------- */ |
39 | ||
40 | /* | |
41 | * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values | |
42 | * have been chosen to minimize trie sizes overall. | |
43 | * Most of the code is flexible enough to work with a range of values, | |
44 | * within certain limits. | |
45 | * | |
46 | * Exception: Support for separate values for lead surrogate code _units_ | |
47 | * vs. code _points_ was added after the constants were fixed, | |
48 | * and has not been tested nor particularly designed for different constant values. | |
49 | * (Especially the utrie2_enum() code that jumps to the special LSCP index-2 | |
50 | * part and back.) | |
51 | * | |
52 | * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data | |
53 | * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH | |
54 | * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction | |
55 | * remapping) stops working. | |
56 | * | |
57 | * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate() | |
58 | * assumes that a single index-2 block is used for 0x400 code points | |
59 | * corresponding to one lead surrogate. | |
60 | * | |
61 | * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains | |
62 | * more than one Unicode plane, and the split of the index-2 table into a BMP | |
63 | * part and a supplementary part, with a gap in between, would not work. | |
64 | * | |
65 | * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because | |
66 | * there is data with more than 64k distinct values, | |
67 | * for example for Unihan collation with a separate collation weight per | |
68 | * Han character. | |
69 | */ | |
70 | ||
71 | /* Building a trie ----------------------------------------------------------*/ | |
72 | ||
73 | enum { | |
74 | /** The null index-2 block, following the gap in the index-2 table. */ | |
75 | UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH, | |
76 | ||
77 | /** The start of allocated index-2 blocks. */ | |
78 | UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH, | |
79 | ||
80 | /** | |
81 | * The null data block. | |
82 | * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller, | |
83 | * to work with 6-bit trail bytes from 2-byte UTF-8. | |
84 | */ | |
85 | UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET, | |
86 | ||
87 | /** The start of allocated data blocks. */ | |
88 | UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40, | |
89 | ||
90 | /** | |
91 | * The start of data blocks for U+0800 and above. | |
92 | * Below, compaction uses a block length of 64 for 2-byte UTF-8. | |
93 | * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH. | |
94 | * Data values for 0x780 code points beyond ASCII. | |
95 | */ | |
96 | UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780 | |
97 | }; | |
98 | ||
99 | /* Start with allocation of 16k data entries. */ | |
100 | #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14) | |
101 | ||
102 | /* Grow about 8x each time. */ | |
103 | #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17) | |
104 | ||
105 | static int32_t | |
106 | allocIndex2Block(UNewTrie2 *trie); | |
107 | ||
108 | U_CAPI UTrie2 * U_EXPORT2 | |
109 | utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { | |
110 | UTrie2 *trie; | |
111 | UNewTrie2 *newTrie; | |
112 | uint32_t *data; | |
113 | int32_t i, j; | |
114 | ||
115 | if(U_FAILURE(*pErrorCode)) { | |
116 | return NULL; | |
117 | } | |
118 | ||
119 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); | |
120 | newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); | |
121 | data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4); | |
122 | if(trie==NULL || newTrie==NULL || data==NULL) { | |
123 | uprv_free(trie); | |
124 | uprv_free(newTrie); | |
125 | uprv_free(data); | |
126 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
127 | return 0; | |
128 | } | |
129 | ||
130 | uprv_memset(trie, 0, sizeof(UTrie2)); | |
131 | trie->initialValue=initialValue; | |
132 | trie->errorValue=errorValue; | |
133 | trie->highStart=0x110000; | |
134 | trie->newTrie=newTrie; | |
135 | ||
136 | newTrie->data=data; | |
137 | newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH; | |
138 | newTrie->initialValue=initialValue; | |
139 | newTrie->errorValue=errorValue; | |
140 | newTrie->highStart=0x110000; | |
141 | newTrie->firstFreeBlock=0; /* no free block in the list */ | |
142 | newTrie->isCompacted=FALSE; | |
143 | ||
144 | /* | |
145 | * preallocate and reset | |
146 | * - ASCII | |
147 | * - the bad-UTF-8-data block | |
148 | * - the null data block | |
149 | */ | |
150 | for(i=0; i<0x80; ++i) { | |
151 | newTrie->data[i]=initialValue; | |
152 | } | |
153 | for(; i<0xc0; ++i) { | |
154 | newTrie->data[i]=errorValue; | |
155 | } | |
156 | for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) { | |
157 | newTrie->data[i]=initialValue; | |
158 | } | |
159 | newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET; | |
160 | newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET; | |
161 | ||
162 | /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */ | |
163 | for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { | |
164 | newTrie->index2[i]=j; | |
165 | newTrie->map[i]=1; | |
166 | } | |
167 | /* reference counts for the bad-UTF-8-data block */ | |
168 | for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { | |
169 | newTrie->map[i]=0; | |
170 | } | |
171 | /* | |
172 | * Reference counts for the null data block: all blocks except for the ASCII blocks. | |
173 | * Plus 1 so that we don't drop this block during compaction. | |
174 | * Plus as many as needed for lead surrogate code points. | |
175 | */ | |
176 | /* i==newTrie->dataNullOffset */ | |
177 | newTrie->map[i++]= | |
178 | (0x110000>>UTRIE2_SHIFT_2)- | |
179 | (0x80>>UTRIE2_SHIFT_2)+ | |
180 | 1+ | |
181 | UTRIE2_LSCP_INDEX_2_LENGTH; | |
182 | j+=UTRIE2_DATA_BLOCK_LENGTH; | |
183 | for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) { | |
184 | newTrie->map[i]=0; | |
185 | } | |
186 | ||
187 | /* | |
188 | * set the remaining indexes in the BMP index-2 block | |
189 | * to the null data block | |
190 | */ | |
191 | for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) { | |
192 | newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET; | |
193 | } | |
194 | ||
195 | /* | |
196 | * Fill the index gap with impossible values so that compaction | |
197 | * does not overlap other index-2 blocks with the gap. | |
198 | */ | |
199 | for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) { | |
200 | newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1; | |
201 | } | |
202 | ||
203 | /* set the indexes in the null index-2 block */ | |
204 | for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) { | |
205 | newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET; | |
206 | } | |
207 | newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET; | |
208 | newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET; | |
209 | ||
210 | /* set the index-1 indexes for the linear index-2 block */ | |
211 | for(i=0, j=0; | |
212 | i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; | |
213 | ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH | |
214 | ) { | |
215 | newTrie->index1[i]=j; | |
216 | } | |
217 | ||
218 | /* set the remaining index-1 indexes to the null index-2 block */ | |
219 | for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { | |
220 | newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET; | |
221 | } | |
222 | ||
223 | /* | |
224 | * Preallocate and reset data for U+0080..U+07ff, | |
225 | * for 2-byte UTF-8 which will be compacted in 64-blocks | |
226 | * even if UTRIE2_DATA_BLOCK_LENGTH is smaller. | |
227 | */ | |
228 | for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) { | |
229 | utrie2_set32(trie, i, initialValue, pErrorCode); | |
230 | } | |
231 | ||
232 | return trie; | |
233 | } | |
234 | ||
235 | static UNewTrie2 * | |
236 | cloneBuilder(const UNewTrie2 *other) { | |
237 | UNewTrie2 *trie; | |
238 | ||
239 | trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2)); | |
240 | if(trie==NULL) { | |
241 | return NULL; | |
242 | } | |
243 | ||
244 | trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4); | |
245 | if(trie->data==NULL) { | |
246 | uprv_free(trie); | |
247 | return NULL; | |
248 | } | |
249 | trie->dataCapacity=other->dataCapacity; | |
250 | ||
251 | /* clone data */ | |
252 | uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1)); | |
a62d09fc | 253 | uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4); |
729e4ab9 A |
254 | trie->index2NullOffset=other->index2NullOffset; |
255 | trie->index2Length=other->index2Length; | |
256 | ||
a62d09fc | 257 | uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4); |
729e4ab9 A |
258 | trie->dataNullOffset=other->dataNullOffset; |
259 | trie->dataLength=other->dataLength; | |
260 | ||
261 | /* reference counters */ | |
262 | if(other->isCompacted) { | |
263 | trie->firstFreeBlock=0; | |
264 | } else { | |
a62d09fc | 265 | uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4); |
729e4ab9 A |
266 | trie->firstFreeBlock=other->firstFreeBlock; |
267 | } | |
268 | ||
269 | trie->initialValue=other->initialValue; | |
270 | trie->errorValue=other->errorValue; | |
271 | trie->highStart=other->highStart; | |
272 | trie->isCompacted=other->isCompacted; | |
273 | ||
274 | return trie; | |
275 | } | |
276 | ||
277 | U_CAPI UTrie2 * U_EXPORT2 | |
278 | utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) { | |
279 | UTrie2 *trie; | |
280 | ||
281 | if(U_FAILURE(*pErrorCode)) { | |
282 | return NULL; | |
283 | } | |
284 | if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { | |
285 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
286 | return NULL; | |
287 | } | |
288 | ||
289 | trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2)); | |
290 | if(trie==NULL) { | |
291 | return NULL; | |
292 | } | |
293 | uprv_memcpy(trie, other, sizeof(UTrie2)); | |
294 | ||
295 | if(other->memory!=NULL) { | |
296 | trie->memory=uprv_malloc(other->length); | |
297 | if(trie->memory!=NULL) { | |
298 | trie->isMemoryOwned=TRUE; | |
299 | uprv_memcpy(trie->memory, other->memory, other->length); | |
300 | ||
301 | /* make the clone's pointers point to its own memory */ | |
302 | trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory); | |
303 | if(other->data16!=NULL) { | |
304 | trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory); | |
305 | } | |
306 | if(other->data32!=NULL) { | |
307 | trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory); | |
308 | } | |
309 | } | |
310 | } else /* other->newTrie!=NULL */ { | |
311 | trie->newTrie=cloneBuilder(other->newTrie); | |
312 | } | |
313 | ||
314 | if(trie->memory==NULL && trie->newTrie==NULL) { | |
315 | uprv_free(trie); | |
316 | trie=NULL; | |
317 | } | |
318 | return trie; | |
319 | } | |
320 | ||
321 | typedef struct NewTrieAndStatus { | |
322 | UTrie2 *trie; | |
323 | UErrorCode errorCode; | |
324 | UBool exclusiveLimit; /* rather than inclusive range end */ | |
325 | } NewTrieAndStatus; | |
326 | ||
327 | static UBool U_CALLCONV | |
328 | copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { | |
329 | NewTrieAndStatus *nt=(NewTrieAndStatus *)context; | |
330 | if(value!=nt->trie->initialValue) { | |
331 | if(nt->exclusiveLimit) { | |
332 | --end; | |
333 | } | |
334 | if(start==end) { | |
335 | utrie2_set32(nt->trie, start, value, &nt->errorCode); | |
336 | } else { | |
337 | utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode); | |
338 | } | |
339 | return U_SUCCESS(nt->errorCode); | |
340 | } else { | |
341 | return TRUE; | |
342 | } | |
343 | } | |
344 | ||
345 | #ifdef UTRIE2_DEBUG | |
346 | static void | |
347 | utrie_printLengths(const UTrie *trie) { | |
348 | long indexLength=trie->indexLength; | |
349 | long dataLength=(long)trie->dataLength; | |
350 | long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); | |
351 | printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n", | |
352 | indexLength, dataLength, totalLength); | |
353 | } | |
354 | ||
355 | static void | |
356 | utrie2_printLengths(const UTrie2 *trie, const char *which) { | |
357 | long indexLength=trie->indexLength; | |
358 | long dataLength=(long)trie->dataLength; | |
359 | long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2); | |
360 | printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n", | |
361 | which, indexLength, dataLength, totalLength); | |
362 | } | |
363 | #endif | |
364 | ||
365 | U_CAPI UTrie2 * U_EXPORT2 | |
366 | utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) { | |
367 | NewTrieAndStatus context; | |
368 | UChar lead; | |
369 | ||
370 | if(U_FAILURE(*pErrorCode)) { | |
371 | return NULL; | |
372 | } | |
373 | if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) { | |
374 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
375 | return NULL; | |
376 | } | |
377 | if(other->newTrie!=NULL && !other->newTrie->isCompacted) { | |
378 | return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */ | |
379 | } | |
380 | ||
381 | /* Clone the frozen trie by enumerating it and building a new one. */ | |
382 | context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode); | |
383 | if(U_FAILURE(*pErrorCode)) { | |
384 | return NULL; | |
385 | } | |
386 | context.exclusiveLimit=FALSE; | |
387 | context.errorCode=*pErrorCode; | |
388 | utrie2_enum(other, NULL, copyEnumRange, &context); | |
389 | *pErrorCode=context.errorCode; | |
390 | for(lead=0xd800; lead<0xdc00; ++lead) { | |
391 | uint32_t value; | |
392 | if(other->data32==NULL) { | |
393 | value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead); | |
394 | } else { | |
395 | value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead); | |
396 | } | |
397 | if(value!=other->initialValue) { | |
398 | utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); | |
399 | } | |
400 | } | |
401 | if(U_FAILURE(*pErrorCode)) { | |
402 | utrie2_close(context.trie); | |
403 | context.trie=NULL; | |
404 | } | |
405 | return context.trie; | |
406 | } | |
407 | ||
408 | /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */ | |
409 | U_CAPI UTrie2 * U_EXPORT2 | |
410 | utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) { | |
411 | NewTrieAndStatus context; | |
412 | UChar lead; | |
413 | ||
414 | if(U_FAILURE(*pErrorCode)) { | |
415 | return NULL; | |
416 | } | |
417 | if(trie1==NULL) { | |
418 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
419 | return NULL; | |
420 | } | |
421 | context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode); | |
422 | if(U_FAILURE(*pErrorCode)) { | |
423 | return NULL; | |
424 | } | |
425 | context.exclusiveLimit=TRUE; | |
426 | context.errorCode=*pErrorCode; | |
427 | utrie_enum(trie1, NULL, copyEnumRange, &context); | |
428 | *pErrorCode=context.errorCode; | |
429 | for(lead=0xd800; lead<0xdc00; ++lead) { | |
430 | uint32_t value; | |
431 | if(trie1->data32==NULL) { | |
432 | value=UTRIE_GET16_FROM_LEAD(trie1, lead); | |
433 | } else { | |
434 | value=UTRIE_GET32_FROM_LEAD(trie1, lead); | |
435 | } | |
436 | if(value!=trie1->initialValue) { | |
437 | utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode); | |
438 | } | |
439 | } | |
440 | if(U_SUCCESS(*pErrorCode)) { | |
441 | utrie2_freeze(context.trie, | |
442 | trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS, | |
443 | pErrorCode); | |
444 | } | |
445 | #ifdef UTRIE2_DEBUG | |
446 | if(U_SUCCESS(*pErrorCode)) { | |
447 | utrie_printLengths(trie1); | |
448 | utrie2_printLengths(context.trie, "fromUTrie"); | |
449 | } | |
450 | #endif | |
451 | if(U_FAILURE(*pErrorCode)) { | |
452 | utrie2_close(context.trie); | |
453 | context.trie=NULL; | |
454 | } | |
455 | return context.trie; | |
456 | } | |
457 | ||
4388f060 | 458 | static inline UBool |
729e4ab9 A |
459 | isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { |
460 | int32_t i2, block; | |
461 | ||
462 | if(U_IS_LEAD(c) && forLSCP) { | |
463 | i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+ | |
464 | (c>>UTRIE2_SHIFT_2); | |
465 | } else { | |
466 | i2=trie->index1[c>>UTRIE2_SHIFT_1]+ | |
467 | ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK); | |
468 | } | |
469 | block=trie->index2[i2]; | |
470 | return (UBool)(block==trie->dataNullOffset); | |
471 | } | |
472 | ||
473 | static int32_t | |
474 | allocIndex2Block(UNewTrie2 *trie) { | |
475 | int32_t newBlock, newTop; | |
476 | ||
477 | newBlock=trie->index2Length; | |
478 | newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH; | |
b331163b | 479 | if(newTop>UPRV_LENGTHOF(trie->index2)) { |
729e4ab9 A |
480 | /* |
481 | * Should never occur. | |
482 | * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect, | |
483 | * or the code writes more values than should be possible. | |
484 | */ | |
485 | return -1; | |
486 | } | |
487 | trie->index2Length=newTop; | |
488 | uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4); | |
489 | return newBlock; | |
490 | } | |
491 | ||
492 | static int32_t | |
493 | getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { | |
494 | int32_t i1, i2; | |
495 | ||
496 | if(U_IS_LEAD(c) && forLSCP) { | |
497 | return UTRIE2_LSCP_INDEX_2_OFFSET; | |
498 | } | |
499 | ||
500 | i1=c>>UTRIE2_SHIFT_1; | |
501 | i2=trie->index1[i1]; | |
502 | if(i2==trie->index2NullOffset) { | |
503 | i2=allocIndex2Block(trie); | |
504 | if(i2<0) { | |
505 | return -1; /* program error */ | |
506 | } | |
507 | trie->index1[i1]=i2; | |
508 | } | |
509 | return i2; | |
510 | } | |
511 | ||
512 | static int32_t | |
513 | allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) { | |
514 | int32_t newBlock, newTop; | |
515 | ||
516 | if(trie->firstFreeBlock!=0) { | |
517 | /* get the first free block */ | |
518 | newBlock=trie->firstFreeBlock; | |
519 | trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2]; | |
520 | } else { | |
521 | /* get a new block from the high end */ | |
522 | newBlock=trie->dataLength; | |
523 | newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH; | |
524 | if(newTop>trie->dataCapacity) { | |
525 | /* out of memory in the data array */ | |
526 | int32_t capacity; | |
527 | uint32_t *data; | |
528 | ||
529 | if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) { | |
530 | capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH; | |
531 | } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) { | |
532 | capacity=UNEWTRIE2_MAX_DATA_LENGTH; | |
533 | } else { | |
534 | /* | |
535 | * Should never occur. | |
536 | * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect, | |
537 | * or the code writes more values than should be possible. | |
538 | */ | |
539 | return -1; | |
540 | } | |
541 | data=(uint32_t *)uprv_malloc(capacity*4); | |
542 | if(data==NULL) { | |
543 | return -1; | |
544 | } | |
a62d09fc | 545 | uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4); |
729e4ab9 A |
546 | uprv_free(trie->data); |
547 | trie->data=data; | |
548 | trie->dataCapacity=capacity; | |
549 | } | |
550 | trie->dataLength=newTop; | |
551 | } | |
552 | uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4); | |
553 | trie->map[newBlock>>UTRIE2_SHIFT_2]=0; | |
554 | return newBlock; | |
555 | } | |
556 | ||
557 | /* call when the block's reference counter reaches 0 */ | |
558 | static void | |
559 | releaseDataBlock(UNewTrie2 *trie, int32_t block) { | |
560 | /* put this block at the front of the free-block chain */ | |
561 | trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock; | |
562 | trie->firstFreeBlock=block; | |
563 | } | |
564 | ||
4388f060 | 565 | static inline UBool |
729e4ab9 A |
566 | isWritableBlock(UNewTrie2 *trie, int32_t block) { |
567 | return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]); | |
568 | } | |
569 | ||
4388f060 | 570 | static inline void |
729e4ab9 A |
571 | setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) { |
572 | int32_t oldBlock; | |
573 | ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */ | |
574 | oldBlock=trie->index2[i2]; | |
575 | if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) { | |
576 | releaseDataBlock(trie, oldBlock); | |
577 | } | |
578 | trie->index2[i2]=block; | |
579 | } | |
580 | ||
581 | /** | |
582 | * No error checking for illegal arguments. | |
583 | * | |
584 | * @return -1 if no new data block available (out of memory in data array) | |
585 | * @internal | |
586 | */ | |
587 | static int32_t | |
588 | getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) { | |
589 | int32_t i2, oldBlock, newBlock; | |
590 | ||
591 | i2=getIndex2Block(trie, c, forLSCP); | |
592 | if(i2<0) { | |
593 | return -1; /* program error */ | |
594 | } | |
595 | ||
596 | i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; | |
597 | oldBlock=trie->index2[i2]; | |
598 | if(isWritableBlock(trie, oldBlock)) { | |
599 | return oldBlock; | |
600 | } | |
601 | ||
602 | /* allocate a new data block */ | |
603 | newBlock=allocDataBlock(trie, oldBlock); | |
604 | if(newBlock<0) { | |
605 | /* out of memory in the data array */ | |
606 | return -1; | |
607 | } | |
608 | setIndex2Entry(trie, i2, newBlock); | |
609 | return newBlock; | |
610 | } | |
611 | ||
612 | /** | |
613 | * @return TRUE if the value was successfully set | |
614 | */ | |
615 | static void | |
616 | set32(UNewTrie2 *trie, | |
617 | UChar32 c, UBool forLSCP, uint32_t value, | |
618 | UErrorCode *pErrorCode) { | |
619 | int32_t block; | |
620 | ||
621 | if(trie==NULL || trie->isCompacted) { | |
622 | *pErrorCode=U_NO_WRITE_PERMISSION; | |
623 | return; | |
624 | } | |
625 | ||
626 | block=getDataBlock(trie, c, forLSCP); | |
627 | if(block<0) { | |
628 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
629 | return; | |
630 | } | |
631 | ||
632 | trie->data[block+(c&UTRIE2_DATA_MASK)]=value; | |
633 | } | |
634 | ||
635 | U_CAPI void U_EXPORT2 | |
636 | utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { | |
637 | if(U_FAILURE(*pErrorCode)) { | |
638 | return; | |
639 | } | |
640 | if((uint32_t)c>0x10ffff) { | |
641 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
642 | return; | |
643 | } | |
644 | set32(trie->newTrie, c, TRUE, value, pErrorCode); | |
645 | } | |
646 | ||
647 | U_CAPI void U_EXPORT2 | |
648 | utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie, | |
649 | UChar32 c, uint32_t value, | |
650 | UErrorCode *pErrorCode) { | |
651 | if(U_FAILURE(*pErrorCode)) { | |
652 | return; | |
653 | } | |
654 | if(!U_IS_LEAD(c)) { | |
655 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
656 | return; | |
657 | } | |
658 | set32(trie->newTrie, c, FALSE, value, pErrorCode); | |
659 | } | |
660 | ||
661 | static void | |
662 | writeBlock(uint32_t *block, uint32_t value) { | |
663 | uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH; | |
664 | while(block<limit) { | |
665 | *block++=value; | |
666 | } | |
667 | } | |
668 | ||
669 | /** | |
670 | * initialValue is ignored if overwrite=TRUE | |
671 | * @internal | |
672 | */ | |
673 | static void | |
674 | fillBlock(uint32_t *block, UChar32 start, UChar32 limit, | |
675 | uint32_t value, uint32_t initialValue, UBool overwrite) { | |
676 | uint32_t *pLimit; | |
677 | ||
678 | pLimit=block+limit; | |
679 | block+=start; | |
680 | if(overwrite) { | |
681 | while(block<pLimit) { | |
682 | *block++=value; | |
683 | } | |
684 | } else { | |
685 | while(block<pLimit) { | |
686 | if(*block==initialValue) { | |
687 | *block=value; | |
688 | } | |
689 | ++block; | |
690 | } | |
691 | } | |
692 | } | |
693 | ||
694 | U_CAPI void U_EXPORT2 | |
695 | utrie2_setRange32(UTrie2 *trie, | |
696 | UChar32 start, UChar32 end, | |
697 | uint32_t value, UBool overwrite, | |
698 | UErrorCode *pErrorCode) { | |
699 | /* | |
700 | * repeat value in [start..end] | |
701 | * mark index values for repeat-data blocks by setting bit 31 of the index values | |
702 | * fill around existing values if any, if(overwrite) | |
703 | */ | |
704 | UNewTrie2 *newTrie; | |
705 | int32_t block, rest, repeatBlock; | |
706 | UChar32 limit; | |
707 | ||
708 | if(U_FAILURE(*pErrorCode)) { | |
709 | return; | |
710 | } | |
711 | if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) { | |
712 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
713 | return; | |
714 | } | |
715 | newTrie=trie->newTrie; | |
716 | if(newTrie==NULL || newTrie->isCompacted) { | |
717 | *pErrorCode=U_NO_WRITE_PERMISSION; | |
718 | return; | |
719 | } | |
720 | if(!overwrite && value==newTrie->initialValue) { | |
721 | return; /* nothing to do */ | |
722 | } | |
723 | ||
724 | limit=end+1; | |
725 | if(start&UTRIE2_DATA_MASK) { | |
726 | UChar32 nextStart; | |
727 | ||
728 | /* set partial block at [start..following block boundary[ */ | |
729 | block=getDataBlock(newTrie, start, TRUE); | |
730 | if(block<0) { | |
731 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
732 | return; | |
733 | } | |
734 | ||
735 | nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK; | |
736 | if(nextStart<=limit) { | |
737 | fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH, | |
738 | value, newTrie->initialValue, overwrite); | |
739 | start=nextStart; | |
740 | } else { | |
741 | fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK, | |
742 | value, newTrie->initialValue, overwrite); | |
743 | return; | |
744 | } | |
745 | } | |
746 | ||
747 | /* number of positions in the last, partial block */ | |
748 | rest=limit&UTRIE2_DATA_MASK; | |
749 | ||
750 | /* round down limit to a block boundary */ | |
751 | limit&=~UTRIE2_DATA_MASK; | |
752 | ||
753 | /* iterate over all-value blocks */ | |
754 | if(value==newTrie->initialValue) { | |
755 | repeatBlock=newTrie->dataNullOffset; | |
756 | } else { | |
757 | repeatBlock=-1; | |
758 | } | |
759 | ||
760 | while(start<limit) { | |
761 | int32_t i2; | |
762 | UBool setRepeatBlock=FALSE; | |
763 | ||
764 | if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) { | |
765 | start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */ | |
766 | continue; | |
767 | } | |
768 | ||
769 | /* get index value */ | |
770 | i2=getIndex2Block(newTrie, start, TRUE); | |
771 | if(i2<0) { | |
772 | *pErrorCode=U_INTERNAL_PROGRAM_ERROR; | |
773 | return; | |
774 | } | |
775 | i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK; | |
776 | block=newTrie->index2[i2]; | |
777 | if(isWritableBlock(newTrie, block)) { | |
778 | /* already allocated */ | |
779 | if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) { | |
780 | /* | |
781 | * We overwrite all values, and it's not a | |
782 | * protected (ASCII-linear or 2-byte UTF-8) block: | |
783 | * replace with the repeatBlock. | |
784 | */ | |
785 | setRepeatBlock=TRUE; | |
786 | } else { | |
787 | /* !overwrite, or protected block: just write the values into this block */ | |
788 | fillBlock(newTrie->data+block, | |
789 | 0, UTRIE2_DATA_BLOCK_LENGTH, | |
790 | value, newTrie->initialValue, overwrite); | |
791 | } | |
792 | } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) { | |
793 | /* | |
794 | * Set the repeatBlock instead of the null block or previous repeat block: | |
795 | * | |
796 | * If !isWritableBlock() then all entries in the block have the same value | |
797 | * because it's the null block or a range block (the repeatBlock from a previous | |
798 | * call to utrie2_setRange32()). | |
799 | * No other blocks are used multiple times before compacting. | |
800 | * | |
801 | * The null block is the only non-writable block with the initialValue because | |
802 | * of the repeatBlock initialization above. (If value==initialValue, then | |
803 | * the repeatBlock will be the null data block.) | |
804 | * | |
805 | * We set our repeatBlock if the desired value differs from the block's value, | |
806 | * and if we overwrite any data or if the data is all initial values | |
807 | * (which is the same as the block being the null block, see above). | |
808 | */ | |
809 | setRepeatBlock=TRUE; | |
810 | } | |
811 | if(setRepeatBlock) { | |
812 | if(repeatBlock>=0) { | |
813 | setIndex2Entry(newTrie, i2, repeatBlock); | |
814 | } else { | |
815 | /* create and set and fill the repeatBlock */ | |
816 | repeatBlock=getDataBlock(newTrie, start, TRUE); | |
817 | if(repeatBlock<0) { | |
818 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
819 | return; | |
820 | } | |
821 | writeBlock(newTrie->data+repeatBlock, value); | |
822 | } | |
823 | } | |
824 | ||
825 | start+=UTRIE2_DATA_BLOCK_LENGTH; | |
826 | } | |
827 | ||
828 | if(rest>0) { | |
829 | /* set partial block at [last block boundary..limit[ */ | |
830 | block=getDataBlock(newTrie, start, TRUE); | |
831 | if(block<0) { | |
832 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
833 | return; | |
834 | } | |
835 | ||
836 | fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite); | |
837 | } | |
838 | ||
839 | return; | |
840 | } | |
841 | ||
842 | /* compaction --------------------------------------------------------------- */ | |
843 | ||
4388f060 | 844 | static inline UBool |
729e4ab9 A |
845 | equal_int32(const int32_t *s, const int32_t *t, int32_t length) { |
846 | while(length>0 && *s==*t) { | |
847 | ++s; | |
848 | ++t; | |
849 | --length; | |
850 | } | |
851 | return (UBool)(length==0); | |
852 | } | |
853 | ||
4388f060 | 854 | static inline UBool |
729e4ab9 A |
855 | equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { |
856 | while(length>0 && *s==*t) { | |
857 | ++s; | |
858 | ++t; | |
859 | --length; | |
860 | } | |
861 | return (UBool)(length==0); | |
862 | } | |
863 | ||
864 | static int32_t | |
865 | findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) { | |
866 | int32_t block; | |
867 | ||
868 | /* ensure that we do not even partially get past index2Length */ | |
869 | index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH; | |
870 | ||
871 | for(block=0; block<=index2Length; ++block) { | |
872 | if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) { | |
873 | return block; | |
874 | } | |
875 | } | |
876 | return -1; | |
877 | } | |
878 | ||
879 | static int32_t | |
880 | findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) { | |
881 | int32_t block; | |
882 | ||
883 | /* ensure that we do not even partially get past dataLength */ | |
884 | dataLength-=blockLength; | |
885 | ||
886 | for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) { | |
887 | if(equal_uint32(data+block, data+otherBlock, blockLength)) { | |
888 | return block; | |
889 | } | |
890 | } | |
891 | return -1; | |
892 | } | |
893 | ||
894 | /* | |
895 | * Find the start of the last range in the trie by enumerating backward. | |
896 | * Indexes for supplementary code points higher than this will be omitted. | |
897 | */ | |
898 | static UChar32 | |
899 | findHighStart(UNewTrie2 *trie, uint32_t highValue) { | |
900 | const uint32_t *data32; | |
901 | ||
902 | uint32_t value, initialValue; | |
903 | UChar32 c, prev; | |
904 | int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock; | |
905 | ||
906 | data32=trie->data; | |
907 | initialValue=trie->initialValue; | |
908 | ||
909 | index2NullOffset=trie->index2NullOffset; | |
910 | nullBlock=trie->dataNullOffset; | |
911 | ||
912 | /* set variables for previous range */ | |
913 | if(highValue==initialValue) { | |
914 | prevI2Block=index2NullOffset; | |
915 | prevBlock=nullBlock; | |
916 | } else { | |
917 | prevI2Block=-1; | |
918 | prevBlock=-1; | |
919 | } | |
920 | prev=0x110000; | |
921 | ||
922 | /* enumerate index-2 blocks */ | |
923 | i1=UNEWTRIE2_INDEX_1_LENGTH; | |
924 | c=prev; | |
925 | while(c>0) { | |
926 | i2Block=trie->index1[--i1]; | |
927 | if(i2Block==prevI2Block) { | |
928 | /* the index-2 block is the same as the previous one, and filled with highValue */ | |
929 | c-=UTRIE2_CP_PER_INDEX_1_ENTRY; | |
930 | continue; | |
931 | } | |
932 | prevI2Block=i2Block; | |
933 | if(i2Block==index2NullOffset) { | |
934 | /* this is the null index-2 block */ | |
935 | if(highValue!=initialValue) { | |
936 | return c; | |
937 | } | |
938 | c-=UTRIE2_CP_PER_INDEX_1_ENTRY; | |
939 | } else { | |
940 | /* enumerate data blocks for one index-2 block */ | |
941 | for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) { | |
942 | block=trie->index2[i2Block+ --i2]; | |
943 | if(block==prevBlock) { | |
944 | /* the block is the same as the previous one, and filled with highValue */ | |
945 | c-=UTRIE2_DATA_BLOCK_LENGTH; | |
946 | continue; | |
947 | } | |
948 | prevBlock=block; | |
949 | if(block==nullBlock) { | |
950 | /* this is the null data block */ | |
951 | if(highValue!=initialValue) { | |
952 | return c; | |
953 | } | |
954 | c-=UTRIE2_DATA_BLOCK_LENGTH; | |
955 | } else { | |
956 | for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) { | |
957 | value=data32[block+ --j]; | |
958 | if(value!=highValue) { | |
959 | return c; | |
960 | } | |
961 | --c; | |
962 | } | |
963 | } | |
964 | } | |
965 | } | |
966 | } | |
967 | ||
968 | /* deliver last range */ | |
969 | return 0; | |
970 | } | |
971 | ||
972 | /* | |
973 | * Compact a build-time trie. | |
974 | * | |
975 | * The compaction | |
976 | * - removes blocks that are identical with earlier ones | |
977 | * - overlaps adjacent blocks as much as possible (if overlap==TRUE) | |
978 | * - moves blocks in steps of the data granularity | |
979 | * - moves and overlaps blocks that overlap with multiple values in the overlap region | |
980 | * | |
981 | * It does not | |
982 | * - try to move and overlap blocks that are not already adjacent | |
983 | */ | |
984 | static void | |
985 | compactData(UNewTrie2 *trie) { | |
986 | int32_t start, newStart, movedStart; | |
987 | int32_t blockLength, overlap; | |
988 | int32_t i, mapIndex, blockCount; | |
989 | ||
990 | /* do not compact linear-ASCII data */ | |
991 | newStart=UTRIE2_DATA_START_OFFSET; | |
992 | for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) { | |
993 | trie->map[i]=start; | |
994 | } | |
995 | ||
996 | /* | |
997 | * Start with a block length of 64 for 2-byte UTF-8, | |
998 | * then switch to UTRIE2_DATA_BLOCK_LENGTH. | |
999 | */ | |
1000 | blockLength=64; | |
1001 | blockCount=blockLength>>UTRIE2_SHIFT_2; | |
1002 | for(start=newStart; start<trie->dataLength;) { | |
1003 | /* | |
1004 | * start: index of first entry of current block | |
1005 | * newStart: index where the current block is to be moved | |
1006 | * (right after current end of already-compacted data) | |
1007 | */ | |
1008 | if(start==UNEWTRIE2_DATA_0800_OFFSET) { | |
1009 | blockLength=UTRIE2_DATA_BLOCK_LENGTH; | |
1010 | blockCount=1; | |
1011 | } | |
1012 | ||
1013 | /* skip blocks that are not used */ | |
1014 | if(trie->map[start>>UTRIE2_SHIFT_2]<=0) { | |
1015 | /* advance start to the next block */ | |
1016 | start+=blockLength; | |
1017 | ||
1018 | /* leave newStart with the previous block! */ | |
1019 | continue; | |
1020 | } | |
1021 | ||
1022 | /* search for an identical block */ | |
1023 | if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength)) | |
1024 | >=0 | |
1025 | ) { | |
1026 | /* found an identical block, set the other block's index value for the current block */ | |
1027 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { | |
1028 | trie->map[mapIndex++]=movedStart; | |
1029 | movedStart+=UTRIE2_DATA_BLOCK_LENGTH; | |
1030 | } | |
1031 | ||
1032 | /* advance start to the next block */ | |
1033 | start+=blockLength; | |
1034 | ||
1035 | /* leave newStart with the previous block! */ | |
1036 | continue; | |
1037 | } | |
1038 | ||
1039 | /* see if the beginning of this block can be overlapped with the end of the previous block */ | |
1040 | /* look for maximum overlap (modulo granularity) with the previous, adjacent block */ | |
1041 | for(overlap=blockLength-UTRIE2_DATA_GRANULARITY; | |
1042 | overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap); | |
1043 | overlap-=UTRIE2_DATA_GRANULARITY) {} | |
1044 | ||
1045 | if(overlap>0 || newStart<start) { | |
1046 | /* some overlap, or just move the whole block */ | |
1047 | movedStart=newStart-overlap; | |
1048 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { | |
1049 | trie->map[mapIndex++]=movedStart; | |
1050 | movedStart+=UTRIE2_DATA_BLOCK_LENGTH; | |
1051 | } | |
1052 | ||
1053 | /* move the non-overlapping indexes to their new positions */ | |
1054 | start+=overlap; | |
1055 | for(i=blockLength-overlap; i>0; --i) { | |
1056 | trie->data[newStart++]=trie->data[start++]; | |
1057 | } | |
1058 | } else /* no overlap && newStart==start */ { | |
1059 | for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) { | |
1060 | trie->map[mapIndex++]=start; | |
1061 | start+=UTRIE2_DATA_BLOCK_LENGTH; | |
1062 | } | |
1063 | newStart=start; | |
1064 | } | |
1065 | } | |
1066 | ||
1067 | /* now adjust the index-2 table */ | |
1068 | for(i=0; i<trie->index2Length; ++i) { | |
1069 | if(i==UNEWTRIE2_INDEX_GAP_OFFSET) { | |
1070 | /* Gap indexes are invalid (-1). Skip over the gap. */ | |
1071 | i+=UNEWTRIE2_INDEX_GAP_LENGTH; | |
1072 | } | |
1073 | trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2]; | |
1074 | } | |
1075 | trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2]; | |
1076 | ||
1077 | /* ensure dataLength alignment */ | |
1078 | while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) { | |
1079 | trie->data[newStart++]=trie->initialValue; | |
1080 | } | |
1081 | ||
1082 | #ifdef UTRIE2_DEBUG | |
1083 | /* we saved some space */ | |
1084 | printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n", | |
1085 | (long)trie->dataLength, (long)newStart); | |
1086 | #endif | |
1087 | ||
1088 | trie->dataLength=newStart; | |
1089 | } | |
1090 | ||
1091 | static void | |
1092 | compactIndex2(UNewTrie2 *trie) { | |
1093 | int32_t i, start, newStart, movedStart, overlap; | |
1094 | ||
1095 | /* do not compact linear-BMP index-2 blocks */ | |
1096 | newStart=UTRIE2_INDEX_2_BMP_LENGTH; | |
1097 | for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) { | |
1098 | trie->map[i]=start; | |
1099 | } | |
1100 | ||
1101 | /* Reduce the index table gap to what will be needed at runtime. */ | |
1102 | newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1); | |
1103 | ||
1104 | for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) { | |
1105 | /* | |
1106 | * start: index of first entry of current block | |
1107 | * newStart: index where the current block is to be moved | |
1108 | * (right after current end of already-compacted data) | |
1109 | */ | |
1110 | ||
1111 | /* search for an identical block */ | |
1112 | if( (movedStart=findSameIndex2Block(trie->index2, newStart, start)) | |
1113 | >=0 | |
1114 | ) { | |
1115 | /* found an identical block, set the other block's index value for the current block */ | |
1116 | trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart; | |
1117 | ||
1118 | /* advance start to the next block */ | |
1119 | start+=UTRIE2_INDEX_2_BLOCK_LENGTH; | |
1120 | ||
1121 | /* leave newStart with the previous block! */ | |
1122 | continue; | |
1123 | } | |
1124 | ||
1125 | /* see if the beginning of this block can be overlapped with the end of the previous block */ | |
1126 | /* look for maximum overlap with the previous, adjacent block */ | |
1127 | for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1; | |
1128 | overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap); | |
1129 | --overlap) {} | |
1130 | ||
1131 | if(overlap>0 || newStart<start) { | |
1132 | /* some overlap, or just move the whole block */ | |
1133 | trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap; | |
1134 | ||
1135 | /* move the non-overlapping indexes to their new positions */ | |
1136 | start+=overlap; | |
1137 | for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) { | |
1138 | trie->index2[newStart++]=trie->index2[start++]; | |
1139 | } | |
1140 | } else /* no overlap && newStart==start */ { | |
1141 | trie->map[start>>UTRIE2_SHIFT_1_2]=start; | |
1142 | start+=UTRIE2_INDEX_2_BLOCK_LENGTH; | |
1143 | newStart=start; | |
1144 | } | |
1145 | } | |
1146 | ||
1147 | /* now adjust the index-1 table */ | |
1148 | for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) { | |
1149 | trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2]; | |
1150 | } | |
1151 | trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2]; | |
1152 | ||
1153 | /* | |
1154 | * Ensure data table alignment: | |
1155 | * Needs to be granularity-aligned for 16-bit trie | |
1156 | * (so that dataMove will be down-shiftable), | |
1157 | * and 2-aligned for uint32_t data. | |
1158 | */ | |
1159 | while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) { | |
1160 | /* Arbitrary value: 0x3fffc not possible for real data. */ | |
1161 | trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT; | |
1162 | } | |
1163 | ||
1164 | #ifdef UTRIE2_DEBUG | |
1165 | /* we saved some space */ | |
1166 | printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n", | |
1167 | (long)trie->index2Length, (long)newStart); | |
1168 | #endif | |
1169 | ||
1170 | trie->index2Length=newStart; | |
1171 | } | |
1172 | ||
1173 | static void | |
1174 | compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) { | |
1175 | UNewTrie2 *newTrie; | |
1176 | UChar32 highStart, suppHighStart; | |
1177 | uint32_t highValue; | |
1178 | ||
1179 | newTrie=trie->newTrie; | |
1180 | ||
1181 | /* find highStart and round it up */ | |
1182 | highValue=utrie2_get32(trie, 0x10ffff); | |
1183 | highStart=findHighStart(newTrie, highValue); | |
1184 | highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1); | |
1185 | if(highStart==0x110000) { | |
1186 | highValue=trie->errorValue; | |
1187 | } | |
1188 | ||
1189 | /* | |
1190 | * Set trie->highStart only after utrie2_get32(trie, highStart). | |
1191 | * Otherwise utrie2_get32(trie, highStart) would try to read the highValue. | |
1192 | */ | |
1193 | trie->highStart=newTrie->highStart=highStart; | |
1194 | ||
1195 | #ifdef UTRIE2_DEBUG | |
1196 | printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n", | |
1197 | (long)highStart, (long)highValue, (long)trie->initialValue); | |
1198 | #endif | |
1199 | ||
1200 | if(highStart<0x110000) { | |
1201 | /* Blank out [highStart..10ffff] to release associated data blocks. */ | |
1202 | suppHighStart= highStart<=0x10000 ? 0x10000 : highStart; | |
1203 | utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode); | |
1204 | if(U_FAILURE(*pErrorCode)) { | |
1205 | return; | |
1206 | } | |
1207 | } | |
1208 | ||
1209 | compactData(newTrie); | |
1210 | if(highStart>0x10000) { | |
1211 | compactIndex2(newTrie); | |
1212 | #ifdef UTRIE2_DEBUG | |
1213 | } else { | |
1214 | printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n", | |
1215 | (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET); | |
1216 | #endif | |
1217 | } | |
1218 | ||
1219 | /* | |
1220 | * Store the highValue in the data array and round up the dataLength. | |
1221 | * Must be done after compactData() because that assumes that dataLength | |
1222 | * is a multiple of UTRIE2_DATA_BLOCK_LENGTH. | |
1223 | */ | |
1224 | newTrie->data[newTrie->dataLength++]=highValue; | |
1225 | while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) { | |
1226 | newTrie->data[newTrie->dataLength++]=trie->initialValue; | |
1227 | } | |
1228 | ||
1229 | newTrie->isCompacted=TRUE; | |
1230 | } | |
1231 | ||
1232 | /* serialization ------------------------------------------------------------ */ | |
1233 | ||
1234 | /** | |
1235 | * Maximum length of the runtime index array. | |
1236 | * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength. | |
1237 | * (The actual maximum length is lower, | |
1238 | * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.) | |
1239 | */ | |
1240 | #define UTRIE2_MAX_INDEX_LENGTH 0xffff | |
1241 | ||
1242 | /** | |
1243 | * Maximum length of the runtime data array. | |
1244 | * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT, | |
1245 | * and by uint16_t UTrie2Header.shiftedDataLength. | |
1246 | */ | |
1247 | #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT) | |
1248 | ||
1249 | /* Compact and internally serialize the trie. */ | |
1250 | U_CAPI void U_EXPORT2 | |
1251 | utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) { | |
1252 | UNewTrie2 *newTrie; | |
1253 | UTrie2Header *header; | |
1254 | uint32_t *p; | |
1255 | uint16_t *dest16; | |
1256 | int32_t i, length; | |
1257 | int32_t allIndexesLength; | |
1258 | int32_t dataMove; /* >0 if the data is moved to the end of the index array */ | |
1259 | UChar32 highStart; | |
1260 | ||
1261 | /* argument check */ | |
1262 | if(U_FAILURE(*pErrorCode)) { | |
1263 | return; | |
1264 | } | |
1265 | if( trie==NULL || | |
1266 | valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits | |
1267 | ) { | |
1268 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1269 | return; | |
1270 | } | |
1271 | newTrie=trie->newTrie; | |
1272 | if(newTrie==NULL) { | |
1273 | /* already frozen */ | |
1274 | UTrie2ValueBits frozenValueBits= | |
1275 | trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS; | |
1276 | if(valueBits!=frozenValueBits) { | |
1277 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1278 | } | |
1279 | return; | |
1280 | } | |
1281 | ||
1282 | /* compact if necessary */ | |
1283 | if(!newTrie->isCompacted) { | |
1284 | compactTrie(trie, pErrorCode); | |
1285 | if(U_FAILURE(*pErrorCode)) { | |
1286 | return; | |
1287 | } | |
1288 | } | |
1289 | highStart=trie->highStart; | |
1290 | ||
1291 | if(highStart<=0x10000) { | |
1292 | allIndexesLength=UTRIE2_INDEX_1_OFFSET; | |
1293 | } else { | |
1294 | allIndexesLength=newTrie->index2Length; | |
1295 | } | |
1296 | if(valueBits==UTRIE2_16_VALUE_BITS) { | |
1297 | dataMove=allIndexesLength; | |
1298 | } else { | |
1299 | dataMove=0; | |
1300 | } | |
1301 | ||
1302 | /* are indexLength and dataLength within limits? */ | |
1303 | if( /* for unshifted indexLength */ | |
1304 | allIndexesLength>UTRIE2_MAX_INDEX_LENGTH || | |
1305 | /* for unshifted dataNullOffset */ | |
1306 | (dataMove+newTrie->dataNullOffset)>0xffff || | |
1307 | /* for unshifted 2-byte UTF-8 index-2 values */ | |
1308 | (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff || | |
1309 | /* for shiftedDataLength */ | |
1310 | (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH | |
1311 | ) { | |
1312 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1313 | return; | |
1314 | } | |
1315 | ||
1316 | /* calculate the total serialized length */ | |
1317 | length=sizeof(UTrie2Header)+allIndexesLength*2; | |
1318 | if(valueBits==UTRIE2_16_VALUE_BITS) { | |
1319 | length+=newTrie->dataLength*2; | |
1320 | } else { | |
1321 | length+=newTrie->dataLength*4; | |
1322 | } | |
1323 | ||
1324 | trie->memory=uprv_malloc(length); | |
1325 | if(trie->memory==NULL) { | |
1326 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
1327 | return; | |
1328 | } | |
1329 | trie->length=length; | |
1330 | trie->isMemoryOwned=TRUE; | |
1331 | ||
1332 | trie->indexLength=allIndexesLength; | |
1333 | trie->dataLength=newTrie->dataLength; | |
1334 | if(highStart<=0x10000) { | |
1335 | trie->index2NullOffset=0xffff; | |
1336 | } else { | |
1337 | trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset; | |
1338 | } | |
1339 | trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset); | |
1340 | trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY; | |
1341 | ||
1342 | /* set the header fields */ | |
1343 | header=(UTrie2Header *)trie->memory; | |
1344 | ||
1345 | header->signature=UTRIE2_SIG; /* "Tri2" */ | |
1346 | header->options=(uint16_t)valueBits; | |
1347 | ||
1348 | header->indexLength=(uint16_t)trie->indexLength; | |
1349 | header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT); | |
1350 | header->index2NullOffset=trie->index2NullOffset; | |
1351 | header->dataNullOffset=trie->dataNullOffset; | |
1352 | header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1); | |
1353 | ||
1354 | /* fill the index and data arrays */ | |
1355 | dest16=(uint16_t *)(header+1); | |
1356 | trie->index=dest16; | |
1357 | ||
1358 | /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */ | |
1359 | p=(uint32_t *)newTrie->index2; | |
1360 | for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) { | |
1361 | *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); | |
1362 | } | |
1363 | ||
1364 | /* write UTF-8 2-byte index-2 values, not right-shifted */ | |
1365 | for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */ | |
1366 | *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET); | |
1367 | } | |
1368 | for(; i<(0xe0-0xc0); ++i) { /* C2..DF */ | |
1369 | *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]); | |
1370 | } | |
1371 | ||
1372 | if(highStart>0x10000) { | |
1373 | int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1; | |
1374 | int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length; | |
1375 | ||
1376 | /* write 16-bit index-1 values for supplementary code points */ | |
1377 | p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH; | |
1378 | for(i=index1Length; i>0; --i) { | |
1379 | *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++); | |
1380 | } | |
1381 | ||
1382 | /* | |
1383 | * write the index-2 array values for supplementary code points, | |
1384 | * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove | |
1385 | */ | |
1386 | p=(uint32_t *)newTrie->index2+index2Offset; | |
1387 | for(i=newTrie->index2Length-index2Offset; i>0; --i) { | |
1388 | *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT); | |
1389 | } | |
1390 | } | |
1391 | ||
1392 | /* write the 16/32-bit data array */ | |
1393 | switch(valueBits) { | |
1394 | case UTRIE2_16_VALUE_BITS: | |
1395 | /* write 16-bit data values */ | |
1396 | trie->data16=dest16; | |
1397 | trie->data32=NULL; | |
1398 | p=newTrie->data; | |
1399 | for(i=newTrie->dataLength; i>0; --i) { | |
1400 | *dest16++=(uint16_t)*p++; | |
1401 | } | |
1402 | break; | |
1403 | case UTRIE2_32_VALUE_BITS: | |
1404 | /* write 32-bit data values */ | |
1405 | trie->data16=NULL; | |
1406 | trie->data32=(uint32_t *)dest16; | |
a62d09fc | 1407 | uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4); |
729e4ab9 A |
1408 | break; |
1409 | default: | |
1410 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1411 | return; | |
1412 | } | |
1413 | ||
1414 | /* Delete the UNewTrie2. */ | |
1415 | uprv_free(newTrie->data); | |
1416 | uprv_free(newTrie); | |
1417 | trie->newTrie=NULL; | |
1418 | } | |
1419 | ||
729e4ab9 A |
1420 | /* |
1421 | * This is here to avoid a dependency from utrie2.cpp on utrie.c. | |
1422 | * This file already depends on utrie.c. | |
1423 | * Otherwise, this should be in utrie2.cpp right after utrie2_swap(). | |
1424 | */ | |
1425 | U_CAPI int32_t U_EXPORT2 | |
1426 | utrie2_swapAnyVersion(const UDataSwapper *ds, | |
1427 | const void *inData, int32_t length, void *outData, | |
1428 | UErrorCode *pErrorCode) { | |
1429 | if(U_SUCCESS(*pErrorCode)) { | |
1430 | switch(utrie2_getVersion(inData, length, TRUE)) { | |
1431 | case 1: | |
1432 | return utrie_swap(ds, inData, length, outData, pErrorCode); | |
1433 | case 2: | |
1434 | return utrie2_swap(ds, inData, length, outData, pErrorCode); | |
1435 | default: | |
1436 | *pErrorCode=U_INVALID_FORMAT_ERROR; | |
1437 | return 0; | |
1438 | } | |
1439 | } | |
1440 | return 0; | |
1441 | } |