]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4388f060 | 4 | * Copyright (C) 2002-2011, International Business Machines |
729e4ab9 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: propsvec.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2002feb22 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Store bits (Unicode character properties) in bit set vectors. | |
17 | */ | |
18 | ||
19 | #include <stdlib.h> | |
20 | #include "unicode/utypes.h" | |
21 | #include "cmemory.h" | |
22 | #include "utrie.h" | |
23 | #include "utrie2.h" | |
24 | #include "uarrsort.h" | |
25 | #include "propsvec.h" | |
4388f060 | 26 | #include "uassert.h" |
729e4ab9 A |
27 | |
28 | struct UPropsVectors { | |
29 | uint32_t *v; | |
30 | int32_t columns; /* number of columns, plus two for start & limit values */ | |
31 | int32_t maxRows; | |
32 | int32_t rows; | |
33 | int32_t prevRow; /* search optimization: remember last row seen */ | |
34 | UBool isCompacted; | |
35 | }; | |
36 | ||
37 | #define UPVEC_INITIAL_ROWS (1<<12) | |
38 | #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16) | |
39 | #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1) | |
40 | ||
41 | U_CAPI UPropsVectors * U_EXPORT2 | |
42 | upvec_open(int32_t columns, UErrorCode *pErrorCode) { | |
43 | UPropsVectors *pv; | |
44 | uint32_t *v, *row; | |
45 | uint32_t cp; | |
46 | ||
47 | if(U_FAILURE(*pErrorCode)) { | |
48 | return NULL; | |
49 | } | |
50 | if(columns<1) { | |
51 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
52 | return NULL; | |
53 | } | |
54 | columns+=2; /* count range start and limit columns */ | |
55 | ||
56 | pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors)); | |
57 | v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4); | |
58 | if(pv==NULL || v==NULL) { | |
59 | uprv_free(pv); | |
60 | uprv_free(v); | |
61 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
62 | return NULL; | |
63 | } | |
64 | uprv_memset(pv, 0, sizeof(UPropsVectors)); | |
65 | pv->v=v; | |
66 | pv->columns=columns; | |
67 | pv->maxRows=UPVEC_INITIAL_ROWS; | |
68 | pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP); | |
69 | ||
70 | /* set the all-Unicode row and the special-value rows */ | |
71 | row=pv->v; | |
72 | uprv_memset(row, 0, pv->rows*columns*4); | |
73 | row[0]=0; | |
74 | row[1]=0x110000; | |
75 | row+=columns; | |
76 | for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) { | |
77 | row[0]=cp; | |
78 | row[1]=cp+1; | |
79 | row+=columns; | |
80 | } | |
81 | return pv; | |
82 | } | |
83 | ||
84 | U_CAPI void U_EXPORT2 | |
85 | upvec_close(UPropsVectors *pv) { | |
86 | if(pv!=NULL) { | |
87 | uprv_free(pv->v); | |
88 | uprv_free(pv); | |
89 | } | |
90 | } | |
91 | ||
92 | static uint32_t * | |
93 | _findRow(UPropsVectors *pv, UChar32 rangeStart) { | |
94 | uint32_t *row; | |
4388f060 | 95 | int32_t columns, i, start, limit, prevRow; |
729e4ab9 A |
96 | |
97 | columns=pv->columns; | |
4388f060 | 98 | limit=pv->rows; |
729e4ab9 A |
99 | prevRow=pv->prevRow; |
100 | ||
101 | /* check the vicinity of the last-seen row (start searching with an unrolled loop) */ | |
102 | row=pv->v+prevRow*columns; | |
103 | if(rangeStart>=(UChar32)row[0]) { | |
104 | if(rangeStart<(UChar32)row[1]) { | |
105 | /* same row as last seen */ | |
106 | return row; | |
107 | } else if(rangeStart<(UChar32)(row+=columns)[1]) { | |
108 | /* next row after the last one */ | |
109 | pv->prevRow=prevRow+1; | |
110 | return row; | |
111 | } else if(rangeStart<(UChar32)(row+=columns)[1]) { | |
112 | /* second row after the last one */ | |
113 | pv->prevRow=prevRow+2; | |
114 | return row; | |
115 | } else if((rangeStart-(UChar32)row[1])<10) { | |
116 | /* we are close, continue looping */ | |
117 | prevRow+=2; | |
118 | do { | |
119 | ++prevRow; | |
120 | row+=columns; | |
121 | } while(rangeStart>=(UChar32)row[1]); | |
122 | pv->prevRow=prevRow; | |
123 | return row; | |
124 | } | |
125 | } else if(rangeStart<(UChar32)pv->v[1]) { | |
126 | /* the very first row */ | |
127 | pv->prevRow=0; | |
128 | return pv->v; | |
129 | } | |
130 | ||
131 | /* do a binary search for the start of the range */ | |
132 | start=0; | |
133 | while(start<limit-1) { | |
134 | i=(start+limit)/2; | |
135 | row=pv->v+i*columns; | |
136 | if(rangeStart<(UChar32)row[0]) { | |
137 | limit=i; | |
138 | } else if(rangeStart<(UChar32)row[1]) { | |
139 | pv->prevRow=i; | |
140 | return row; | |
141 | } else { | |
142 | start=i; | |
143 | } | |
144 | } | |
145 | ||
146 | /* must be found because all ranges together always cover all of Unicode */ | |
147 | pv->prevRow=start; | |
148 | return pv->v+start*columns; | |
149 | } | |
150 | ||
151 | U_CAPI void U_EXPORT2 | |
152 | upvec_setValue(UPropsVectors *pv, | |
153 | UChar32 start, UChar32 end, | |
154 | int32_t column, | |
155 | uint32_t value, uint32_t mask, | |
156 | UErrorCode *pErrorCode) { | |
157 | uint32_t *firstRow, *lastRow; | |
158 | int32_t columns; | |
159 | UChar32 limit; | |
160 | UBool splitFirstRow, splitLastRow; | |
161 | ||
162 | /* argument checking */ | |
163 | if(U_FAILURE(*pErrorCode)) { | |
164 | return; | |
165 | } | |
166 | if( pv==NULL || | |
167 | start<0 || start>end || end>UPVEC_MAX_CP || | |
168 | column<0 || column>=(pv->columns-2) | |
169 | ) { | |
170 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
171 | return; | |
172 | } | |
173 | if(pv->isCompacted) { | |
174 | *pErrorCode=U_NO_WRITE_PERMISSION; | |
175 | return; | |
176 | } | |
177 | limit=end+1; | |
178 | ||
179 | /* initialize */ | |
180 | columns=pv->columns; | |
181 | column+=2; /* skip range start and limit columns */ | |
182 | value&=mask; | |
183 | ||
184 | /* find the rows whose ranges overlap with the input range */ | |
185 | ||
186 | /* find the first and last rows, always successful */ | |
187 | firstRow=_findRow(pv, start); | |
188 | lastRow=_findRow(pv, end); | |
189 | ||
190 | /* | |
191 | * Rows need to be split if they partially overlap with the | |
192 | * input range (only possible for the first and last rows) | |
193 | * and if their value differs from the input value. | |
194 | */ | |
195 | splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask)); | |
196 | splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask)); | |
197 | ||
198 | /* split first/last rows if necessary */ | |
199 | if(splitFirstRow || splitLastRow) { | |
200 | int32_t count, rows; | |
201 | ||
202 | rows=pv->rows; | |
203 | if((rows+splitFirstRow+splitLastRow)>pv->maxRows) { | |
204 | uint32_t *newVectors; | |
205 | int32_t newMaxRows; | |
206 | ||
207 | if(pv->maxRows<UPVEC_MEDIUM_ROWS) { | |
208 | newMaxRows=UPVEC_MEDIUM_ROWS; | |
209 | } else if(pv->maxRows<UPVEC_MAX_ROWS) { | |
210 | newMaxRows=UPVEC_MAX_ROWS; | |
211 | } else { | |
212 | /* Implementation bug, or UPVEC_MAX_ROWS too low. */ | |
213 | *pErrorCode=U_INTERNAL_PROGRAM_ERROR; | |
214 | return; | |
215 | } | |
216 | newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4); | |
217 | if(newVectors==NULL) { | |
218 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
219 | return; | |
220 | } | |
a62d09fc | 221 | uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4); |
729e4ab9 A |
222 | firstRow=newVectors+(firstRow-pv->v); |
223 | lastRow=newVectors+(lastRow-pv->v); | |
224 | uprv_free(pv->v); | |
225 | pv->v=newVectors; | |
226 | pv->maxRows=newMaxRows; | |
227 | } | |
228 | ||
229 | /* count the number of row cells to move after the last row, and move them */ | |
230 | count = (int32_t)((pv->v+rows*columns)-(lastRow+columns)); | |
231 | if(count>0) { | |
232 | uprv_memmove( | |
233 | lastRow+(1+splitFirstRow+splitLastRow)*columns, | |
234 | lastRow+columns, | |
235 | count*4); | |
236 | } | |
237 | pv->rows=rows+splitFirstRow+splitLastRow; | |
238 | ||
239 | /* split the first row, and move the firstRow pointer to the second part */ | |
240 | if(splitFirstRow) { | |
241 | /* copy all affected rows up one and move the lastRow pointer */ | |
242 | count = (int32_t)((lastRow-firstRow)+columns); | |
a62d09fc | 243 | uprv_memmove(firstRow+columns, firstRow, (size_t)count*4); |
729e4ab9 A |
244 | lastRow+=columns; |
245 | ||
246 | /* split the range and move the firstRow pointer */ | |
247 | firstRow[1]=firstRow[columns]=(uint32_t)start; | |
248 | firstRow+=columns; | |
249 | } | |
250 | ||
251 | /* split the last row */ | |
252 | if(splitLastRow) { | |
253 | /* copy the last row data */ | |
a62d09fc | 254 | uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4); |
729e4ab9 A |
255 | |
256 | /* split the range and move the firstRow pointer */ | |
257 | lastRow[1]=lastRow[columns]=(uint32_t)limit; | |
258 | } | |
259 | } | |
260 | ||
261 | /* set the "row last seen" to the last row for the range */ | |
262 | pv->prevRow=(int32_t)((lastRow-(pv->v))/columns); | |
263 | ||
264 | /* set the input value in all remaining rows */ | |
265 | firstRow+=column; | |
266 | lastRow+=column; | |
267 | mask=~mask; | |
268 | for(;;) { | |
269 | *firstRow=(*firstRow&mask)|value; | |
270 | if(firstRow==lastRow) { | |
271 | break; | |
272 | } | |
273 | firstRow+=columns; | |
274 | } | |
275 | } | |
276 | ||
277 | U_CAPI uint32_t U_EXPORT2 | |
278 | upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) { | |
279 | uint32_t *row; | |
280 | UPropsVectors *ncpv; | |
281 | ||
282 | if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) { | |
283 | return 0; | |
284 | } | |
285 | ncpv=(UPropsVectors *)pv; | |
286 | row=_findRow(ncpv, c); | |
287 | return row[2+column]; | |
288 | } | |
289 | ||
290 | U_CAPI uint32_t * U_EXPORT2 | |
291 | upvec_getRow(const UPropsVectors *pv, int32_t rowIndex, | |
292 | UChar32 *pRangeStart, UChar32 *pRangeEnd) { | |
293 | uint32_t *row; | |
294 | int32_t columns; | |
295 | ||
296 | if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) { | |
297 | return NULL; | |
298 | } | |
299 | ||
300 | columns=pv->columns; | |
301 | row=pv->v+rowIndex*columns; | |
302 | if(pRangeStart!=NULL) { | |
303 | *pRangeStart=(UChar32)row[0]; | |
304 | } | |
305 | if(pRangeEnd!=NULL) { | |
306 | *pRangeEnd=(UChar32)row[1]-1; | |
307 | } | |
308 | return row+2; | |
309 | } | |
310 | ||
311 | static int32_t U_CALLCONV | |
312 | upvec_compareRows(const void *context, const void *l, const void *r) { | |
313 | const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r; | |
314 | const UPropsVectors *pv=(const UPropsVectors *)context; | |
315 | int32_t i, count, columns; | |
316 | ||
317 | count=columns=pv->columns; /* includes start/limit columns */ | |
318 | ||
319 | /* start comparing after start/limit but wrap around to them */ | |
320 | i=2; | |
321 | do { | |
322 | if(left[i]!=right[i]) { | |
323 | return left[i]<right[i] ? -1 : 1; | |
324 | } | |
325 | if(++i==columns) { | |
326 | i=0; | |
327 | } | |
328 | } while(--count>0); | |
329 | ||
330 | return 0; | |
331 | } | |
332 | ||
333 | U_CAPI void U_EXPORT2 | |
334 | upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) { | |
335 | uint32_t *row; | |
336 | int32_t i, columns, valueColumns, rows, count; | |
337 | UChar32 start, limit; | |
338 | ||
339 | /* argument checking */ | |
340 | if(U_FAILURE(*pErrorCode)) { | |
341 | return; | |
342 | } | |
343 | if(handler==NULL) { | |
344 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
345 | return; | |
346 | } | |
347 | if(pv->isCompacted) { | |
348 | return; | |
349 | } | |
350 | ||
351 | /* Set the flag now: Sorting and compacting destroys the builder data structure. */ | |
352 | pv->isCompacted=TRUE; | |
353 | ||
354 | rows=pv->rows; | |
355 | columns=pv->columns; | |
4388f060 | 356 | U_ASSERT(columns>=3); /* upvec_open asserts this */ |
729e4ab9 A |
357 | valueColumns=columns-2; /* not counting start & limit */ |
358 | ||
359 | /* sort the properties vectors to find unique vector values */ | |
360 | uprv_sortArray(pv->v, rows, columns*4, | |
361 | upvec_compareRows, pv, FALSE, pErrorCode); | |
362 | if(U_FAILURE(*pErrorCode)) { | |
363 | return; | |
364 | } | |
365 | ||
366 | /* | |
367 | * Find and set the special values. | |
368 | * This has to do almost the same work as the compaction below, | |
369 | * to find the indexes where the special-value rows will move. | |
370 | */ | |
371 | row=pv->v; | |
372 | count=-valueColumns; | |
373 | for(i=0; i<rows; ++i) { | |
374 | start=(UChar32)row[0]; | |
375 | ||
376 | /* count a new values vector if it is different from the current one */ | |
377 | if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) { | |
378 | count+=valueColumns; | |
379 | } | |
380 | ||
381 | if(start>=UPVEC_FIRST_SPECIAL_CP) { | |
382 | handler(context, start, start, count, row+2, valueColumns, pErrorCode); | |
383 | if(U_FAILURE(*pErrorCode)) { | |
384 | return; | |
385 | } | |
386 | } | |
387 | ||
388 | row+=columns; | |
389 | } | |
390 | ||
391 | /* count is at the beginning of the last vector, add valueColumns to include that last vector */ | |
392 | count+=valueColumns; | |
393 | ||
394 | /* Call the handler once more to signal the start of delivering real values. */ | |
395 | handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP, | |
396 | count, row-valueColumns, valueColumns, pErrorCode); | |
397 | if(U_FAILURE(*pErrorCode)) { | |
398 | return; | |
399 | } | |
400 | ||
401 | /* | |
402 | * Move vector contents up to a contiguous array with only unique | |
403 | * vector values, and call the handler function for each vector. | |
404 | * | |
405 | * This destroys the Properties Vector structure and replaces it | |
406 | * with an array of just vector values. | |
407 | */ | |
408 | row=pv->v; | |
409 | count=-valueColumns; | |
410 | for(i=0; i<rows; ++i) { | |
411 | /* fetch these first before memmove() may overwrite them */ | |
412 | start=(UChar32)row[0]; | |
413 | limit=(UChar32)row[1]; | |
414 | ||
415 | /* add a new values vector if it is different from the current one */ | |
416 | if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) { | |
417 | count+=valueColumns; | |
a62d09fc | 418 | uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4); |
729e4ab9 A |
419 | } |
420 | ||
421 | if(start<UPVEC_FIRST_SPECIAL_CP) { | |
422 | handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode); | |
423 | if(U_FAILURE(*pErrorCode)) { | |
424 | return; | |
425 | } | |
426 | } | |
427 | ||
428 | row+=columns; | |
429 | } | |
430 | ||
431 | /* count is at the beginning of the last vector, add one to include that last vector */ | |
432 | pv->rows=count/valueColumns+1; | |
433 | } | |
434 | ||
435 | U_CAPI const uint32_t * U_EXPORT2 | |
436 | upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) { | |
437 | if(!pv->isCompacted) { | |
438 | return NULL; | |
439 | } | |
440 | if(pRows!=NULL) { | |
441 | *pRows=pv->rows; | |
442 | } | |
443 | if(pColumns!=NULL) { | |
444 | *pColumns=pv->columns-2; | |
445 | } | |
446 | return pv->v; | |
447 | } | |
448 | ||
449 | U_CAPI uint32_t * U_EXPORT2 | |
450 | upvec_cloneArray(const UPropsVectors *pv, | |
451 | int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) { | |
452 | uint32_t *clonedArray; | |
453 | int32_t byteLength; | |
454 | ||
455 | if(U_FAILURE(*pErrorCode)) { | |
456 | return NULL; | |
457 | } | |
458 | if(!pv->isCompacted) { | |
459 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
460 | return NULL; | |
461 | } | |
462 | byteLength=pv->rows*(pv->columns-2)*4; | |
463 | clonedArray=(uint32_t *)uprv_malloc(byteLength); | |
464 | if(clonedArray==NULL) { | |
465 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
466 | return NULL; | |
467 | } | |
468 | uprv_memcpy(clonedArray, pv->v, byteLength); | |
469 | if(pRows!=NULL) { | |
470 | *pRows=pv->rows; | |
471 | } | |
472 | if(pColumns!=NULL) { | |
473 | *pColumns=pv->columns-2; | |
474 | } | |
475 | return clonedArray; | |
476 | } | |
477 | ||
478 | U_CAPI UTrie2 * U_EXPORT2 | |
479 | upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) { | |
480 | UPVecToUTrie2Context toUTrie2={ NULL }; | |
481 | upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode); | |
482 | utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode); | |
483 | if(U_FAILURE(*pErrorCode)) { | |
484 | utrie2_close(toUTrie2.trie); | |
485 | toUTrie2.trie=NULL; | |
486 | } | |
487 | return toUTrie2.trie; | |
488 | } | |
489 | ||
490 | /* | |
491 | * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts | |
492 | * some 16-bit field and builds and returns a UTrie2. | |
493 | */ | |
494 | ||
495 | U_CAPI void U_CALLCONV | |
496 | upvec_compactToUTrie2Handler(void *context, | |
497 | UChar32 start, UChar32 end, | |
498 | int32_t rowIndex, uint32_t *row, int32_t columns, | |
499 | UErrorCode *pErrorCode) { | |
500 | UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context; | |
501 | if(start<UPVEC_FIRST_SPECIAL_CP) { | |
502 | utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode); | |
503 | } else { | |
504 | switch(start) { | |
505 | case UPVEC_INITIAL_VALUE_CP: | |
506 | toUTrie2->initialValue=rowIndex; | |
507 | break; | |
508 | case UPVEC_ERROR_VALUE_CP: | |
509 | toUTrie2->errorValue=rowIndex; | |
510 | break; | |
511 | case UPVEC_START_REAL_VALUES_CP: | |
512 | toUTrie2->maxValue=rowIndex; | |
513 | if(rowIndex>0xffff) { | |
514 | /* too many rows for a 16-bit trie */ | |
515 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
516 | } else { | |
517 | toUTrie2->trie=utrie2_open(toUTrie2->initialValue, | |
518 | toUTrie2->errorValue, pErrorCode); | |
519 | } | |
520 | break; | |
521 | default: | |
522 | break; | |
523 | } | |
524 | } | |
525 | } |