]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_wgt.cpp
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_wgt.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
729e4ab9 4* Copyright (C) 1999-2010, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_wgt.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2001mar08
14* created by: Markus W. Scherer
15*
16* This file contains code for allocating n collation element weights
17* between two exclusive limits.
18* It is used only internally by ucol_bld.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "ucol_imp.h"
26#include "ucol_wgt.h"
27#include "cmemory.h"
374ca955 28#include "uarrsort.h"
b75a7d8f
A
29
30#ifdef UCOL_DEBUG
31# include <stdio.h>
32#endif
33
b75a7d8f
A
34/* collation element weight allocation -------------------------------------- */
35
36/* helper functions for CE weights */
37
374ca955 38static U_INLINE int32_t
b75a7d8f
A
39lengthOfWeight(uint32_t weight) {
40 if((weight&0xffffff)==0) {
41 return 1;
42 } else if((weight&0xffff)==0) {
43 return 2;
44 } else if((weight&0xff)==0) {
45 return 3;
46 } else {
47 return 4;
48 }
49}
50
374ca955 51static U_INLINE uint32_t
b75a7d8f
A
52getWeightTrail(uint32_t weight, int32_t length) {
53 return (uint32_t)(weight>>(8*(4-length)))&0xff;
54}
55
374ca955 56static U_INLINE uint32_t
b75a7d8f
A
57setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
58 length=8*(4-length);
59 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
60}
61
374ca955 62static U_INLINE uint32_t
729e4ab9
A
63getWeightByte(uint32_t weight, int32_t idx) {
64 return getWeightTrail(weight, idx); /* same calculation */
b75a7d8f
A
65}
66
374ca955 67static U_INLINE uint32_t
729e4ab9 68setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
b75a7d8f
A
69 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
70
729e4ab9
A
71 idx*=8;
72 mask=((uint32_t)0xffffffff)>>idx;
73 idx=32-idx;
74 mask|=0xffffff00<<idx;
75 return (uint32_t)((weight&mask)|(byte<<idx));
b75a7d8f
A
76}
77
374ca955 78static U_INLINE uint32_t
b75a7d8f
A
79truncateWeight(uint32_t weight, int32_t length) {
80 return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
81}
82
374ca955 83static U_INLINE uint32_t
b75a7d8f
A
84incWeightTrail(uint32_t weight, int32_t length) {
85 return (uint32_t)(weight+(1UL<<(8*(4-length))));
86}
87
374ca955 88static U_INLINE uint32_t
b75a7d8f
A
89decWeightTrail(uint32_t weight, int32_t length) {
90 return (uint32_t)(weight-(1UL<<(8*(4-length))));
91}
92
374ca955 93static U_INLINE uint32_t
b75a7d8f
A
94incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
95 uint32_t byte;
96
97 for(;;) {
98 byte=getWeightByte(weight, length);
99 if(byte<maxByte) {
100 return setWeightByte(weight, length, byte+1);
101 } else {
102 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
103 weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
104 --length;
105 }
106 }
107}
108
374ca955 109static U_INLINE int32_t
b75a7d8f
A
110lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
111 int32_t length;
112
113 length=range->length2+1;
114 range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
115 range->end=setWeightTrail(range->end, length, maxByte);
116 range->count2*=countBytes;
117 range->length2=length;
118 return length;
119}
120
374ca955 121/* for uprv_sortArray: sort ranges in weight order */
73c04bcf 122static int32_t U_CALLCONV
729e4ab9 123compareRanges(const void * /*context*/, const void *left, const void *right) {
b75a7d8f
A
124 uint32_t l, r;
125
126 l=((const WeightRange *)left)->start;
127 r=((const WeightRange *)right)->start;
128 if(l<r) {
129 return -1;
130 } else if(l>r) {
131 return 1;
132 } else {
133 return 0;
134 }
135}
136
137/*
138 * take two CE weights and calculate the
139 * possible ranges of weights between the two limits, excluding them
140 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
141 */
374ca955 142static U_INLINE int32_t
b75a7d8f
A
143getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
144 uint32_t maxByte, uint32_t countBytes,
145 WeightRange ranges[7]) {
146 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
147 uint32_t weight, trail;
148 int32_t length, lowerLength, upperLength, rangeCount;
149
150 /* assume that both lowerLimit & upperLimit are not 0 */
151
152 /* get the lengths of the limits */
153 lowerLength=lengthOfWeight(lowerLimit);
154 upperLength=lengthOfWeight(upperLimit);
155
156#ifdef UCOL_DEBUG
157 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
158 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
159#endif
160
161 if(lowerLimit>=upperLimit) {
162#ifdef UCOL_DEBUG
163 printf("error: no space between lower & upper limits\n");
164#endif
165 return 0;
166 }
167
168 /* check that neither is a prefix of the other */
169 if(lowerLength<upperLength) {
170 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
171#ifdef UCOL_DEBUG
172 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
173#endif
174 return 0;
175 }
176 }
177 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
178
179 /* reset local variables */
180 uprv_memset(lower, 0, sizeof(lower));
181 uprv_memset(&middle, 0, sizeof(middle));
182 uprv_memset(upper, 0, sizeof(upper));
183
184 /*
185 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
186 * range minimum length
187 * lower[4] 4
188 * lower[3] 3
189 * lower[2] 2
190 * middle 1
191 * upper[2] 2
192 * upper[3] 3
193 * upper[4] 4
194 *
195 * We are now going to calculate up to 7 ranges.
196 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
197 */
198 weight=lowerLimit;
199 for(length=lowerLength; length>=2; --length) {
200 trail=getWeightTrail(weight, length);
201 if(trail<maxByte) {
202 lower[length].start=incWeightTrail(weight, length);
203 lower[length].end=setWeightTrail(weight, length, maxByte);
204 lower[length].length=length;
205 lower[length].count=maxByte-trail;
206 }
207 weight=truncateWeight(weight, length-1);
208 }
209 middle.start=incWeightTrail(weight, 1);
210
211 weight=upperLimit;
212 for(length=upperLength; length>=2; --length) {
213 trail=getWeightTrail(weight, length);
214 if(trail>UCOL_BYTE_FIRST_TAILORED) {
215 upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
216 upper[length].end=decWeightTrail(weight, length);
217 upper[length].length=length;
218 upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
219 }
220 weight=truncateWeight(weight, length-1);
221 }
222 middle.end=decWeightTrail(weight, 1);
223
224 /* set the middle range */
225 middle.length=1;
226 if(middle.end>=middle.start) {
227 middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
228 } else {
229 /* eliminate overlaps */
230 uint32_t start, end;
231
232 /* remove the middle range */
233 middle.count=0;
234
235 /* reduce or remove the lower ranges that go beyond upperLimit */
236 for(length=4; length>=2; --length) {
237 if(lower[length].count>0 && upper[length].count>0) {
238 start=upper[length].start;
239 end=lower[length].end;
240
241 if(end>=start || incWeight(end, length, maxByte)==start) {
242 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
243 start=lower[length].start;
244 end=lower[length].end=upper[length].end;
245 /*
246 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
247 * it may result in a range with count>countBytes
248 */
249 lower[length].count=
250 (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
251 countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
252 upper[length].count=0;
253 while(--length>=2) {
254 lower[length].count=upper[length].count=0;
255 }
256 break;
257 }
258 }
259 }
260 }
261
262#ifdef UCOL_DEBUG
263 /* print ranges */
264 for(length=4; length>=2; --length) {
265 if(lower[length].count>0) {
266 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
267 }
268 }
269 if(middle.count>0) {
270 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
271 }
272 for(length=2; length<=4; ++length) {
273 if(upper[length].count>0) {
274 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
275 }
276 }
277#endif
278
279 /* copy the ranges, shortest first, into the result array */
280 rangeCount=0;
281 if(middle.count>0) {
282 uprv_memcpy(ranges, &middle, sizeof(WeightRange));
283 rangeCount=1;
284 }
285 for(length=2; length<=4; ++length) {
286 /* copy upper first so that later the middle range is more likely the first one to use */
287 if(upper[length].count>0) {
288 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
289 ++rangeCount;
290 }
291 if(lower[length].count>0) {
292 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
293 ++rangeCount;
294 }
295 }
296 return rangeCount;
297}
298
299/*
300 * call getWeightRanges and then determine heuristically
301 * which ranges to use for a given number of weights between (excluding)
302 * two limits
303 */
304U_CFUNC int32_t
305ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
306 uint32_t n,
307 uint32_t maxByte,
308 WeightRange ranges[7]) {
309 /* number of usable byte values 3..maxByte */
310 uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
311
312 uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
313 uint32_t maxCount;
374ca955 314 int32_t i, rangeCount, minLength/*, maxLength*/;
b75a7d8f
A
315
316 /* countBytes to the power of index */
317 uint32_t powers[5];
318 /* gcc requires explicit initialization */
319 powers[0] = 1;
320 powers[1] = countBytes;
321 powers[2] = countBytes*countBytes;
322 powers[3] = countBytes*countBytes*countBytes;
323 powers[4] = countBytes*countBytes*countBytes*countBytes;
324
325#ifdef UCOL_DEBUG
326 puts("");
327#endif
328
329 rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
330 if(rangeCount<=0) {
331#ifdef UCOL_DEBUG
332 printf("error: unable to get Weight ranges\n");
333#endif
334 return 0;
335 }
336
337 /* what is the maximum number of weights with these ranges? */
338 maxCount=0;
339 for(i=0; i<rangeCount; ++i) {
340 maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
341 }
342 if(maxCount>=n) {
343#ifdef UCOL_DEBUG
344 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
345#endif
346 } else {
347#ifdef UCOL_DEBUG
348 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
349#endif
350 return 0;
351 }
352
353 /* set the length2 and count2 fields */
354 for(i=0; i<rangeCount; ++i) {
355 ranges[i].length2=ranges[i].length;
356 ranges[i].count2=(uint32_t)ranges[i].count;
357 }
358
359 /* try until we find suitably large ranges */
360 for(;;) {
361 /* get the smallest number of bytes in a range */
362 minLength=ranges[0].length2;
363
364 /* sum up the number of elements that fit into ranges of each byte length */
365 uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
366 for(i=0; i<rangeCount; ++i) {
367 lengthCounts[ranges[i].length2]+=ranges[i].count2;
368 }
369
370 /* now try to allocate n elements in the available short ranges */
371 if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
372 /* trivial cases, use the first few ranges */
373 maxCount=0;
374 rangeCount=0;
375 do {
376 maxCount+=ranges[rangeCount].count2;
377 ++rangeCount;
378 } while(n>maxCount);
379#ifdef UCOL_DEBUG
380 printf("take first %ld ranges\n", rangeCount);
381#endif
382 break;
383 } else if(n<=ranges[0].count2*countBytes) {
384 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
385 uint32_t count1, count2, power_1, power;
386
374ca955 387 /*maxLength=minLength+1;*/
b75a7d8f
A
388
389 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
390 power_1=powers[minLength-ranges[0].length];
391 power=power_1*countBytes;
392 count2=(n+power-1)/power;
393 count1=ranges[0].count-count2;
394
395 /* split the range */
396#ifdef UCOL_DEBUG
397 printf("split the first range %ld:%ld\n", count1, count2);
398#endif
399 if(count1<1) {
374ca955
A
400 rangeCount=1;
401
b75a7d8f
A
402 /* lengthen the entire range to maxLength */
403 lengthenRange(ranges, maxByte, countBytes);
404 } else {
405 /* really split the range */
406 uint32_t byte;
407
408 /* create a new range with the end and initial and current length of the old one */
409 rangeCount=2;
410 ranges[1].end=ranges[0].end;
411 ranges[1].length=ranges[0].length;
412 ranges[1].length2=minLength;
413
414 /* set the end of the first range according to count1 */
415 i=ranges[0].length;
416 byte=getWeightByte(ranges[0].start, i)+count1-1;
417
418 /*
419 * ranges[0].count and count1 may be >countBytes
420 * from merging adjacent ranges;
421 * byte>maxByte is possible
422 */
423 if(byte<=maxByte) {
424 ranges[0].end=setWeightByte(ranges[0].start, i, byte);
425 } else /* byte>maxByte */ {
426 ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
427 }
428
429 /* set the bytes in the end weight at length+1..length2 to maxByte */
430 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
431 ranges[0].end=truncateWeight(ranges[0].end, i)|
432 ((byte>>(8*i))&(byte<<(8*(4-minLength))));
433
434 /* set the start of the second range to immediately follow the end of the first one */
435 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
436
437 /* set the count values (informational) */
438 ranges[0].count=count1;
439 ranges[1].count=count2;
440
441 ranges[0].count2=count1*power_1;
442 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
443
444 /* lengthen the second range to maxLength */
445 lengthenRange(ranges+1, maxByte, countBytes);
446 }
447 break;
448 }
449
450 /* no good match, lengthen all minLength ranges and iterate */
451#ifdef UCOL_DEBUG
452 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
453#endif
454 for(i=0; ranges[i].length2==minLength; ++i) {
455 lengthenRange(ranges+i, maxByte, countBytes);
456 }
457 }
458
459 if(rangeCount>1) {
460 /* sort the ranges by weight values */
374ca955
A
461 UErrorCode errorCode=U_ZERO_ERROR;
462 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
463 /* ignore error code: we know that the internal sort function will not fail here */
b75a7d8f
A
464 }
465
466#ifdef UCOL_DEBUG
467 puts("final ranges:");
468 for(i=0; i<rangeCount; ++i) {
469 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
470 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
471 }
472#endif
473
474 /* set maxByte in ranges[0] for ucol_nextWeight() */
475 ranges[0].count=maxByte;
476
477 return rangeCount;
478}
479
480/*
481 * given a set of ranges calculated by ucol_allocWeights(),
482 * iterate through the weights
483 */
484U_CFUNC uint32_t
485ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
486 if(*pRangeCount<=0) {
487 return 0xffffffff;
488 } else {
489 uint32_t weight, maxByte;
490
491 /* get maxByte from the .count field */
492 maxByte=ranges[0].count;
493
494 /* get the next weight */
495 weight=ranges[0].start;
496 if(weight==ranges[0].end) {
497 /* this range is finished, remove it and move the following ones up */
498 if(--*pRangeCount>0) {
499 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
500 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
501 }
502 } else {
503 /* increment the weight for the next value */
504 ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
505 }
506
507 return weight;
508 }
509}
510
729e4ab9 511#if 0 // #ifdef UCOL_DEBUG
b75a7d8f
A
512
513static void
514testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
515 WeightRange ranges[8];
516 int32_t rangeCount;
517
518 rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
519 if(enumerate) {
520 uint32_t weight;
521
522 while(n>0) {
523 weight=ucol_nextWeight(ranges, &rangeCount);
524 if(weight==0xffffffff) {
525 printf("error: 0xffffffff with %lu more weights to go\n", n);
526 break;
527 }
528 printf(" 0x%08lx\n", weight);
529 --n;
530 }
531 }
532}
533
534extern int
535main(int argc, const char *argv[]) {
536#if 0
537#endif
538 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
539 testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
540 testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
541 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
542 testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
543 testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
544 testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
545 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
546 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
547 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
548 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
549 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
550 testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
551 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
552 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
553 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
554 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
555 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
556 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
557#if 0
558#endif
559 return 0;
560}
561
562#endif
563
564#endif /* #if !UCONFIG_NO_COLLATION */