]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_wgt.c
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_wgt.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 1999-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_wgt.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2001mar08
14* created by: Markus W. Scherer
15*
16* This file contains code for allocating n collation element weights
17* between two exclusive limits.
18* It is used only internally by ucol_bld.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_COLLATION
24
25#include "ucol_imp.h"
26#include "ucol_wgt.h"
27#include "cmemory.h"
28
29#ifdef UCOL_DEBUG
30# include <stdio.h>
31#endif
32
33/* we are using qsort() */
34#include <stdlib.h>
35
36#if defined(UCOL_DEBUG) && defined(WIN32)
37 /* turn off "unreferenced formal parameter" */
38# pragma warning(disable: 4100)
39#endif
40
41/* collation element weight allocation -------------------------------------- */
42
43/* helper functions for CE weights */
44
45static int32_t
46lengthOfWeight(uint32_t weight) {
47 if((weight&0xffffff)==0) {
48 return 1;
49 } else if((weight&0xffff)==0) {
50 return 2;
51 } else if((weight&0xff)==0) {
52 return 3;
53 } else {
54 return 4;
55 }
56}
57
58static uint32_t
59getWeightTrail(uint32_t weight, int32_t length) {
60 return (uint32_t)(weight>>(8*(4-length)))&0xff;
61}
62
63static uint32_t
64setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
65 length=8*(4-length);
66 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
67}
68
69static uint32_t
70getWeightByte(uint32_t weight, int32_t index) {
71 return getWeightTrail(weight, index); /* same calculation */
72}
73
74static uint32_t
75setWeightByte(uint32_t weight, int32_t index, uint32_t byte) {
76 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
77
78 index*=8;
79 mask=((uint32_t)0xffffffff)>>index;
80 index=32-index;
81 mask|=0xffffff00<<index;
82 return (uint32_t)((weight&mask)|(byte<<index));
83}
84
85static uint32_t
86truncateWeight(uint32_t weight, int32_t length) {
87 return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
88}
89
90static uint32_t
91incWeightTrail(uint32_t weight, int32_t length) {
92 return (uint32_t)(weight+(1UL<<(8*(4-length))));
93}
94
95static uint32_t
96decWeightTrail(uint32_t weight, int32_t length) {
97 return (uint32_t)(weight-(1UL<<(8*(4-length))));
98}
99
100static uint32_t
101incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
102 uint32_t byte;
103
104 for(;;) {
105 byte=getWeightByte(weight, length);
106 if(byte<maxByte) {
107 return setWeightByte(weight, length, byte+1);
108 } else {
109 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
110 weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
111 --length;
112 }
113 }
114}
115
116static int32_t
117lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
118 int32_t length;
119
120 length=range->length2+1;
121 range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
122 range->end=setWeightTrail(range->end, length, maxByte);
123 range->count2*=countBytes;
124 range->length2=length;
125 return length;
126}
127
128/* for qsort: sort ranges in weight order */
129static int
130compareRanges(const void *left, const void *right) {
131 uint32_t l, r;
132
133 l=((const WeightRange *)left)->start;
134 r=((const WeightRange *)right)->start;
135 if(l<r) {
136 return -1;
137 } else if(l>r) {
138 return 1;
139 } else {
140 return 0;
141 }
142}
143
144/*
145 * take two CE weights and calculate the
146 * possible ranges of weights between the two limits, excluding them
147 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
148 */
149static int32_t
150getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
151 uint32_t maxByte, uint32_t countBytes,
152 WeightRange ranges[7]) {
153 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
154 uint32_t weight, trail;
155 int32_t length, lowerLength, upperLength, rangeCount;
156
157 /* assume that both lowerLimit & upperLimit are not 0 */
158
159 /* get the lengths of the limits */
160 lowerLength=lengthOfWeight(lowerLimit);
161 upperLength=lengthOfWeight(upperLimit);
162
163#ifdef UCOL_DEBUG
164 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
165 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
166#endif
167
168 if(lowerLimit>=upperLimit) {
169#ifdef UCOL_DEBUG
170 printf("error: no space between lower & upper limits\n");
171#endif
172 return 0;
173 }
174
175 /* check that neither is a prefix of the other */
176 if(lowerLength<upperLength) {
177 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
178#ifdef UCOL_DEBUG
179 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
180#endif
181 return 0;
182 }
183 }
184 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
185
186 /* reset local variables */
187 uprv_memset(lower, 0, sizeof(lower));
188 uprv_memset(&middle, 0, sizeof(middle));
189 uprv_memset(upper, 0, sizeof(upper));
190
191 /*
192 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
193 * range minimum length
194 * lower[4] 4
195 * lower[3] 3
196 * lower[2] 2
197 * middle 1
198 * upper[2] 2
199 * upper[3] 3
200 * upper[4] 4
201 *
202 * We are now going to calculate up to 7 ranges.
203 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
204 */
205 weight=lowerLimit;
206 for(length=lowerLength; length>=2; --length) {
207 trail=getWeightTrail(weight, length);
208 if(trail<maxByte) {
209 lower[length].start=incWeightTrail(weight, length);
210 lower[length].end=setWeightTrail(weight, length, maxByte);
211 lower[length].length=length;
212 lower[length].count=maxByte-trail;
213 }
214 weight=truncateWeight(weight, length-1);
215 }
216 middle.start=incWeightTrail(weight, 1);
217
218 weight=upperLimit;
219 for(length=upperLength; length>=2; --length) {
220 trail=getWeightTrail(weight, length);
221 if(trail>UCOL_BYTE_FIRST_TAILORED) {
222 upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
223 upper[length].end=decWeightTrail(weight, length);
224 upper[length].length=length;
225 upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
226 }
227 weight=truncateWeight(weight, length-1);
228 }
229 middle.end=decWeightTrail(weight, 1);
230
231 /* set the middle range */
232 middle.length=1;
233 if(middle.end>=middle.start) {
234 middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
235 } else {
236 /* eliminate overlaps */
237 uint32_t start, end;
238
239 /* remove the middle range */
240 middle.count=0;
241
242 /* reduce or remove the lower ranges that go beyond upperLimit */
243 for(length=4; length>=2; --length) {
244 if(lower[length].count>0 && upper[length].count>0) {
245 start=upper[length].start;
246 end=lower[length].end;
247
248 if(end>=start || incWeight(end, length, maxByte)==start) {
249 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
250 start=lower[length].start;
251 end=lower[length].end=upper[length].end;
252 /*
253 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
254 * it may result in a range with count>countBytes
255 */
256 lower[length].count=
257 (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
258 countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
259 upper[length].count=0;
260 while(--length>=2) {
261 lower[length].count=upper[length].count=0;
262 }
263 break;
264 }
265 }
266 }
267 }
268
269#ifdef UCOL_DEBUG
270 /* print ranges */
271 for(length=4; length>=2; --length) {
272 if(lower[length].count>0) {
273 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
274 }
275 }
276 if(middle.count>0) {
277 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
278 }
279 for(length=2; length<=4; ++length) {
280 if(upper[length].count>0) {
281 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
282 }
283 }
284#endif
285
286 /* copy the ranges, shortest first, into the result array */
287 rangeCount=0;
288 if(middle.count>0) {
289 uprv_memcpy(ranges, &middle, sizeof(WeightRange));
290 rangeCount=1;
291 }
292 for(length=2; length<=4; ++length) {
293 /* copy upper first so that later the middle range is more likely the first one to use */
294 if(upper[length].count>0) {
295 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
296 ++rangeCount;
297 }
298 if(lower[length].count>0) {
299 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
300 ++rangeCount;
301 }
302 }
303 return rangeCount;
304}
305
306/*
307 * call getWeightRanges and then determine heuristically
308 * which ranges to use for a given number of weights between (excluding)
309 * two limits
310 */
311U_CFUNC int32_t
312ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
313 uint32_t n,
314 uint32_t maxByte,
315 WeightRange ranges[7]) {
316 /* number of usable byte values 3..maxByte */
317 uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
318
319 uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
320 uint32_t maxCount;
321 int32_t i, rangeCount, minLength, maxLength;
322
323 /* countBytes to the power of index */
324 uint32_t powers[5];
325 /* gcc requires explicit initialization */
326 powers[0] = 1;
327 powers[1] = countBytes;
328 powers[2] = countBytes*countBytes;
329 powers[3] = countBytes*countBytes*countBytes;
330 powers[4] = countBytes*countBytes*countBytes*countBytes;
331
332#ifdef UCOL_DEBUG
333 puts("");
334#endif
335
336 rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
337 if(rangeCount<=0) {
338#ifdef UCOL_DEBUG
339 printf("error: unable to get Weight ranges\n");
340#endif
341 return 0;
342 }
343
344 /* what is the maximum number of weights with these ranges? */
345 maxCount=0;
346 for(i=0; i<rangeCount; ++i) {
347 maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
348 }
349 if(maxCount>=n) {
350#ifdef UCOL_DEBUG
351 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
352#endif
353 } else {
354#ifdef UCOL_DEBUG
355 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
356#endif
357 return 0;
358 }
359
360 /* set the length2 and count2 fields */
361 for(i=0; i<rangeCount; ++i) {
362 ranges[i].length2=ranges[i].length;
363 ranges[i].count2=(uint32_t)ranges[i].count;
364 }
365
366 /* try until we find suitably large ranges */
367 for(;;) {
368 /* get the smallest number of bytes in a range */
369 minLength=ranges[0].length2;
370
371 /* sum up the number of elements that fit into ranges of each byte length */
372 uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
373 for(i=0; i<rangeCount; ++i) {
374 lengthCounts[ranges[i].length2]+=ranges[i].count2;
375 }
376
377 /* now try to allocate n elements in the available short ranges */
378 if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
379 /* trivial cases, use the first few ranges */
380 maxCount=0;
381 rangeCount=0;
382 do {
383 maxCount+=ranges[rangeCount].count2;
384 ++rangeCount;
385 } while(n>maxCount);
386#ifdef UCOL_DEBUG
387 printf("take first %ld ranges\n", rangeCount);
388#endif
389 break;
390 } else if(n<=ranges[0].count2*countBytes) {
391 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
392 uint32_t count1, count2, power_1, power;
393
394 rangeCount=1;
395 maxLength=minLength+1;
396
397 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
398 power_1=powers[minLength-ranges[0].length];
399 power=power_1*countBytes;
400 count2=(n+power-1)/power;
401 count1=ranges[0].count-count2;
402
403 /* split the range */
404#ifdef UCOL_DEBUG
405 printf("split the first range %ld:%ld\n", count1, count2);
406#endif
407 if(count1<1) {
408 /* lengthen the entire range to maxLength */
409 lengthenRange(ranges, maxByte, countBytes);
410 } else {
411 /* really split the range */
412 uint32_t byte;
413
414 /* create a new range with the end and initial and current length of the old one */
415 rangeCount=2;
416 ranges[1].end=ranges[0].end;
417 ranges[1].length=ranges[0].length;
418 ranges[1].length2=minLength;
419
420 /* set the end of the first range according to count1 */
421 i=ranges[0].length;
422 byte=getWeightByte(ranges[0].start, i)+count1-1;
423
424 /*
425 * ranges[0].count and count1 may be >countBytes
426 * from merging adjacent ranges;
427 * byte>maxByte is possible
428 */
429 if(byte<=maxByte) {
430 ranges[0].end=setWeightByte(ranges[0].start, i, byte);
431 } else /* byte>maxByte */ {
432 ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
433 }
434
435 /* set the bytes in the end weight at length+1..length2 to maxByte */
436 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
437 ranges[0].end=truncateWeight(ranges[0].end, i)|
438 ((byte>>(8*i))&(byte<<(8*(4-minLength))));
439
440 /* set the start of the second range to immediately follow the end of the first one */
441 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
442
443 /* set the count values (informational) */
444 ranges[0].count=count1;
445 ranges[1].count=count2;
446
447 ranges[0].count2=count1*power_1;
448 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
449
450 /* lengthen the second range to maxLength */
451 lengthenRange(ranges+1, maxByte, countBytes);
452 }
453 break;
454 }
455
456 /* no good match, lengthen all minLength ranges and iterate */
457#ifdef UCOL_DEBUG
458 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
459#endif
460 for(i=0; ranges[i].length2==minLength; ++i) {
461 lengthenRange(ranges+i, maxByte, countBytes);
462 }
463 }
464
465 if(rangeCount>1) {
466 /* sort the ranges by weight values */
467 qsort(ranges, rangeCount, sizeof(WeightRange), compareRanges);
468 }
469
470#ifdef UCOL_DEBUG
471 puts("final ranges:");
472 for(i=0; i<rangeCount; ++i) {
473 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
474 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
475 }
476#endif
477
478 /* set maxByte in ranges[0] for ucol_nextWeight() */
479 ranges[0].count=maxByte;
480
481 return rangeCount;
482}
483
484/*
485 * given a set of ranges calculated by ucol_allocWeights(),
486 * iterate through the weights
487 */
488U_CFUNC uint32_t
489ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
490 if(*pRangeCount<=0) {
491 return 0xffffffff;
492 } else {
493 uint32_t weight, maxByte;
494
495 /* get maxByte from the .count field */
496 maxByte=ranges[0].count;
497
498 /* get the next weight */
499 weight=ranges[0].start;
500 if(weight==ranges[0].end) {
501 /* this range is finished, remove it and move the following ones up */
502 if(--*pRangeCount>0) {
503 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
504 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
505 }
506 } else {
507 /* increment the weight for the next value */
508 ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
509 }
510
511 return weight;
512 }
513}
514
515#ifdef UCOL_DEBUG
516
517static void
518testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
519 WeightRange ranges[8];
520 int32_t rangeCount;
521
522 rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
523 if(enumerate) {
524 uint32_t weight;
525
526 while(n>0) {
527 weight=ucol_nextWeight(ranges, &rangeCount);
528 if(weight==0xffffffff) {
529 printf("error: 0xffffffff with %lu more weights to go\n", n);
530 break;
531 }
532 printf(" 0x%08lx\n", weight);
533 --n;
534 }
535 }
536}
537
538extern int
539main(int argc, const char *argv[]) {
540#if 0
541#endif
542 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
543 testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
544 testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
545 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
546 testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
547 testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
548 testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
549 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
550 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
551 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
552 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
553 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
554 testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
555 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
556 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
557 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
558 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
559 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
560 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
561#if 0
562#endif
563 return 0;
564}
565
566#endif
567
568#endif /* #if !UCONFIG_NO_COLLATION */