]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_wgt.cpp
2 *******************************************************************************
4 * Copyright (C) 1999-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_wgt.cpp
10 * tab size: 8 (not used)
13 * created on: 2001mar08
14 * created by: Markus W. Scherer
16 * This file contains code for allocating n collation element weights
17 * between two exclusive limits.
18 * It is used only internally by ucol_bld.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
34 /* collation element weight allocation -------------------------------------- */
36 /* helper functions for CE weights */
39 lengthOfWeight(uint32_t weight
) {
40 if((weight
&0xffffff)==0) {
42 } else if((weight
&0xffff)==0) {
44 } else if((weight
&0xff)==0) {
51 static inline uint32_t
52 getWeightTrail(uint32_t weight
, int32_t length
) {
53 return (uint32_t)(weight
>>(8*(4-length
)))&0xff;
56 static inline uint32_t
57 setWeightTrail(uint32_t weight
, int32_t length
, uint32_t trail
) {
59 return (uint32_t)((weight
&(0xffffff00<<length
))|(trail
<<length
));
62 static inline uint32_t
63 getWeightByte(uint32_t weight
, int32_t idx
) {
64 return getWeightTrail(weight
, idx
); /* same calculation */
67 static inline uint32_t
68 setWeightByte(uint32_t weight
, int32_t idx
, uint32_t byte
) {
69 uint32_t mask
; /* 0xffffffff except a 00 "hole" for the index-th byte */
73 mask
=((uint32_t)0xffffffff)>>idx
;
75 // Do not use uint32_t>>32 because on some platforms that does not shift at all
76 // while we need it to become 0.
77 // PowerPC: 0xffffffff>>32 = 0 (wanted)
78 // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
80 // ANSI C99 6.5.7 Bitwise shift operators:
81 // "If the value of the right operand is negative
82 // or is greater than or equal to the width of the promoted left operand,
83 // the behavior is undefined."
87 mask
|=0xffffff00<<idx
;
88 return (uint32_t)((weight
&mask
)|(byte
<<idx
));
91 static inline uint32_t
92 truncateWeight(uint32_t weight
, int32_t length
) {
93 return (uint32_t)(weight
&(0xffffffff<<(8*(4-length
))));
96 static inline uint32_t
97 incWeightTrail(uint32_t weight
, int32_t length
) {
98 return (uint32_t)(weight
+(1UL<<(8*(4-length
))));
101 static inline uint32_t
102 decWeightTrail(uint32_t weight
, int32_t length
) {
103 return (uint32_t)(weight
-(1UL<<(8*(4-length
))));
106 static inline uint32_t
107 incWeight(uint32_t weight
, int32_t length
, uint32_t maxByte
) {
111 byte
=getWeightByte(weight
, length
);
113 return setWeightByte(weight
, length
, byte
+1);
115 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
116 weight
=setWeightByte(weight
, length
, UCOL_BYTE_FIRST_TAILORED
);
122 static inline int32_t
123 lengthenRange(WeightRange
*range
, uint32_t maxByte
, uint32_t countBytes
) {
126 length
=range
->length2
+1;
127 range
->start
=setWeightTrail(range
->start
, length
, UCOL_BYTE_FIRST_TAILORED
);
128 range
->end
=setWeightTrail(range
->end
, length
, maxByte
);
129 range
->count2
*=countBytes
;
130 range
->length2
=length
;
134 /* for uprv_sortArray: sort ranges in weight order */
135 static int32_t U_CALLCONV
136 compareRanges(const void * /*context*/, const void *left
, const void *right
) {
139 l
=((const WeightRange
*)left
)->start
;
140 r
=((const WeightRange
*)right
)->start
;
151 * take two CE weights and calculate the
152 * possible ranges of weights between the two limits, excluding them
153 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
155 static inline int32_t
156 getWeightRanges(uint32_t lowerLimit
, uint32_t upperLimit
,
157 uint32_t maxByte
, uint32_t countBytes
,
158 WeightRange ranges
[7]) {
159 WeightRange lower
[5], middle
, upper
[5]; /* [0] and [1] are not used - this simplifies indexing */
160 uint32_t weight
, trail
;
161 int32_t length
, lowerLength
, upperLength
, rangeCount
;
163 /* assume that both lowerLimit & upperLimit are not 0 */
165 /* get the lengths of the limits */
166 lowerLength
=lengthOfWeight(lowerLimit
);
167 upperLength
=lengthOfWeight(upperLimit
);
170 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit
, lowerLength
);
171 printf("length of upper limit 0x%08lx is %ld\n", upperLimit
, upperLength
);
174 if(lowerLimit
>=upperLimit
) {
176 printf("error: no space between lower & upper limits\n");
181 /* check that neither is a prefix of the other */
182 if(lowerLength
<upperLength
) {
183 if(lowerLimit
==truncateWeight(upperLimit
, lowerLength
)) {
185 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit
, upperLimit
);
190 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
192 /* reset local variables */
193 uprv_memset(lower
, 0, sizeof(lower
));
194 uprv_memset(&middle
, 0, sizeof(middle
));
195 uprv_memset(upper
, 0, sizeof(upper
));
198 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
199 * range minimum length
208 * We are now going to calculate up to 7 ranges.
209 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
212 for(length
=lowerLength
; length
>=2; --length
) {
213 trail
=getWeightTrail(weight
, length
);
215 lower
[length
].start
=incWeightTrail(weight
, length
);
216 lower
[length
].end
=setWeightTrail(weight
, length
, maxByte
);
217 lower
[length
].length
=length
;
218 lower
[length
].count
=maxByte
-trail
;
220 weight
=truncateWeight(weight
, length
-1);
222 middle
.start
=incWeightTrail(weight
, 1);
225 for(length
=upperLength
; length
>=2; --length
) {
226 trail
=getWeightTrail(weight
, length
);
227 if(trail
>UCOL_BYTE_FIRST_TAILORED
) {
228 upper
[length
].start
=setWeightTrail(weight
, length
, UCOL_BYTE_FIRST_TAILORED
);
229 upper
[length
].end
=decWeightTrail(weight
, length
);
230 upper
[length
].length
=length
;
231 upper
[length
].count
=trail
-UCOL_BYTE_FIRST_TAILORED
;
233 weight
=truncateWeight(weight
, length
-1);
235 middle
.end
=decWeightTrail(weight
, 1);
237 /* set the middle range */
239 if(middle
.end
>=middle
.start
) {
240 middle
.count
=(int32_t)((middle
.end
-middle
.start
)>>24)+1;
242 /* eliminate overlaps */
245 /* remove the middle range */
248 /* reduce or remove the lower ranges that go beyond upperLimit */
249 for(length
=4; length
>=2; --length
) {
250 if(lower
[length
].count
>0 && upper
[length
].count
>0) {
251 start
=upper
[length
].start
;
252 end
=lower
[length
].end
;
254 if(end
>=start
|| incWeight(end
, length
, maxByte
)==start
) {
255 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
256 start
=lower
[length
].start
;
257 end
=lower
[length
].end
=upper
[length
].end
;
259 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
260 * it may result in a range with count>countBytes
263 (int32_t)(getWeightTrail(end
, length
)-getWeightTrail(start
, length
)+1+
264 countBytes
*(getWeightByte(end
, length
-1)-getWeightByte(start
, length
-1)));
265 upper
[length
].count
=0;
267 lower
[length
].count
=upper
[length
].count
=0;
277 for(length
=4; length
>=2; --length
) {
278 if(lower
[length
].count
>0) {
279 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, lower
[length
].start
, lower
[length
].end
, lower
[length
].count
);
283 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle
.start
, middle
.end
, middle
.count
);
285 for(length
=2; length
<=4; ++length
) {
286 if(upper
[length
].count
>0) {
287 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, upper
[length
].start
, upper
[length
].end
, upper
[length
].count
);
292 /* copy the ranges, shortest first, into the result array */
295 uprv_memcpy(ranges
, &middle
, sizeof(WeightRange
));
298 for(length
=2; length
<=4; ++length
) {
299 /* copy upper first so that later the middle range is more likely the first one to use */
300 if(upper
[length
].count
>0) {
301 uprv_memcpy(ranges
+rangeCount
, upper
+length
, sizeof(WeightRange
));
304 if(lower
[length
].count
>0) {
305 uprv_memcpy(ranges
+rangeCount
, lower
+length
, sizeof(WeightRange
));
313 * call getWeightRanges and then determine heuristically
314 * which ranges to use for a given number of weights between (excluding)
318 ucol_allocWeights(uint32_t lowerLimit
, uint32_t upperLimit
,
321 WeightRange ranges
[7]) {
322 /* number of usable byte values 3..maxByte */
323 uint32_t countBytes
=maxByte
-UCOL_BYTE_FIRST_TAILORED
+1;
325 uint32_t lengthCounts
[6]; /* [0] unused, [5] to make index checks unnecessary */
327 int32_t i
, rangeCount
, minLength
/*, maxLength*/;
329 /* countBytes to the power of index */
331 /* gcc requires explicit initialization */
333 powers
[1] = countBytes
;
334 powers
[2] = countBytes
*countBytes
;
335 powers
[3] = countBytes
*countBytes
*countBytes
;
336 powers
[4] = countBytes
*countBytes
*countBytes
*countBytes
;
342 rangeCount
=getWeightRanges(lowerLimit
, upperLimit
, maxByte
, countBytes
, ranges
);
345 printf("error: unable to get Weight ranges\n");
350 /* what is the maximum number of weights with these ranges? */
352 for(i
=0; i
<rangeCount
; ++i
) {
353 maxCount
+=(uint32_t)ranges
[i
].count
*powers
[4-ranges
[i
].length
];
357 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount
, n
);
361 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount
, n
);
366 /* set the length2 and count2 fields */
367 for(i
=0; i
<rangeCount
; ++i
) {
368 ranges
[i
].length2
=ranges
[i
].length
;
369 ranges
[i
].count2
=(uint32_t)ranges
[i
].count
;
372 /* try until we find suitably large ranges */
374 /* get the smallest number of bytes in a range */
375 minLength
=ranges
[0].length2
;
377 /* sum up the number of elements that fit into ranges of each byte length */
378 uprv_memset(lengthCounts
, 0, sizeof(lengthCounts
));
379 for(i
=0; i
<rangeCount
; ++i
) {
380 lengthCounts
[ranges
[i
].length2
]+=ranges
[i
].count2
;
383 /* now try to allocate n elements in the available short ranges */
384 if(n
<=(lengthCounts
[minLength
]+lengthCounts
[minLength
+1])) {
385 /* trivial cases, use the first few ranges */
389 maxCount
+=ranges
[rangeCount
].count2
;
393 printf("take first %ld ranges\n", rangeCount
);
396 } else if(n
<=ranges
[0].count2
*countBytes
) {
397 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
398 uint32_t count1
, count2
, power_1
, power
;
400 /*maxLength=minLength+1;*/
402 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
403 power_1
=powers
[minLength
-ranges
[0].length
];
404 power
=power_1
*countBytes
;
405 count2
=(n
+power
-1)/power
;
406 count1
=ranges
[0].count
-count2
;
408 /* split the range */
410 printf("split the first range %ld:%ld\n", count1
, count2
);
415 /* lengthen the entire range to maxLength */
416 lengthenRange(ranges
, maxByte
, countBytes
);
418 /* really split the range */
421 /* create a new range with the end and initial and current length of the old one */
423 ranges
[1].end
=ranges
[0].end
;
424 ranges
[1].length
=ranges
[0].length
;
425 ranges
[1].length2
=minLength
;
427 /* set the end of the first range according to count1 */
429 byte
=getWeightByte(ranges
[0].start
, i
)+count1
-1;
432 * ranges[0].count and count1 may be >countBytes
433 * from merging adjacent ranges;
434 * byte>maxByte is possible
437 ranges
[0].end
=setWeightByte(ranges
[0].start
, i
, byte
);
438 } else /* byte>maxByte */ {
439 ranges
[0].end
=setWeightByte(incWeight(ranges
[0].start
, i
-1, maxByte
), i
, byte
-countBytes
);
442 /* set the bytes in the end weight at length+1..length2 to maxByte */
443 byte
=(maxByte
<<24)|(maxByte
<<16)|(maxByte
<<8)|maxByte
; /* this used to be 0xffffffff */
444 ranges
[0].end
=truncateWeight(ranges
[0].end
, i
)|
445 ((byte
>>(8*i
))&(byte
<<(8*(4-minLength
))));
447 /* set the start of the second range to immediately follow the end of the first one */
448 ranges
[1].start
=incWeight(ranges
[0].end
, minLength
, maxByte
);
450 /* set the count values (informational) */
451 ranges
[0].count
=count1
;
452 ranges
[1].count
=count2
;
454 ranges
[0].count2
=count1
*power_1
;
455 ranges
[1].count2
=count2
*power_1
; /* will be *countBytes when lengthened */
457 /* lengthen the second range to maxLength */
458 lengthenRange(ranges
+1, maxByte
, countBytes
);
463 /* no good match, lengthen all minLength ranges and iterate */
465 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength
, minLength
+1);
467 for(i
=0; ranges
[i
].length2
==minLength
; ++i
) {
468 lengthenRange(ranges
+i
, maxByte
, countBytes
);
473 /* sort the ranges by weight values */
474 UErrorCode errorCode
=U_ZERO_ERROR
;
475 uprv_sortArray(ranges
, rangeCount
, sizeof(WeightRange
), compareRanges
, NULL
, FALSE
, &errorCode
);
476 /* ignore error code: we know that the internal sort function will not fail here */
480 puts("final ranges:");
481 for(i
=0; i
<rangeCount
; ++i
) {
482 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
483 i
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].length
, ranges
[i
].length2
, ranges
[i
].count
, ranges
[i
].count2
);
487 /* set maxByte in ranges[0] for ucol_nextWeight() */
488 ranges
[0].count
=maxByte
;
494 * given a set of ranges calculated by ucol_allocWeights(),
495 * iterate through the weights
498 ucol_nextWeight(WeightRange ranges
[], int32_t *pRangeCount
) {
499 if(*pRangeCount
<=0) {
502 uint32_t weight
, maxByte
;
504 /* get maxByte from the .count field */
505 maxByte
=ranges
[0].count
;
507 /* get the next weight */
508 weight
=ranges
[0].start
;
509 if(weight
==ranges
[0].end
) {
510 /* this range is finished, remove it and move the following ones up */
511 if(--*pRangeCount
>0) {
512 uprv_memmove(ranges
, ranges
+1, *pRangeCount
*sizeof(WeightRange
));
513 ranges
[0].count
=maxByte
; /* keep maxByte in ranges[0] */
516 /* increment the weight for the next value */
517 ranges
[0].start
=incWeight(weight
, ranges
[0].length2
, maxByte
);
524 #if 0 // #ifdef UCOL_DEBUG
527 testAlloc(uint32_t lowerLimit
, uint32_t upperLimit
, uint32_t n
, UBool enumerate
) {
528 WeightRange ranges
[8];
531 rangeCount
=ucol_allocWeights(lowerLimit
, upperLimit
, n
, ranges
);
536 weight
=ucol_nextWeight(ranges
, &rangeCount
);
537 if(weight
==0xffffffff) {
538 printf("error: 0xffffffff with %lu more weights to go\n", n
);
541 printf(" 0x%08lx\n", weight
);
548 main(int argc
, const char *argv
[]) {
551 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE
);
552 testAlloc(0x36421500, 0x44b87d23, 5, FALSE
);
553 testAlloc(0x36421500, 0x44b87d23, 20, FALSE
);
554 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE
);
555 testAlloc(0x36421500, 0x38b87d23, 1, FALSE
);
556 testAlloc(0x36421500, 0x38b87d23, 20, FALSE
);
557 testAlloc(0x36421500, 0x38b87d23, 200, TRUE
);
558 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE
);
559 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE
);
560 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE
);
561 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE
);
562 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE
);
563 testAlloc(0x49000000, 0x4a600000, 13700, FALSE
);
564 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE
);
565 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE
);
566 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE
);
567 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE
);
568 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE
);
569 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE
);
577 #endif /* #if !UCONFIG_NO_COLLATION */