2 *******************************************************************************
4 * Copyright (C) 1999-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: collationweights.cpp
10 * tab size: 8 (not used)
13 * created on: 2001mar08 as ucol_wgt.cpp
14 * created by: Markus W. Scherer
16 * This file contains code for allocating n collation element weights
17 * between two exclusive limits.
18 * It is used only internally by the collation tailoring builder.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
26 #include "collation.h"
27 #include "collationweights.h"
37 /* collation element weight allocation -------------------------------------- */
39 /* helper functions for CE weights */
41 static inline uint32_t
42 getWeightTrail(uint32_t weight
, int32_t length
) {
43 return (uint32_t)(weight
>>(8*(4-length
)))&0xff;
46 static inline uint32_t
47 setWeightTrail(uint32_t weight
, int32_t length
, uint32_t trail
) {
49 return (uint32_t)((weight
&(0xffffff00<<length
))|(trail
<<length
));
52 static inline uint32_t
53 getWeightByte(uint32_t weight
, int32_t idx
) {
54 return getWeightTrail(weight
, idx
); /* same calculation */
57 static inline uint32_t
58 setWeightByte(uint32_t weight
, int32_t idx
, uint32_t byte
) {
59 uint32_t mask
; /* 0xffffffff except a 00 "hole" for the index-th byte */
63 mask
=((uint32_t)0xffffffff)>>idx
;
65 // Do not use uint32_t>>32 because on some platforms that does not shift at all
66 // while we need it to become 0.
67 // PowerPC: 0xffffffff>>32 = 0 (wanted)
68 // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
70 // ANSI C99 6.5.7 Bitwise shift operators:
71 // "If the value of the right operand is negative
72 // or is greater than or equal to the width of the promoted left operand,
73 // the behavior is undefined."
77 mask
|=0xffffff00<<idx
;
78 return (uint32_t)((weight
&mask
)|(byte
<<idx
));
81 static inline uint32_t
82 truncateWeight(uint32_t weight
, int32_t length
) {
83 return (uint32_t)(weight
&(0xffffffff<<(8*(4-length
))));
86 static inline uint32_t
87 incWeightTrail(uint32_t weight
, int32_t length
) {
88 return (uint32_t)(weight
+(1UL<<(8*(4-length
))));
91 static inline uint32_t
92 decWeightTrail(uint32_t weight
, int32_t length
) {
93 return (uint32_t)(weight
-(1UL<<(8*(4-length
))));
96 CollationWeights::CollationWeights()
97 : middleLength(0), rangeIndex(0), rangeCount(0) {
98 for(int32_t i
= 0; i
< 5; ++i
) {
99 minBytes
[i
] = maxBytes
[i
] = 0;
104 CollationWeights::initForPrimary(UBool compressible
) {
106 minBytes
[1] = Collation::MERGE_SEPARATOR_BYTE
+ 1;
107 maxBytes
[1] = Collation::TRAIL_WEIGHT_BYTE
;
109 minBytes
[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE
+ 1;
110 maxBytes
[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE
- 1;
122 CollationWeights::initForSecondary() {
123 // We use only the lower 16 bits for secondary weights.
129 minBytes
[3] = Collation::MERGE_SEPARATOR_BYTE
+ 1;
136 CollationWeights::initForTertiary() {
137 // We use only the lower 16 bits for tertiary weights.
143 // We use only 6 bits per byte.
144 // The other bits are used for case & quaternary weights.
145 minBytes
[3] = Collation::MERGE_SEPARATOR_BYTE
+ 1;
152 CollationWeights::incWeight(uint32_t weight
, int32_t length
) const {
154 uint32_t byte
=getWeightByte(weight
, length
);
155 if(byte
<maxBytes
[length
]) {
156 return setWeightByte(weight
, length
, byte
+1);
158 // Roll over, set this byte to the minimum and increment the previous one.
159 weight
=setWeightByte(weight
, length
, minBytes
[length
]);
161 U_ASSERT(length
> 0);
167 CollationWeights::incWeightByOffset(uint32_t weight
, int32_t length
, int32_t offset
) const {
169 offset
+= getWeightByte(weight
, length
);
170 if((uint32_t)offset
<= maxBytes
[length
]) {
171 return setWeightByte(weight
, length
, offset
);
173 // Split the offset between this byte and the previous one.
174 offset
-= minBytes
[length
];
175 weight
= setWeightByte(weight
, length
, minBytes
[length
] + offset
% countBytes(length
));
176 offset
/= countBytes(length
);
178 U_ASSERT(length
> 0);
184 CollationWeights::lengthenRange(WeightRange
&range
) const {
185 int32_t length
=range
.length
+1;
186 range
.start
=setWeightTrail(range
.start
, length
, minBytes
[length
]);
187 range
.end
=setWeightTrail(range
.end
, length
, maxBytes
[length
]);
188 range
.count
*=countBytes(length
);
192 /* for uprv_sortArray: sort ranges in weight order */
193 static int32_t U_CALLCONV
194 compareRanges(const void * /*context*/, const void *left
, const void *right
) {
197 l
=((const CollationWeights::WeightRange
*)left
)->start
;
198 r
=((const CollationWeights::WeightRange
*)right
)->start
;
209 CollationWeights::getWeightRanges(uint32_t lowerLimit
, uint32_t upperLimit
) {
210 U_ASSERT(lowerLimit
!= 0);
211 U_ASSERT(upperLimit
!= 0);
213 /* get the lengths of the limits */
214 int32_t lowerLength
=lengthOfWeight(lowerLimit
);
215 int32_t upperLength
=lengthOfWeight(upperLimit
);
218 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit
, lowerLength
);
219 printf("length of upper limit 0x%08lx is %ld\n", upperLimit
, upperLength
);
221 U_ASSERT(lowerLength
>=middleLength
);
222 // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
224 if(lowerLimit
>=upperLimit
) {
226 printf("error: no space between lower & upper limits\n");
231 /* check that neither is a prefix of the other */
232 if(lowerLength
<upperLength
) {
233 if(lowerLimit
==truncateWeight(upperLimit
, lowerLength
)) {
235 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit
, upperLimit
);
240 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
242 WeightRange lower
[5], middle
, upper
[5]; /* [0] and [1] are not used - this simplifies indexing */
243 uprv_memset(lower
, 0, sizeof(lower
));
244 uprv_memset(&middle
, 0, sizeof(middle
));
245 uprv_memset(upper
, 0, sizeof(upper
));
248 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
249 * range minimum length
258 * We are now going to calculate up to 7 ranges.
259 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
261 uint32_t weight
=lowerLimit
;
262 for(int32_t length
=lowerLength
; length
>middleLength
; --length
) {
263 uint32_t trail
=getWeightTrail(weight
, length
);
264 if(trail
<maxBytes
[length
]) {
265 lower
[length
].start
=incWeightTrail(weight
, length
);
266 lower
[length
].end
=setWeightTrail(weight
, length
, maxBytes
[length
]);
267 lower
[length
].length
=length
;
268 lower
[length
].count
=maxBytes
[length
]-trail
;
270 weight
=truncateWeight(weight
, length
-1);
272 if(weight
<0xff000000) {
273 middle
.start
=incWeightTrail(weight
, middleLength
);
275 // Prevent overflow for primary lead byte FF
276 // which would yield a middle range starting at 0.
277 middle
.start
=0xffffffff; // no middle range
281 for(int32_t length
=upperLength
; length
>middleLength
; --length
) {
282 uint32_t trail
=getWeightTrail(weight
, length
);
283 if(trail
>minBytes
[length
]) {
284 upper
[length
].start
=setWeightTrail(weight
, length
, minBytes
[length
]);
285 upper
[length
].end
=decWeightTrail(weight
, length
);
286 upper
[length
].length
=length
;
287 upper
[length
].count
=trail
-minBytes
[length
];
289 weight
=truncateWeight(weight
, length
-1);
291 middle
.end
=decWeightTrail(weight
, middleLength
);
293 /* set the middle range */
294 middle
.length
=middleLength
;
295 if(middle
.end
>=middle
.start
) {
296 middle
.count
=(int32_t)((middle
.end
-middle
.start
)>>(8*(4-middleLength
)))+1;
298 /* no middle range, eliminate overlaps */
300 /* reduce or remove the lower ranges that go beyond upperLimit */
301 for(int32_t length
=4; length
>middleLength
; --length
) {
302 if(lower
[length
].count
>0 && upper
[length
].count
>0) {
303 uint32_t start
=upper
[length
].start
;
304 uint32_t end
=lower
[length
].end
;
306 if(end
>=start
|| incWeight(end
, length
)==start
) {
307 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
308 start
=lower
[length
].start
;
309 end
=lower
[length
].end
=upper
[length
].end
;
311 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
312 * it may result in a range with count>countBytes
315 (int32_t)(getWeightTrail(end
, length
)-getWeightTrail(start
, length
)+1+
316 countBytes(length
)*(getWeightByte(end
, length
-1)-getWeightByte(start
, length
-1)));
317 upper
[length
].count
=0;
318 while(--length
>middleLength
) {
319 lower
[length
].count
=upper
[length
].count
=0;
329 for(int32_t length
=4; length
>=2; --length
) {
330 if(lower
[length
].count
>0) {
331 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, lower
[length
].start
, lower
[length
].end
, lower
[length
].count
);
335 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle
.start
, middle
.end
, middle
.count
);
337 for(int32_t length
=2; length
<=4; ++length
) {
338 if(upper
[length
].count
>0) {
339 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, upper
[length
].start
, upper
[length
].end
, upper
[length
].count
);
344 /* copy the ranges, shortest first, into the result array */
347 uprv_memcpy(ranges
, &middle
, sizeof(WeightRange
));
350 for(int32_t length
=middleLength
+1; length
<=4; ++length
) {
351 /* copy upper first so that later the middle range is more likely the first one to use */
352 if(upper
[length
].count
>0) {
353 uprv_memcpy(ranges
+rangeCount
, upper
+length
, sizeof(WeightRange
));
356 if(lower
[length
].count
>0) {
357 uprv_memcpy(ranges
+rangeCount
, lower
+length
, sizeof(WeightRange
));
365 CollationWeights::allocWeightsInShortRanges(int32_t n
, int32_t minLength
) {
366 // See if the first few minLength and minLength+1 ranges have enough weights.
367 for(int32_t i
= 0; i
< rangeCount
&& ranges
[i
].length
<= (minLength
+ 1); ++i
) {
368 if(n
<= ranges
[i
].count
) {
369 // Use the first few minLength and minLength+1 ranges.
370 if(ranges
[i
].length
> minLength
) {
371 // Reduce the number of weights from the last minLength+1 range
372 // which might sort before some minLength ranges,
373 // so that we use all weights in the minLength ranges.
378 printf("take first %ld ranges\n", rangeCount
);
382 /* sort the ranges by weight values */
383 UErrorCode errorCode
=U_ZERO_ERROR
;
384 uprv_sortArray(ranges
, rangeCount
, sizeof(WeightRange
),
385 compareRanges
, NULL
, FALSE
, &errorCode
);
386 /* ignore error code: we know that the internal sort function will not fail here */
390 n
-= ranges
[i
].count
; // still >0
396 CollationWeights::allocWeightsInMinLengthRanges(int32_t n
, int32_t minLength
) {
397 // See if the minLength ranges have enough weights
398 // when we split one and lengthen the following ones.
400 int32_t minLengthRangeCount
;
401 for(minLengthRangeCount
= 0;
402 minLengthRangeCount
< rangeCount
&&
403 ranges
[minLengthRangeCount
].length
== minLength
;
404 ++minLengthRangeCount
) {
405 count
+= ranges
[minLengthRangeCount
].count
;
408 int32_t nextCountBytes
= countBytes(minLength
+ 1);
409 if(n
> count
* nextCountBytes
) { return FALSE
; }
411 // Use the minLength ranges. Merge them, and then split again as necessary.
412 uint32_t start
= ranges
[0].start
;
413 uint32_t end
= ranges
[0].end
;
414 for(int32_t i
= 1; i
< minLengthRangeCount
; ++i
) {
415 if(ranges
[i
].start
< start
) { start
= ranges
[i
].start
; }
416 if(ranges
[i
].end
> end
) { end
= ranges
[i
].end
; }
419 // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
421 // count1 + count2 * nextCountBytes = n
422 // count1 + count2 = count
424 // (count - count2) + count2 * nextCountBytes = n
425 // and then into the following count1 & count2 computations.
426 int32_t count2
= (n
- count
) / (nextCountBytes
- 1); // number of weights to be lengthened
427 int32_t count1
= count
- count2
; // number of minLength weights
428 if(count2
== 0 || (count1
+ count2
* nextCountBytes
) < n
) {
432 U_ASSERT((count1
+ count2
* nextCountBytes
) >= n
);
435 ranges
[0].start
= start
;
438 // Make one long range.
440 ranges
[0].count
= count
;
441 lengthenRange(ranges
[0]);
444 // Split the range, lengthen the second part.
446 printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
447 splitRange
, rangeCount
, count1
, count2
);
450 // Next start = start + count1. First end = 1 before that.
451 ranges
[0].end
= incWeightByOffset(start
, minLength
, count1
- 1);
452 ranges
[0].count
= count1
;
454 ranges
[1].start
= incWeight(ranges
[0].end
, minLength
);
456 ranges
[1].length
= minLength
; // +1 when lengthened
457 ranges
[1].count
= count2
; // *countBytes when lengthened
458 lengthenRange(ranges
[1]);
465 * call getWeightRanges and then determine heuristically
466 * which ranges to use for a given number of weights between (excluding)
470 CollationWeights::allocWeights(uint32_t lowerLimit
, uint32_t upperLimit
, int32_t n
) {
475 if(!getWeightRanges(lowerLimit
, upperLimit
)) {
477 printf("error: unable to get Weight ranges\n");
482 /* try until we find suitably large ranges */
484 /* get the smallest number of bytes in a range */
485 int32_t minLength
=ranges
[0].length
;
487 if(allocWeightsInShortRanges(n
, minLength
)) { break; }
491 printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
497 if(allocWeightsInMinLengthRanges(n
, minLength
)) { break; }
499 /* no good match, lengthen all minLength ranges and iterate */
501 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength
, minLength
+1);
503 for(int32_t i
=0; ranges
[i
].length
==minLength
; ++i
) {
504 lengthenRange(ranges
[i
]);
509 puts("final ranges:");
510 for(int32_t i
=0; i
<rangeCount
; ++i
) {
511 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
512 i
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].length
, ranges
[i
].count
);
521 CollationWeights::nextWeight() {
522 if(rangeIndex
>= rangeCount
) {
525 /* get the next weight */
526 WeightRange
&range
= ranges
[rangeIndex
];
527 uint32_t weight
= range
.start
;
528 if(--range
.count
== 0) {
529 /* this range is finished */
532 /* increment the weight for the next value */
533 range
.start
= incWeight(weight
, range
.length
);
534 U_ASSERT(range
.start
<= range
.end
);
543 #endif /* #if !UCONFIG_NO_COLLATION */