1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1999-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: collationweights.cpp
12 * tab size: 8 (not used)
15 * created on: 2001mar08 as ucol_wgt.cpp
16 * created by: Markus W. Scherer
18 * This file contains code for allocating n collation element weights
19 * between two exclusive limits.
20 * It is used only internally by the collation tailoring builder.
23 #include "unicode/utypes.h"
25 #if !UCONFIG_NO_COLLATION
28 #include "collation.h"
29 #include "collationweights.h"
39 /* collation element weight allocation -------------------------------------- */
41 /* helper functions for CE weights */
43 static inline uint32_t
44 getWeightTrail(uint32_t weight
, int32_t length
) {
45 return (uint32_t)(weight
>>(8*(4-length
)))&0xff;
48 static inline uint32_t
49 setWeightTrail(uint32_t weight
, int32_t length
, uint32_t trail
) {
51 return (uint32_t)((weight
&(0xffffff00<<length
))|(trail
<<length
));
54 static inline uint32_t
55 getWeightByte(uint32_t weight
, int32_t idx
) {
56 return getWeightTrail(weight
, idx
); /* same calculation */
59 static inline uint32_t
60 setWeightByte(uint32_t weight
, int32_t idx
, uint32_t byte
) {
61 uint32_t mask
; /* 0xffffffff except a 00 "hole" for the index-th byte */
65 mask
=((uint32_t)0xffffffff)>>idx
;
67 // Do not use uint32_t>>32 because on some platforms that does not shift at all
68 // while we need it to become 0.
69 // PowerPC: 0xffffffff>>32 = 0 (wanted)
70 // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
72 // ANSI C99 6.5.7 Bitwise shift operators:
73 // "If the value of the right operand is negative
74 // or is greater than or equal to the width of the promoted left operand,
75 // the behavior is undefined."
79 mask
|=0xffffff00<<idx
;
80 return (uint32_t)((weight
&mask
)|(byte
<<idx
));
83 static inline uint32_t
84 truncateWeight(uint32_t weight
, int32_t length
) {
85 return (uint32_t)(weight
&(0xffffffff<<(8*(4-length
))));
88 static inline uint32_t
89 incWeightTrail(uint32_t weight
, int32_t length
) {
90 return (uint32_t)(weight
+(1UL<<(8*(4-length
))));
93 static inline uint32_t
94 decWeightTrail(uint32_t weight
, int32_t length
) {
95 return (uint32_t)(weight
-(1UL<<(8*(4-length
))));
98 CollationWeights::CollationWeights()
99 : middleLength(0), rangeIndex(0), rangeCount(0) {
100 for(int32_t i
= 0; i
< 5; ++i
) {
101 minBytes
[i
] = maxBytes
[i
] = 0;
106 CollationWeights::initForPrimary(UBool compressible
) {
108 minBytes
[1] = Collation::MERGE_SEPARATOR_BYTE
+ 1;
109 maxBytes
[1] = Collation::TRAIL_WEIGHT_BYTE
;
111 minBytes
[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE
+ 1;
112 maxBytes
[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE
- 1;
124 CollationWeights::initForSecondary() {
125 // We use only the lower 16 bits for secondary weights.
131 minBytes
[3] = Collation::LEVEL_SEPARATOR_BYTE
+ 1;
138 CollationWeights::initForTertiary() {
139 // We use only the lower 16 bits for tertiary weights.
145 // We use only 6 bits per byte.
146 // The other bits are used for case & quaternary weights.
147 minBytes
[3] = Collation::LEVEL_SEPARATOR_BYTE
+ 1;
154 CollationWeights::incWeight(uint32_t weight
, int32_t length
) const {
156 uint32_t byte
=getWeightByte(weight
, length
);
157 if(byte
<maxBytes
[length
]) {
158 return setWeightByte(weight
, length
, byte
+1);
160 // Roll over, set this byte to the minimum and increment the previous one.
161 weight
=setWeightByte(weight
, length
, minBytes
[length
]);
163 U_ASSERT(length
> 0);
169 CollationWeights::incWeightByOffset(uint32_t weight
, int32_t length
, int32_t offset
) const {
171 offset
+= getWeightByte(weight
, length
);
172 if((uint32_t)offset
<= maxBytes
[length
]) {
173 return setWeightByte(weight
, length
, offset
);
175 // Split the offset between this byte and the previous one.
176 offset
-= minBytes
[length
];
177 weight
= setWeightByte(weight
, length
, minBytes
[length
] + offset
% countBytes(length
));
178 offset
/= countBytes(length
);
180 U_ASSERT(length
> 0);
186 CollationWeights::lengthenRange(WeightRange
&range
) const {
187 int32_t length
=range
.length
+1;
188 range
.start
=setWeightTrail(range
.start
, length
, minBytes
[length
]);
189 range
.end
=setWeightTrail(range
.end
, length
, maxBytes
[length
]);
190 range
.count
*=countBytes(length
);
194 /* for uprv_sortArray: sort ranges in weight order */
195 static int32_t U_CALLCONV
196 compareRanges(const void * /*context*/, const void *left
, const void *right
) {
199 l
=((const CollationWeights::WeightRange
*)left
)->start
;
200 r
=((const CollationWeights::WeightRange
*)right
)->start
;
211 CollationWeights::getWeightRanges(uint32_t lowerLimit
, uint32_t upperLimit
) {
212 U_ASSERT(lowerLimit
!= 0);
213 U_ASSERT(upperLimit
!= 0);
215 /* get the lengths of the limits */
216 int32_t lowerLength
=lengthOfWeight(lowerLimit
);
217 int32_t upperLength
=lengthOfWeight(upperLimit
);
220 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit
, lowerLength
);
221 printf("length of upper limit 0x%08lx is %ld\n", upperLimit
, upperLength
);
223 U_ASSERT(lowerLength
>=middleLength
);
224 // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
226 if(lowerLimit
>=upperLimit
) {
228 printf("error: no space between lower & upper limits\n");
233 /* check that neither is a prefix of the other */
234 if(lowerLength
<upperLength
) {
235 if(lowerLimit
==truncateWeight(upperLimit
, lowerLength
)) {
237 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit
, upperLimit
);
242 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
244 WeightRange lower
[5], middle
, upper
[5]; /* [0] and [1] are not used - this simplifies indexing */
245 uprv_memset(lower
, 0, sizeof(lower
));
246 uprv_memset(&middle
, 0, sizeof(middle
));
247 uprv_memset(upper
, 0, sizeof(upper
));
250 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
251 * range minimum length
260 * We are now going to calculate up to 7 ranges.
261 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
263 uint32_t weight
=lowerLimit
;
264 for(int32_t length
=lowerLength
; length
>middleLength
; --length
) {
265 uint32_t trail
=getWeightTrail(weight
, length
);
266 if(trail
<maxBytes
[length
]) {
267 lower
[length
].start
=incWeightTrail(weight
, length
);
268 lower
[length
].end
=setWeightTrail(weight
, length
, maxBytes
[length
]);
269 lower
[length
].length
=length
;
270 lower
[length
].count
=maxBytes
[length
]-trail
;
272 weight
=truncateWeight(weight
, length
-1);
274 if(weight
<0xff000000) {
275 middle
.start
=incWeightTrail(weight
, middleLength
);
277 // Prevent overflow for primary lead byte FF
278 // which would yield a middle range starting at 0.
279 middle
.start
=0xffffffff; // no middle range
283 for(int32_t length
=upperLength
; length
>middleLength
; --length
) {
284 uint32_t trail
=getWeightTrail(weight
, length
);
285 if(trail
>minBytes
[length
]) {
286 upper
[length
].start
=setWeightTrail(weight
, length
, minBytes
[length
]);
287 upper
[length
].end
=decWeightTrail(weight
, length
);
288 upper
[length
].length
=length
;
289 upper
[length
].count
=trail
-minBytes
[length
];
291 weight
=truncateWeight(weight
, length
-1);
293 middle
.end
=decWeightTrail(weight
, middleLength
);
295 /* set the middle range */
296 middle
.length
=middleLength
;
297 if(middle
.end
>=middle
.start
) {
298 middle
.count
=(int32_t)((middle
.end
-middle
.start
)>>(8*(4-middleLength
)))+1;
300 /* no middle range, eliminate overlaps */
301 for(int32_t length
=4; length
>middleLength
; --length
) {
302 if(lower
[length
].count
>0 && upper
[length
].count
>0) {
303 // Note: The lowerEnd and upperStart weights are versions of
304 // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
305 // truncated (still less-or-equal)
306 // and then with their last bytes changed to the
307 // maxByte (for lowerEnd) or minByte (for upperStart).
308 const uint32_t lowerEnd
=lower
[length
].end
;
309 const uint32_t upperStart
=upper
[length
].start
;
312 if(lowerEnd
>upperStart
) {
313 // These two lower and upper ranges collide.
314 // Since lowerLimit<upperLimit and lowerEnd and upperStart
315 // are versions with only their last bytes modified
316 // (and following ones removed/reset to 0),
317 // lowerEnd>upperStart is only possible
318 // if the leading bytes are equal
319 // and lastByte(lowerEnd)>lastByte(upperStart).
320 U_ASSERT(truncateWeight(lowerEnd
, length
-1)==
321 truncateWeight(upperStart
, length
-1));
322 // Intersect these two ranges.
323 lower
[length
].end
=upper
[length
].end
;
325 (int32_t)getWeightTrail(lower
[length
].end
, length
)-
326 (int32_t)getWeightTrail(lower
[length
].start
, length
)+1;
327 // count might be <=0 in which case there is no room,
328 // and the range-collecting code below will ignore this range.
330 } else if(lowerEnd
==upperStart
) {
331 // Not possible, unless minByte==maxByte which is not allowed.
332 U_ASSERT(minBytes
[length
]<maxBytes
[length
]);
333 } else /* lowerEnd<upperStart */ {
334 if(incWeight(lowerEnd
, length
)==upperStart
) {
335 // Merge adjacent ranges.
336 lower
[length
].end
=upper
[length
].end
;
337 lower
[length
].count
+=upper
[length
].count
; // might be >countBytes
342 // Remove all shorter ranges.
343 // There was no room available for them between the ranges we just merged.
344 upper
[length
].count
=0;
345 while(--length
>middleLength
) {
346 lower
[length
].count
=upper
[length
].count
=0;
356 for(int32_t length
=4; length
>=2; --length
) {
357 if(lower
[length
].count
>0) {
358 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, lower
[length
].start
, lower
[length
].end
, lower
[length
].count
);
362 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle
.start
, middle
.end
, middle
.count
);
364 for(int32_t length
=2; length
<=4; ++length
) {
365 if(upper
[length
].count
>0) {
366 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, upper
[length
].start
, upper
[length
].end
, upper
[length
].count
);
371 /* copy the ranges, shortest first, into the result array */
374 uprv_memcpy(ranges
, &middle
, sizeof(WeightRange
));
377 for(int32_t length
=middleLength
+1; length
<=4; ++length
) {
378 /* copy upper first so that later the middle range is more likely the first one to use */
379 if(upper
[length
].count
>0) {
380 uprv_memcpy(ranges
+rangeCount
, upper
+length
, sizeof(WeightRange
));
383 if(lower
[length
].count
>0) {
384 uprv_memcpy(ranges
+rangeCount
, lower
+length
, sizeof(WeightRange
));
392 CollationWeights::allocWeightsInShortRanges(int32_t n
, int32_t minLength
) {
393 // See if the first few minLength and minLength+1 ranges have enough weights.
394 for(int32_t i
= 0; i
< rangeCount
&& ranges
[i
].length
<= (minLength
+ 1); ++i
) {
395 if(n
<= ranges
[i
].count
) {
396 // Use the first few minLength and minLength+1 ranges.
397 if(ranges
[i
].length
> minLength
) {
398 // Reduce the number of weights from the last minLength+1 range
399 // which might sort before some minLength ranges,
400 // so that we use all weights in the minLength ranges.
405 printf("take first %ld ranges\n", rangeCount
);
409 /* sort the ranges by weight values */
410 UErrorCode errorCode
=U_ZERO_ERROR
;
411 uprv_sortArray(ranges
, rangeCount
, sizeof(WeightRange
),
412 compareRanges
, NULL
, FALSE
, &errorCode
);
413 /* ignore error code: we know that the internal sort function will not fail here */
417 n
-= ranges
[i
].count
; // still >0
423 CollationWeights::allocWeightsInMinLengthRanges(int32_t n
, int32_t minLength
) {
424 // See if the minLength ranges have enough weights
425 // when we split one and lengthen the following ones.
427 int32_t minLengthRangeCount
;
428 for(minLengthRangeCount
= 0;
429 minLengthRangeCount
< rangeCount
&&
430 ranges
[minLengthRangeCount
].length
== minLength
;
431 ++minLengthRangeCount
) {
432 count
+= ranges
[minLengthRangeCount
].count
;
435 int32_t nextCountBytes
= countBytes(minLength
+ 1);
436 if(n
> count
* nextCountBytes
) { return FALSE
; }
438 // Use the minLength ranges. Merge them, and then split again as necessary.
439 uint32_t start
= ranges
[0].start
;
440 uint32_t end
= ranges
[0].end
;
441 for(int32_t i
= 1; i
< minLengthRangeCount
; ++i
) {
442 if(ranges
[i
].start
< start
) { start
= ranges
[i
].start
; }
443 if(ranges
[i
].end
> end
) { end
= ranges
[i
].end
; }
446 // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
448 // count1 + count2 * nextCountBytes = n
449 // count1 + count2 = count
451 // (count - count2) + count2 * nextCountBytes = n
452 // and then into the following count1 & count2 computations.
453 int32_t count2
= (n
- count
) / (nextCountBytes
- 1); // number of weights to be lengthened
454 int32_t count1
= count
- count2
; // number of minLength weights
455 if(count2
== 0 || (count1
+ count2
* nextCountBytes
) < n
) {
459 U_ASSERT((count1
+ count2
* nextCountBytes
) >= n
);
462 ranges
[0].start
= start
;
465 // Make one long range.
467 ranges
[0].count
= count
;
468 lengthenRange(ranges
[0]);
471 // Split the range, lengthen the second part.
473 printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
474 splitRange
, rangeCount
, count1
, count2
);
477 // Next start = start + count1. First end = 1 before that.
478 ranges
[0].end
= incWeightByOffset(start
, minLength
, count1
- 1);
479 ranges
[0].count
= count1
;
481 ranges
[1].start
= incWeight(ranges
[0].end
, minLength
);
483 ranges
[1].length
= minLength
; // +1 when lengthened
484 ranges
[1].count
= count2
; // *countBytes when lengthened
485 lengthenRange(ranges
[1]);
492 * call getWeightRanges and then determine heuristically
493 * which ranges to use for a given number of weights between (excluding)
497 CollationWeights::allocWeights(uint32_t lowerLimit
, uint32_t upperLimit
, int32_t n
) {
502 if(!getWeightRanges(lowerLimit
, upperLimit
)) {
504 printf("error: unable to get Weight ranges\n");
509 /* try until we find suitably large ranges */
511 /* get the smallest number of bytes in a range */
512 int32_t minLength
=ranges
[0].length
;
514 if(allocWeightsInShortRanges(n
, minLength
)) { break; }
518 printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
524 if(allocWeightsInMinLengthRanges(n
, minLength
)) { break; }
526 /* no good match, lengthen all minLength ranges and iterate */
528 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength
, minLength
+1);
530 for(int32_t i
=0; i
<rangeCount
&& ranges
[i
].length
==minLength
; ++i
) {
531 lengthenRange(ranges
[i
]);
536 puts("final ranges:");
537 for(int32_t i
=0; i
<rangeCount
; ++i
) {
538 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
539 i
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].length
, ranges
[i
].count
);
548 CollationWeights::nextWeight() {
549 if(rangeIndex
>= rangeCount
) {
552 /* get the next weight */
553 WeightRange
&range
= ranges
[rangeIndex
];
554 uint32_t weight
= range
.start
;
555 if(--range
.count
== 0) {
556 /* this range is finished */
559 /* increment the weight for the next value */
560 range
.start
= incWeight(weight
, range
.length
);
561 U_ASSERT(range
.start
<= range
.end
);
570 #endif /* #if !UCONFIG_NO_COLLATION */