2 *******************************************************************************
4 * Copyright (C) 1999-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: collationweights.cpp
10 * tab size: 8 (not used)
13 * created on: 2001mar08 as ucol_wgt.cpp
14 * created by: Markus W. Scherer
16 * This file contains code for allocating n collation element weights
17 * between two exclusive limits.
18 * It is used only internally by the collation tailoring builder.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_COLLATION
26 #include "collation.h"
27 #include "collationweights.h"
37 /* collation element weight allocation -------------------------------------- */
39 /* helper functions for CE weights */
41 static inline uint32_t
42 getWeightTrail(uint32_t weight
, int32_t length
) {
43 return (uint32_t)(weight
>>(8*(4-length
)))&0xff;
46 static inline uint32_t
47 setWeightTrail(uint32_t weight
, int32_t length
, uint32_t trail
) {
49 return (uint32_t)((weight
&(0xffffff00<<length
))|(trail
<<length
));
52 static inline uint32_t
53 getWeightByte(uint32_t weight
, int32_t idx
) {
54 return getWeightTrail(weight
, idx
); /* same calculation */
57 static inline uint32_t
58 setWeightByte(uint32_t weight
, int32_t idx
, uint32_t byte
) {
59 uint32_t mask
; /* 0xffffffff except a 00 "hole" for the index-th byte */
63 mask
=((uint32_t)0xffffffff)>>idx
;
65 // Do not use uint32_t>>32 because on some platforms that does not shift at all
66 // while we need it to become 0.
67 // PowerPC: 0xffffffff>>32 = 0 (wanted)
68 // x86: 0xffffffff>>32 = 0xffffffff (not wanted)
70 // ANSI C99 6.5.7 Bitwise shift operators:
71 // "If the value of the right operand is negative
72 // or is greater than or equal to the width of the promoted left operand,
73 // the behavior is undefined."
77 mask
|=0xffffff00<<idx
;
78 return (uint32_t)((weight
&mask
)|(byte
<<idx
));
81 static inline uint32_t
82 truncateWeight(uint32_t weight
, int32_t length
) {
83 return (uint32_t)(weight
&(0xffffffff<<(8*(4-length
))));
86 static inline uint32_t
87 incWeightTrail(uint32_t weight
, int32_t length
) {
88 return (uint32_t)(weight
+(1UL<<(8*(4-length
))));
91 static inline uint32_t
92 decWeightTrail(uint32_t weight
, int32_t length
) {
93 return (uint32_t)(weight
-(1UL<<(8*(4-length
))));
96 CollationWeights::CollationWeights()
97 : middleLength(0), rangeIndex(0), rangeCount(0) {
98 for(int32_t i
= 0; i
< 5; ++i
) {
99 minBytes
[i
] = maxBytes
[i
] = 0;
104 CollationWeights::initForPrimary(UBool compressible
) {
106 minBytes
[1] = Collation::MERGE_SEPARATOR_BYTE
+ 1;
107 maxBytes
[1] = Collation::TRAIL_WEIGHT_BYTE
;
109 minBytes
[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE
+ 1;
110 maxBytes
[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE
- 1;
122 CollationWeights::initForSecondary() {
123 // We use only the lower 16 bits for secondary weights.
129 minBytes
[3] = Collation::LEVEL_SEPARATOR_BYTE
+ 1;
136 CollationWeights::initForTertiary() {
137 // We use only the lower 16 bits for tertiary weights.
143 // We use only 6 bits per byte.
144 // The other bits are used for case & quaternary weights.
145 minBytes
[3] = Collation::LEVEL_SEPARATOR_BYTE
+ 1;
152 CollationWeights::incWeight(uint32_t weight
, int32_t length
) const {
154 uint32_t byte
=getWeightByte(weight
, length
);
155 if(byte
<maxBytes
[length
]) {
156 return setWeightByte(weight
, length
, byte
+1);
158 // Roll over, set this byte to the minimum and increment the previous one.
159 weight
=setWeightByte(weight
, length
, minBytes
[length
]);
161 U_ASSERT(length
> 0);
167 CollationWeights::incWeightByOffset(uint32_t weight
, int32_t length
, int32_t offset
) const {
169 offset
+= getWeightByte(weight
, length
);
170 if((uint32_t)offset
<= maxBytes
[length
]) {
171 return setWeightByte(weight
, length
, offset
);
173 // Split the offset between this byte and the previous one.
174 offset
-= minBytes
[length
];
175 weight
= setWeightByte(weight
, length
, minBytes
[length
] + offset
% countBytes(length
));
176 offset
/= countBytes(length
);
178 U_ASSERT(length
> 0);
184 CollationWeights::lengthenRange(WeightRange
&range
) const {
185 int32_t length
=range
.length
+1;
186 range
.start
=setWeightTrail(range
.start
, length
, minBytes
[length
]);
187 range
.end
=setWeightTrail(range
.end
, length
, maxBytes
[length
]);
188 range
.count
*=countBytes(length
);
192 /* for uprv_sortArray: sort ranges in weight order */
193 static int32_t U_CALLCONV
194 compareRanges(const void * /*context*/, const void *left
, const void *right
) {
197 l
=((const CollationWeights::WeightRange
*)left
)->start
;
198 r
=((const CollationWeights::WeightRange
*)right
)->start
;
209 CollationWeights::getWeightRanges(uint32_t lowerLimit
, uint32_t upperLimit
) {
210 U_ASSERT(lowerLimit
!= 0);
211 U_ASSERT(upperLimit
!= 0);
213 /* get the lengths of the limits */
214 int32_t lowerLength
=lengthOfWeight(lowerLimit
);
215 int32_t upperLength
=lengthOfWeight(upperLimit
);
218 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit
, lowerLength
);
219 printf("length of upper limit 0x%08lx is %ld\n", upperLimit
, upperLength
);
221 U_ASSERT(lowerLength
>=middleLength
);
222 // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
224 if(lowerLimit
>=upperLimit
) {
226 printf("error: no space between lower & upper limits\n");
231 /* check that neither is a prefix of the other */
232 if(lowerLength
<upperLength
) {
233 if(lowerLimit
==truncateWeight(upperLimit
, lowerLength
)) {
235 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit
, upperLimit
);
240 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
242 WeightRange lower
[5], middle
, upper
[5]; /* [0] and [1] are not used - this simplifies indexing */
243 uprv_memset(lower
, 0, sizeof(lower
));
244 uprv_memset(&middle
, 0, sizeof(middle
));
245 uprv_memset(upper
, 0, sizeof(upper
));
248 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
249 * range minimum length
258 * We are now going to calculate up to 7 ranges.
259 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
261 uint32_t weight
=lowerLimit
;
262 for(int32_t length
=lowerLength
; length
>middleLength
; --length
) {
263 uint32_t trail
=getWeightTrail(weight
, length
);
264 if(trail
<maxBytes
[length
]) {
265 lower
[length
].start
=incWeightTrail(weight
, length
);
266 lower
[length
].end
=setWeightTrail(weight
, length
, maxBytes
[length
]);
267 lower
[length
].length
=length
;
268 lower
[length
].count
=maxBytes
[length
]-trail
;
270 weight
=truncateWeight(weight
, length
-1);
272 if(weight
<0xff000000) {
273 middle
.start
=incWeightTrail(weight
, middleLength
);
275 // Prevent overflow for primary lead byte FF
276 // which would yield a middle range starting at 0.
277 middle
.start
=0xffffffff; // no middle range
281 for(int32_t length
=upperLength
; length
>middleLength
; --length
) {
282 uint32_t trail
=getWeightTrail(weight
, length
);
283 if(trail
>minBytes
[length
]) {
284 upper
[length
].start
=setWeightTrail(weight
, length
, minBytes
[length
]);
285 upper
[length
].end
=decWeightTrail(weight
, length
);
286 upper
[length
].length
=length
;
287 upper
[length
].count
=trail
-minBytes
[length
];
289 weight
=truncateWeight(weight
, length
-1);
291 middle
.end
=decWeightTrail(weight
, middleLength
);
293 /* set the middle range */
294 middle
.length
=middleLength
;
295 if(middle
.end
>=middle
.start
) {
296 middle
.count
=(int32_t)((middle
.end
-middle
.start
)>>(8*(4-middleLength
)))+1;
298 /* no middle range, eliminate overlaps */
299 for(int32_t length
=4; length
>middleLength
; --length
) {
300 if(lower
[length
].count
>0 && upper
[length
].count
>0) {
301 // Note: The lowerEnd and upperStart weights are versions of
302 // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
303 // truncated (still less-or-equal)
304 // and then with their last bytes changed to the
305 // maxByte (for lowerEnd) or minByte (for upperStart).
306 const uint32_t lowerEnd
=lower
[length
].end
;
307 const uint32_t upperStart
=upper
[length
].start
;
310 if(lowerEnd
>upperStart
) {
311 // These two lower and upper ranges collide.
312 // Since lowerLimit<upperLimit and lowerEnd and upperStart
313 // are versions with only their last bytes modified
314 // (and following ones removed/reset to 0),
315 // lowerEnd>upperStart is only possible
316 // if the leading bytes are equal
317 // and lastByte(lowerEnd)>lastByte(upperStart).
318 U_ASSERT(truncateWeight(lowerEnd
, length
-1)==
319 truncateWeight(upperStart
, length
-1));
320 // Intersect these two ranges.
321 lower
[length
].end
=upper
[length
].end
;
323 (int32_t)getWeightTrail(lower
[length
].end
, length
)-
324 (int32_t)getWeightTrail(lower
[length
].start
, length
)+1;
325 // count might be <=0 in which case there is no room,
326 // and the range-collecting code below will ignore this range.
328 } else if(lowerEnd
==upperStart
) {
329 // Not possible, unless minByte==maxByte which is not allowed.
330 U_ASSERT(minBytes
[length
]<maxBytes
[length
]);
331 } else /* lowerEnd<upperStart */ {
332 if(incWeight(lowerEnd
, length
)==upperStart
) {
333 // Merge adjacent ranges.
334 lower
[length
].end
=upper
[length
].end
;
335 lower
[length
].count
+=upper
[length
].count
; // might be >countBytes
340 // Remove all shorter ranges.
341 // There was no room available for them between the ranges we just merged.
342 upper
[length
].count
=0;
343 while(--length
>middleLength
) {
344 lower
[length
].count
=upper
[length
].count
=0;
354 for(int32_t length
=4; length
>=2; --length
) {
355 if(lower
[length
].count
>0) {
356 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, lower
[length
].start
, lower
[length
].end
, lower
[length
].count
);
360 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle
.start
, middle
.end
, middle
.count
);
362 for(int32_t length
=2; length
<=4; ++length
) {
363 if(upper
[length
].count
>0) {
364 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length
, upper
[length
].start
, upper
[length
].end
, upper
[length
].count
);
369 /* copy the ranges, shortest first, into the result array */
372 uprv_memcpy(ranges
, &middle
, sizeof(WeightRange
));
375 for(int32_t length
=middleLength
+1; length
<=4; ++length
) {
376 /* copy upper first so that later the middle range is more likely the first one to use */
377 if(upper
[length
].count
>0) {
378 uprv_memcpy(ranges
+rangeCount
, upper
+length
, sizeof(WeightRange
));
381 if(lower
[length
].count
>0) {
382 uprv_memcpy(ranges
+rangeCount
, lower
+length
, sizeof(WeightRange
));
390 CollationWeights::allocWeightsInShortRanges(int32_t n
, int32_t minLength
) {
391 // See if the first few minLength and minLength+1 ranges have enough weights.
392 for(int32_t i
= 0; i
< rangeCount
&& ranges
[i
].length
<= (minLength
+ 1); ++i
) {
393 if(n
<= ranges
[i
].count
) {
394 // Use the first few minLength and minLength+1 ranges.
395 if(ranges
[i
].length
> minLength
) {
396 // Reduce the number of weights from the last minLength+1 range
397 // which might sort before some minLength ranges,
398 // so that we use all weights in the minLength ranges.
403 printf("take first %ld ranges\n", rangeCount
);
407 /* sort the ranges by weight values */
408 UErrorCode errorCode
=U_ZERO_ERROR
;
409 uprv_sortArray(ranges
, rangeCount
, sizeof(WeightRange
),
410 compareRanges
, NULL
, FALSE
, &errorCode
);
411 /* ignore error code: we know that the internal sort function will not fail here */
415 n
-= ranges
[i
].count
; // still >0
421 CollationWeights::allocWeightsInMinLengthRanges(int32_t n
, int32_t minLength
) {
422 // See if the minLength ranges have enough weights
423 // when we split one and lengthen the following ones.
425 int32_t minLengthRangeCount
;
426 for(minLengthRangeCount
= 0;
427 minLengthRangeCount
< rangeCount
&&
428 ranges
[minLengthRangeCount
].length
== minLength
;
429 ++minLengthRangeCount
) {
430 count
+= ranges
[minLengthRangeCount
].count
;
433 int32_t nextCountBytes
= countBytes(minLength
+ 1);
434 if(n
> count
* nextCountBytes
) { return FALSE
; }
436 // Use the minLength ranges. Merge them, and then split again as necessary.
437 uint32_t start
= ranges
[0].start
;
438 uint32_t end
= ranges
[0].end
;
439 for(int32_t i
= 1; i
< minLengthRangeCount
; ++i
) {
440 if(ranges
[i
].start
< start
) { start
= ranges
[i
].start
; }
441 if(ranges
[i
].end
> end
) { end
= ranges
[i
].end
; }
444 // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
446 // count1 + count2 * nextCountBytes = n
447 // count1 + count2 = count
449 // (count - count2) + count2 * nextCountBytes = n
450 // and then into the following count1 & count2 computations.
451 int32_t count2
= (n
- count
) / (nextCountBytes
- 1); // number of weights to be lengthened
452 int32_t count1
= count
- count2
; // number of minLength weights
453 if(count2
== 0 || (count1
+ count2
* nextCountBytes
) < n
) {
457 U_ASSERT((count1
+ count2
* nextCountBytes
) >= n
);
460 ranges
[0].start
= start
;
463 // Make one long range.
465 ranges
[0].count
= count
;
466 lengthenRange(ranges
[0]);
469 // Split the range, lengthen the second part.
471 printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
472 splitRange
, rangeCount
, count1
, count2
);
475 // Next start = start + count1. First end = 1 before that.
476 ranges
[0].end
= incWeightByOffset(start
, minLength
, count1
- 1);
477 ranges
[0].count
= count1
;
479 ranges
[1].start
= incWeight(ranges
[0].end
, minLength
);
481 ranges
[1].length
= minLength
; // +1 when lengthened
482 ranges
[1].count
= count2
; // *countBytes when lengthened
483 lengthenRange(ranges
[1]);
490 * call getWeightRanges and then determine heuristically
491 * which ranges to use for a given number of weights between (excluding)
495 CollationWeights::allocWeights(uint32_t lowerLimit
, uint32_t upperLimit
, int32_t n
) {
500 if(!getWeightRanges(lowerLimit
, upperLimit
)) {
502 printf("error: unable to get Weight ranges\n");
507 /* try until we find suitably large ranges */
509 /* get the smallest number of bytes in a range */
510 int32_t minLength
=ranges
[0].length
;
512 if(allocWeightsInShortRanges(n
, minLength
)) { break; }
516 printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
522 if(allocWeightsInMinLengthRanges(n
, minLength
)) { break; }
524 /* no good match, lengthen all minLength ranges and iterate */
526 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength
, minLength
+1);
528 for(int32_t i
=0; ranges
[i
].length
==minLength
; ++i
) {
529 lengthenRange(ranges
[i
]);
534 puts("final ranges:");
535 for(int32_t i
=0; i
<rangeCount
; ++i
) {
536 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
537 i
, ranges
[i
].start
, ranges
[i
].end
, ranges
[i
].length
, ranges
[i
].count
);
546 CollationWeights::nextWeight() {
547 if(rangeIndex
>= rangeCount
) {
550 /* get the next weight */
551 WeightRange
&range
= ranges
[rangeIndex
];
552 uint32_t weight
= range
.start
;
553 if(--range
.count
== 0) {
554 /* this range is finished */
557 /* increment the weight for the next value */
558 range
.start
= incWeight(weight
, range
.length
);
559 U_ASSERT(range
.start
<= range
.end
);
568 #endif /* #if !UCONFIG_NO_COLLATION */