[apple/icu.git] / icuSources / i18n / collationweights.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*  
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  collationweights.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001mar08 as ucol_wgt.cpp
*   created by: Markus W. Scherer
*
*   This file contains code for allocating n collation element weights
*   between two exclusive limits.
*   It is used only internally by the collation tailoring builder.
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "cmemory.h"
#include "collation.h"
#include "collationweights.h"
#include "uarrsort.h"
#include "uassert.h"

#ifdef UCOL_DEBUG
#   include <stdio.h>
#endif

U_NAMESPACE_BEGIN

/* collation element weight allocation -------------------------------------- */

/* helper functions for CE weights */

/*
 * Extracts one byte of a 32-bit weight.
 * length is the 1-based byte position counted from the most significant byte
 * (1 selects bits 31..24, 4 selects bits 7..0).
 */
static inline uint32_t
getWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift=8*(4-length);
    const uint32_t moved=weight>>shift;
    return moved&0xff;
}

/*
 * Replaces the byte at 1-based position length (counted from the most
 * significant byte) with trail, and clears every byte after that position.
 * Bytes before the position are kept.
 */
static inline uint32_t
setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
    const int32_t shift=8*(4-length);
    // Keep only the bytes above the target position.
    const uint32_t leading=weight&(0xffffff00<<shift);
    const uint32_t placed=trail<<shift;
    return leading|placed;
}

/*
 * Returns the byte at 1-based position idx of weight.
 * The computation is the same as for getWeightTrail(), so it is delegated.
 */
static inline uint32_t
getWeightByte(uint32_t weight, int32_t idx) {
    const uint32_t selected=getWeightTrail(weight, idx);
    return selected;
}

/*
 * Replaces the byte at 1-based position idx (counted from the most significant
 * byte) with byte, leaving all other bytes of weight unchanged.
 */
static inline uint32_t
setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
    const int32_t bitsThroughByte=8*idx;
    const int32_t shift=32-bitsThroughByte;

    // Mask for the bytes that follow the target byte.
    //
    // Do not use uint32_t>>32 because on some platforms that does not shift at all
    // while we need it to become 0.
    // PowerPC: 0xffffffff>>32 = 0           (wanted)
    // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
    //
    // ANSI C99 6.5.7 Bitwise shift operators:
    // "If the value of the right operand is negative
    // or is greater than or equal to the width of the promoted left operand,
    // the behavior is undefined."
    uint32_t trailingMask=0;
    if(bitsThroughByte<32) {
        trailingMask=((uint32_t)0xffffffff)>>bitsThroughByte;
    }

    // Mask for the bytes that precede the target byte.
    const uint32_t leadingMask=0xffffff00<<shift;

    // 0xffffffff except a 00 "hole" for the idx-th byte
    const uint32_t keepMask=trailingMask|leadingMask;
    return (weight&keepMask)|(byte<<shift);
}

/*
 * Keeps the first length bytes of weight (counted from the most significant
 * byte) and sets the remaining bytes to 0.
 */
static inline uint32_t
truncateWeight(uint32_t weight, int32_t length) {
    const int32_t droppedBits=8*(4-length);
    const uint32_t keepMask=0xffffffff<<droppedBits;
    return weight&keepMask;
}

/*
 * Adds 1 to the byte at 1-based position length, as plain 32-bit arithmetic:
 * carries propagate into higher bytes and the result wraps modulo 2^32.
 * No minBytes/maxBytes limits are applied here.
 */
static inline uint32_t
incWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift=8*(4-length);
    const uint32_t unit=(uint32_t)1<<shift;
    return weight+unit;
}

/*
 * Subtracts 1 from the byte at 1-based position length, as plain 32-bit
 * arithmetic: borrows propagate from higher bytes and the result wraps
 * modulo 2^32. No minBytes/maxBytes limits are applied here.
 */
static inline uint32_t
decWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift=8*(4-length);
    const uint32_t unit=(uint32_t)1<<shift;
    return weight-unit;
}

/*
 * Creates an unconfigured allocator: no middle length, no ranges,
 * and all five minBytes/maxBytes entries set to 0.
 * One of the initFor...() functions sets the real byte limits.
 */
CollationWeights::CollationWeights()
        : middleLength(0), rangeIndex(0), rangeCount(0) {
    for(int32_t idx = 4; idx >= 0; --idx) {
        minBytes[idx] = 0;
        maxBytes[idx] = 0;
    }
}

/*
 * Sets the per-byte limits for allocating primary weights.
 * The middle (shortest) range length is 1 byte.
 * compressible selects a narrower second-byte range bounded by the
 * Collation::PRIMARY_COMPRESSION_LOW_BYTE/HIGH_BYTE constants.
 */
void
CollationWeights::initForPrimary(UBool compressible) {
    middleLength=1;

    // Lead byte.
    minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
    maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;

    // Second byte.
    if(!compressible) {
        minBytes[2] = 2;
        maxBytes[2] = 0xff;
    } else {
        minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
        maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
    }

    // Third and fourth bytes share the same limits.
    for(int32_t idx = 3; idx <= 4; ++idx) {
        minBytes[idx] = 2;
        maxBytes[idx] = 0xff;
    }
}

/*
 * Sets the per-byte limits for allocating secondary weights.
 * Only the lower 16 bits are used, so byte positions 1 and 2 are fixed at 0
 * and the middle (shortest) range length is 3.
 */
void
CollationWeights::initForSecondary() {
    middleLength=3;

    // The upper two bytes are unused.
    for(int32_t idx = 1; idx <= 2; ++idx) {
        minBytes[idx] = 0;
        maxBytes[idx] = 0;
    }

    // The usable bytes are positions 3 and 4.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    minBytes[4] = 2;
    maxBytes[3] = 0xff;
    maxBytes[4] = 0xff;
}

/*
 * Sets the per-byte limits for allocating tertiary weights.
 * Only the lower 16 bits are used, so byte positions 1 and 2 are fixed at 0
 * and the middle (shortest) range length is 3.
 */
void
CollationWeights::initForTertiary() {
    middleLength=3;

    // The upper two bytes are unused.
    for(int32_t idx = 1; idx <= 2; ++idx) {
        minBytes[idx] = 0;
        maxBytes[idx] = 0;
    }

    // We use only 6 bits per byte.
    // The other bits are used for case & quaternary weights.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    minBytes[4] = 2;
    maxBytes[3] = 0x3f;
    maxBytes[4] = 0x3f;
}

/*
 * Returns the next weight after weight, for weights of the given byte length,
 * honoring the per-byte minBytes/maxBytes limits.
 * A byte that is already at its maximum is reset to its minimum
 * and the increment carries into the preceding byte.
 */
uint32_t
CollationWeights::incWeight(uint32_t weight, int32_t length) const {
    uint32_t current=getWeightByte(weight, length);
    while(current>=maxBytes[length]) {
        // Roll over, set this byte to the minimum and increment the previous one.
        weight=setWeightByte(weight, length, minBytes[length]);
        --length;
        U_ASSERT(length > 0);
        current=getWeightByte(weight, length);
    }
    return setWeightByte(weight, length, current+1);
}

/*
 * Returns weight advanced by offset steps, for weights of the given byte length,
 * honoring the per-byte minBytes/maxBytes limits.
 * If the byte at position length cannot absorb the whole offset,
 * the remainder is carried into the preceding byte(s).
 */
uint32_t
CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
    offset += getWeightByte(weight, length);
    while((uint32_t)offset > maxBytes[length]) {
        // Split the offset between this byte and the previous one.
        offset -= minBytes[length];
        weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
        offset /= countBytes(length);
        --length;
        U_ASSERT(length > 0);
        // Add the carry to the previous byte and check that one next.
        offset += getWeightByte(weight, length);
    }
    return setWeightByte(weight, length, offset);
}

/*
 * Lengthens range by one byte, in place:
 * start gets the minimum byte and end gets the maximum byte at the new position,
 * and count is multiplied by the number of byte values at that position.
 */
void
CollationWeights::lengthenRange(WeightRange &range) const {
    const int32_t newLength=range.length+1;
    range.length=newLength;
    range.count*=countBytes(newLength);
    range.start=setWeightTrail(range.start, newLength, minBytes[newLength]);
    range.end=setWeightTrail(range.end, newLength, maxBytes[newLength]);
}

/*
 * Comparator for uprv_sortArray: orders WeightRange items by their start weights.
 * Returns -1, 0 or 1; the context argument is unused.
 */
static int32_t U_CALLCONV
compareRanges(const void * /*context*/, const void *left, const void *right) {
    const uint32_t leftStart=((const CollationWeights::WeightRange *)left)->start;
    const uint32_t rightStart=((const CollationWeights::WeightRange *)right)->start;
    if(leftStart==rightStart) {
        return 0;
    }
    return leftStart<rightStart ? -1 : 1;
}

/*
 * Computes the ranges of weights that lie strictly between lowerLimit and upperLimit
 * and stores them in ranges[], shortest first, setting rangeCount.
 *
 * Returns FALSE if lowerLimit>=upperLimit, if lowerLimit is a prefix of upperLimit,
 * or if no range with a positive count remains; returns TRUE otherwise.
 *
 * The UCOL_DEBUG output casts its 32-bit arguments to (unsigned) long
 * so that they match the %lx/%ld conversions on platforms where long is 64 bits.
 */
UBool
CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
    U_ASSERT(lowerLimit != 0);
    U_ASSERT(upperLimit != 0);

    /* get the lengths of the limits */
    int32_t lowerLength=lengthOfWeight(lowerLimit);
    int32_t upperLength=lengthOfWeight(upperLimit);

#ifdef UCOL_DEBUG
    printf("length of lower limit 0x%08lx is %ld\n", (unsigned long)lowerLimit, (long)lowerLength);
    printf("length of upper limit 0x%08lx is %ld\n", (unsigned long)upperLimit, (long)upperLength);
#endif
    U_ASSERT(lowerLength>=middleLength);
    // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.

    if(lowerLimit>=upperLimit) {
#ifdef UCOL_DEBUG
        printf("error: no space between lower & upper limits\n");
#endif
        return FALSE;
    }

    /* check that neither is a prefix of the other */
    if(lowerLength<upperLength) {
        if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
#ifdef UCOL_DEBUG
            printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n",
                   (unsigned long)lowerLimit, (unsigned long)upperLimit);
#endif
            return FALSE;
        }
    }
    /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */

    WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    uprv_memset(lower, 0, sizeof(lower));
    uprv_memset(&middle, 0, sizeof(middle));
    uprv_memset(upper, 0, sizeof(upper));

    /*
     * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
     * range     minimum length
     * lower[4]  4
     * lower[3]  3
     * lower[2]  2
     * middle    1
     * upper[2]  2
     * upper[3]  3
     * upper[4]  4
     *
     * We are now going to calculate up to 7 ranges.
     * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
     */

    // Lower ranges: for each length above middleLength, the weights that share
    // the lower limit's prefix and have a larger byte at that position.
    uint32_t weight=lowerLimit;
    for(int32_t length=lowerLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail<maxBytes[length]) {
            lower[length].start=incWeightTrail(weight, length);
            lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
            lower[length].length=length;
            lower[length].count=maxBytes[length]-trail;
        }
        weight=truncateWeight(weight, length-1);
    }
    if(weight<0xff000000) {
        middle.start=incWeightTrail(weight, middleLength);
    } else {
        // Prevent overflow for primary lead byte FF
        // which would yield a middle range starting at 0.
        middle.start=0xffffffff;  // no middle range
    }

    // Upper ranges: for each length above middleLength, the weights that share
    // the upper limit's prefix and have a smaller byte at that position.
    weight=upperLimit;
    for(int32_t length=upperLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail>minBytes[length]) {
            upper[length].start=setWeightTrail(weight, length, minBytes[length]);
            upper[length].end=decWeightTrail(weight, length);
            upper[length].length=length;
            upper[length].count=trail-minBytes[length];
        }
        weight=truncateWeight(weight, length-1);
    }
    middle.end=decWeightTrail(weight, middleLength);

    /* set the middle range */
    middle.length=middleLength;
    if(middle.end>=middle.start) {
        middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
    } else {
        /* no middle range, eliminate overlaps */
        for(int32_t length=4; length>middleLength; --length) {
            if(lower[length].count>0 && upper[length].count>0) {
                // Note: The lowerEnd and upperStart weights are versions of
                // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
                // truncated (still less-or-equal)
                // and then with their last bytes changed to the
                // maxByte (for lowerEnd) or minByte (for upperStart).
                const uint32_t lowerEnd=lower[length].end;
                const uint32_t upperStart=upper[length].start;
                UBool merged=FALSE;

                if(lowerEnd>upperStart) {
                    // These two lower and upper ranges collide.
                    // Since lowerLimit<upperLimit and lowerEnd and upperStart
                    // are versions with only their last bytes modified
                    // (and following ones removed/reset to 0),
                    // lowerEnd>upperStart is only possible
                    // if the leading bytes are equal
                    // and lastByte(lowerEnd)>lastByte(upperStart).
                    U_ASSERT(truncateWeight(lowerEnd, length-1)==
                            truncateWeight(upperStart, length-1));
                    // Intersect these two ranges.
                    lower[length].end=upper[length].end;
                    lower[length].count=
                            (int32_t)getWeightTrail(lower[length].end, length)-
                            (int32_t)getWeightTrail(lower[length].start, length)+1;
                    // count might be <=0 in which case there is no room,
                    // and the range-collecting code below will ignore this range.
                    merged=TRUE;
                } else if(lowerEnd==upperStart) {
                    // Not possible, unless minByte==maxByte which is not allowed.
                    U_ASSERT(minBytes[length]<maxBytes[length]);
                } else /* lowerEnd<upperStart */ {
                    if(incWeight(lowerEnd, length)==upperStart) {
                        // Merge adjacent ranges.
                        lower[length].end=upper[length].end;
                        lower[length].count+=upper[length].count;  // might be >countBytes
                        merged=TRUE;
                    }
                }
                if(merged) {
                    // Remove all shorter ranges.
                    // There was no room available for them between the ranges we just merged.
                    upper[length].count=0;
                    while(--length>middleLength) {
                        lower[length].count=upper[length].count=0;
                    }
                    break;
                }
            }
        }
    }

#ifdef UCOL_DEBUG
    /* print ranges */
    for(int32_t length=4; length>=2; --length) {
        if(lower[length].count>0) {
            printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)lower[length].start,
                   (unsigned long)lower[length].end, (long)lower[length].count);
        }
    }
    if(middle.count>0) {
        printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n",
               (unsigned long)middle.start, (unsigned long)middle.end, (long)middle.count);
    }
    for(int32_t length=2; length<=4; ++length) {
        if(upper[length].count>0) {
            printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)upper[length].start,
                   (unsigned long)upper[length].end, (long)upper[length].count);
        }
    }
#endif

    /* copy the ranges, shortest first, into the result array */
    rangeCount=0;
    if(middle.count>0) {
        uprv_memcpy(ranges, &middle, sizeof(WeightRange));
        rangeCount=1;
    }
    for(int32_t length=middleLength+1; length<=4; ++length) {
        /* copy upper first so that later the middle range is more likely the first one to use */
        if(upper[length].count>0) {
            uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
            ++rangeCount;
        }
        if(lower[length].count>0) {
            uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
            ++rangeCount;
        }
    }
    return rangeCount>0;
}

/*
 * Tries to satisfy a request for n weights using only the leading ranges
 * whose length is minLength or minLength+1.
 *
 * On success, rangeCount is cut down to the ranges that are used,
 * those ranges are sorted by start weight, and TRUE is returned.
 * On failure, FALSE is returned and the ranges are left unchanged.
 *
 * The UCOL_DEBUG output casts rangeCount to long to match the %ld conversion.
 */
UBool
CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
    // See if the first few minLength and minLength+1 ranges have enough weights.
    for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
        if(n <= ranges[i].count) {
            // Use the first few minLength and minLength+1 ranges.
            if(ranges[i].length > minLength) {
                // Reduce the number of weights from the last minLength+1 range
                // which might sort before some minLength ranges,
                // so that we use all weights in the minLength ranges.
                ranges[i].count = n;
            }
            rangeCount = i + 1;
#ifdef UCOL_DEBUG
            printf("take first %ld ranges\n", (long)rangeCount);
#endif

            if(rangeCount>1) {
                /* sort the ranges by weight values */
                UErrorCode errorCode=U_ZERO_ERROR;
                uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
                               compareRanges, NULL, FALSE, &errorCode);
                /* ignore error code: we know that the internal sort function will not fail here */
            }
            return TRUE;
        }
        n -= ranges[i].count;  // still >0
    }
    return FALSE;
}

/*
 * Tries to satisfy a request for n weights using only the leading minLength ranges,
 * by merging them into one span and lengthening part (or all) of it by one byte.
 *
 * On success, ranges[] holds one or two ranges (rangeCount is 1 or 2)
 * and TRUE is returned.
 * Returns FALSE, leaving the ranges unchanged, if even lengthening all of the
 * minLength weights would not yield n weights.
 *
 * The UCOL_DEBUG output used to reference an undeclared variable (splitRange),
 * which broke UCOL_DEBUG builds; it now reports the number of merged minLength ranges.
 */
UBool
CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
    // See if the minLength ranges have enough weights
    // when we split one and lengthen the following ones.
    int32_t count = 0;
    int32_t minLengthRangeCount;
    for(minLengthRangeCount = 0;
            minLengthRangeCount < rangeCount &&
                ranges[minLengthRangeCount].length == minLength;
            ++minLengthRangeCount) {
        count += ranges[minLengthRangeCount].count;
    }

    int32_t nextCountBytes = countBytes(minLength + 1);
    if(n > count * nextCountBytes) { return FALSE; }

    // Use the minLength ranges. Merge them, and then split again as necessary.
    uint32_t start = ranges[0].start;
    uint32_t end = ranges[0].end;
    for(int32_t i = 1; i < minLengthRangeCount; ++i) {
        if(ranges[i].start < start) { start = ranges[i].start; }
        if(ranges[i].end > end) { end = ranges[i].end; }
    }

    // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
    // Goal:
    //   count1 + count2 * nextCountBytes = n
    //   count1 + count2 = count
    // These turn into
    //   (count - count2) + count2 * nextCountBytes = n
    // and then into the following count1 & count2 computations.
    int32_t count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
    int32_t count1 = count - count2;  // number of minLength weights
    if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
        // round up
        ++count2;
        --count1;
        U_ASSERT((count1 + count2 * nextCountBytes) >= n);
    }

    ranges[0].start = start;

    if(count1 == 0) {
        // Make one long range.
        ranges[0].end = end;
        ranges[0].count = count;
        lengthenRange(ranges[0]);
        rangeCount = 1;
    } else {
        // Split the range, lengthen the second part.
#ifdef UCOL_DEBUG
        printf("split the merged range (from %ld minLength ranges) by %ld:%ld\n",
               (long)minLengthRangeCount, (long)count1, (long)count2);
#endif

        // Next start = start + count1. First end = 1 before that.
        ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
        ranges[0].count = count1;

        ranges[1].start = incWeight(ranges[0].end, minLength);
        ranges[1].end = end;
        ranges[1].length = minLength;  // +1 when lengthened
        ranges[1].count = count2;  // *countBytes when lengthened
        lengthenRange(ranges[1]);
        rangeCount = 2;
    }
    return TRUE;
}

/*
 * call getWeightRanges and then determine heuristically
 * which ranges to use for a given number of weights between (excluding)
 * two limits
 *
 * Returns TRUE if n weights can be allocated between lowerLimit and upperLimit;
 * the weights are then available one at a time from nextWeight().
 * Returns FALSE if there is no room between the limits
 * or if even 4-byte weights do not provide n weights.
 */
UBool
CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
#ifdef UCOL_DEBUG
    puts("");
#endif

    if(!getWeightRanges(lowerLimit, upperLimit)) {
#ifdef UCOL_DEBUG
        printf("error: unable to get Weight ranges\n");
#endif
        return FALSE;
    }

    /* try until we find suitably large ranges */
    for(;;) {
        /* get the smallest number of bytes in a range */
        int32_t minLength=ranges[0].length;

        if(allocWeightsInShortRanges(n, minLength)) { break; }

        if(minLength == 4) {
#ifdef UCOL_DEBUG
            // Total number of weights in the ranges that were just rejected.
            long available=0;
            for(int32_t i=0; i<rangeCount; ++i) {
                available+=ranges[i].count;
            }
            printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
                   available, (long)n);
#endif
            return FALSE;
        }

        if(allocWeightsInMinLengthRanges(n, minLength)) { break; }

        /* no good match, lengthen all minLength ranges and iterate */
#ifdef UCOL_DEBUG
        printf("lengthen the short ranges from %ld bytes to %ld and iterate\n",
               (long)minLength, (long)(minLength+1));
#endif
        // Stop at rangeCount: entries beyond it are stale or uninitialized
        // and must be neither read nor lengthened.
        for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
            lengthenRange(ranges[i]);
        }
    }

#ifdef UCOL_DEBUG
    puts("final ranges:");
    for(int32_t i=0; i<rangeCount; ++i) {
        printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
               (long)i, (unsigned long)ranges[i].start, (unsigned long)ranges[i].end,
               (long)ranges[i].length, (long)ranges[i].count);
    }
#endif

    rangeIndex = 0;
    return TRUE;
}

uint32_t
CollationWeights::nextWeight() {
    if(rangeIndex >= rangeCount) {
        return 0xffffffff;
    } else {
        /* get the next weight */
        WeightRange &range = ranges[rangeIndex];
        uint32_t weight = range.start;
        if(--range.count == 0) {
            /* this range is finished */
            ++rangeIndex;
        } else {
            /* increment the weight for the next value */
            range.start = incWeight(weight, range.length);
            U_ASSERT(range.start <= range.end);
        }

        return weight;
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_COLLATION */
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
57a6839d A	3	/*
	4	*******************************************************************************
	5	*
2ca993e8	6	* Copyright (C) 1999-2015, International Business Machines
57a6839d A	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: collationweights.cpp
f3c0d7a5	11	* encoding: UTF-8
57a6839d A	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2001mar08 as ucol_wgt.cpp
	16	* created by: Markus W. Scherer
	17	*
	18	* This file contains code for allocating n collation element weights
	19	* between two exclusive limits.
	20	* It is used only internally by the collation tailoring builder.
	21	*/
	22
	23	#include "unicode/utypes.h"
	24
	25	#if !UCONFIG_NO_COLLATION
	26
	27	#include "cmemory.h"
	28	#include "collation.h"
	29	#include "collationweights.h"
	30	#include "uarrsort.h"
	31	#include "uassert.h"
	32
	33	#ifdef UCOL_DEBUG
	34	# include <stdio.h>
	35	#endif
	36
	37	U_NAMESPACE_BEGIN
	38
	39	/* collation element weight allocation -------------------------------------- */
	40
	41	/* helper functions for CE weights */
	42
	43	static inline uint32_t
	44	getWeightTrail(uint32_t weight, int32_t length) {
	45	return (uint32_t)(weight>>(8*(4-length)))&0xff;
	46	}
	47
	48	static inline uint32_t
	49	setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
	50	length=8*(4-length);
	51	return (uint32_t)((weight&(0xffffff00<<length))\|(trail<<length));
	52	}
	53
	54	static inline uint32_t
	55	getWeightByte(uint32_t weight, int32_t idx) {
	56	return getWeightTrail(weight, idx); /* same calculation */
	57	}
	58
	59	static inline uint32_t
	60	setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
	61	uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
	62
	63	idx*=8;
	64	if(idx<32) {
	65	mask=((uint32_t)0xffffffff)>>idx;
	66	} else {
	67	// Do not use uint32_t>>32 because on some platforms that does not shift at all
	68	// while we need it to become 0.
	69	// PowerPC: 0xffffffff>>32 = 0 (wanted)
	70	// x86: 0xffffffff>>32 = 0xffffffff (not wanted)
	71	//
	72	// ANSI C99 6.5.7 Bitwise shift operators:
	73	// "If the value of the right operand is negative
	74	// or is greater than or equal to the width of the promoted left operand,
	75	// the behavior is undefined."
76	mask=0;
77	}
78	idx=32-idx;
79	mask\|=0xffffff00<<idx;
80	return (uint32_t)((weight&mask)\|(byte<<idx));
81	}
82
83	static inline uint32_t
84	truncateWeight(uint32_t weight, int32_t length) {
85	return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
86	}
87
88	static inline uint32_t
89	incWeightTrail(uint32_t weight, int32_t length) {
90	return (uint32_t)(weight+(1UL<<(8*(4-length))));
91	}
92
93	static inline uint32_t
94	decWeightTrail(uint32_t weight, int32_t length) {
95	return (uint32_t)(weight-(1UL<<(8*(4-length))));
96	}
97
98	CollationWeights::CollationWeights()
99	: middleLength(0), rangeIndex(0), rangeCount(0) {
100	for(int32_t i = 0; i < 5; ++i) {
101	minBytes[i] = maxBytes[i] = 0;
102	}
103	}
104
105	void
106	CollationWeights::initForPrimary(UBool compressible) {
107	middleLength=1;
108	minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
109	maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
110	if(compressible) {
111	minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
112	maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
113	} else {
114	minBytes[2] = 2;
115	maxBytes[2] = 0xff;
116	}
117	minBytes[3] = 2;
118	maxBytes[3] = 0xff;
119	minBytes[4] = 2;
120	maxBytes[4] = 0xff;
121	}
122
123	void
124	CollationWeights::initForSecondary() {
125	// We use only the lower 16 bits for secondary weights.
126	middleLength=3;
127	minBytes[1] = 0;
128	maxBytes[1] = 0;
129	minBytes[2] = 0;
130	maxBytes[2] = 0;
b331163b	131	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
57a6839d A	132	maxBytes[3] = 0xff;
	133	minBytes[4] = 2;
	134	maxBytes[4] = 0xff;
	135	}
	136
	137	void
	138	CollationWeights::initForTertiary() {
	139	// We use only the lower 16 bits for tertiary weights.
	140	middleLength=3;
	141	minBytes[1] = 0;
	142	maxBytes[1] = 0;
	143	minBytes[2] = 0;
	144	maxBytes[2] = 0;
	145	// We use only 6 bits per byte.
	146	// The other bits are used for case & quaternary weights.
b331163b	147	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
57a6839d A	148	maxBytes[3] = 0x3f;
	149	minBytes[4] = 2;
	150	maxBytes[4] = 0x3f;
	151	}
	152
	153	uint32_t
	154	CollationWeights::incWeight(uint32_t weight, int32_t length) const {
	155	for(;;) {
	156	uint32_t byte=getWeightByte(weight, length);
	157	if(byte<maxBytes[length]) {
	158	return setWeightByte(weight, length, byte+1);
	159	} else {
	160	// Roll over, set this byte to the minimum and increment the previous one.
	161	weight=setWeightByte(weight, length, minBytes[length]);
	162	--length;
	163	U_ASSERT(length > 0);
	164	}
	165	}
	166	}
	167
	168	uint32_t
	169	CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
	170	for(;;) {
	171	offset += getWeightByte(weight, length);
	172	if((uint32_t)offset <= maxBytes[length]) {
	173	return setWeightByte(weight, length, offset);
	174	} else {
	175	// Split the offset between this byte and the previous one.
	176	offset -= minBytes[length];
	177	weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
	178	offset /= countBytes(length);
	179	--length;
	180	U_ASSERT(length > 0);
	181	}
	182	}
	183	}
	184
	185	void
	186	CollationWeights::lengthenRange(WeightRange &range) const {
	187	int32_t length=range.length+1;
	188	range.start=setWeightTrail(range.start, length, minBytes[length]);
	189	range.end=setWeightTrail(range.end, length, maxBytes[length]);
	190	range.count*=countBytes(length);
	191	range.length=length;
	192	}
	193
	194	/* for uprv_sortArray: sort ranges in weight order */
	195	static int32_t U_CALLCONV
	196	compareRanges(const void * /context/, const void left, const void right) {
	197	uint32_t l, r;
	198
	199	l=((const CollationWeights::WeightRange *)left)->start;
	200	r=((const CollationWeights::WeightRange *)right)->start;
	201	if(l<r) {
	202	return -1;
	203	} else if(l>r) {
	204	return 1;
	205	} else {
	206	return 0;
	207	}
	208	}
	209
	210	UBool
	211	CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
212	U_ASSERT(lowerLimit != 0);
213	U_ASSERT(upperLimit != 0);
214
215	/* get the lengths of the limits */
216	int32_t lowerLength=lengthOfWeight(lowerLimit);
217	int32_t upperLength=lengthOfWeight(upperLimit);
218
219	#ifdef UCOL_DEBUG
220	printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
221	printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
222	#endif
223	U_ASSERT(lowerLength>=middleLength);
224	// Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
225
226	if(lowerLimit>=upperLimit) {
227	#ifdef UCOL_DEBUG
228	printf("error: no space between lower & upper limits\n");
229	#endif
230	return FALSE;
231	}
232
233	/* check that neither is a prefix of the other */
234	if(lowerLength<upperLength) {
235	if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
236	#ifdef UCOL_DEBUG
237	printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
238	#endif
239	return FALSE;
240	}
241	}
242	/* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
243
244	WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
245	uprv_memset(lower, 0, sizeof(lower));
246	uprv_memset(&middle, 0, sizeof(middle));
247	uprv_memset(upper, 0, sizeof(upper));
248
249	/*
250	* With the limit lengths of 1..4, there are up to 7 ranges for allocation:
251	* range minimum length
252	* lower[4] 4
253	* lower[3] 3
254	* lower[2] 2
255	* middle 1
256	* upper[2] 2
257	* upper[3] 3
258	* upper[4] 4
259	*
260	* We are now going to calculate up to 7 ranges.
261	* Some of them will typically overlap, so we will then have to merge and eliminate ranges.
262	*/
263	uint32_t weight=lowerLimit;
264	for(int32_t length=lowerLength; length>middleLength; --length) {
265	uint32_t trail=getWeightTrail(weight, length);
266	if(trail<maxBytes[length]) {
267	lower[length].start=incWeightTrail(weight, length);
268	lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
269	lower[length].length=length;
270	lower[length].count=maxBytes[length]-trail;
271	}
272	weight=truncateWeight(weight, length-1);
273	}
274	if(weight<0xff000000) {
275	middle.start=incWeightTrail(weight, middleLength);
276	} else {
277	// Prevent overflow for primary lead byte FF
278	// which would yield a middle range starting at 0.
279	middle.start=0xffffffff; // no middle range
280	}
281
282	weight=upperLimit;
283	for(int32_t length=upperLength; length>middleLength; --length) {
284	uint32_t trail=getWeightTrail(weight, length);
285	if(trail>minBytes[length]) {
286	upper[length].start=setWeightTrail(weight, length, minBytes[length]);
287	upper[length].end=decWeightTrail(weight, length);
288	upper[length].length=length;
289	upper[length].count=trail-minBytes[length];
290	}
291	weight=truncateWeight(weight, length-1);
292	}
293	middle.end=decWeightTrail(weight, middleLength);
294
295	/* set the middle range */
296	middle.length=middleLength;
297	if(middle.end>=middle.start) {
298	middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
299	} else {
300	/* no middle range, eliminate overlaps */
57a6839d A	301	for(int32_t length=4; length>middleLength; --length) {
57a6839d A	302	if(lower[length].count>0 && upper[length].count>0) {
2ca993e8 A	303	// Note: The lowerEnd and upperStart weights are versions of
	304	// lowerLimit and upperLimit (which are lowerLimit<upperLimit),
	305	// truncated (still less-or-equal)
	306	// and then with their last bytes changed to the
	307	// maxByte (for lowerEnd) or minByte (for upperStart).
	308	const uint32_t lowerEnd=lower[length].end;
	309	const uint32_t upperStart=upper[length].start;
	310	UBool merged=FALSE;
	311
	312	if(lowerEnd>upperStart) {
	313	// These two lower and upper ranges collide.
	314	// Since lowerLimit<upperLimit and lowerEnd and upperStart
	315	// are versions with only their last bytes modified
	316	// (and following ones removed/reset to 0),
	317	// lowerEnd>upperStart is only possible
	318	// if the leading bytes are equal
	319	// and lastByte(lowerEnd)>lastByte(upperStart).
	320	U_ASSERT(truncateWeight(lowerEnd, length-1)==
	321	truncateWeight(upperStart, length-1));
	322	// Intersect these two ranges.
	323	lower[length].end=upper[length].end;
57a6839d	324	lower[length].count=
2ca993e8 A	325	(int32_t)getWeightTrail(lower[length].end, length)-
	326	(int32_t)getWeightTrail(lower[length].start, length)+1;
	327	// count might be <=0 in which case there is no room,
	328	// and the range-collecting code below will ignore this range.
	329	merged=TRUE;
	330	} else if(lowerEnd==upperStart) {
	331	// Not possible, unless minByte==maxByte which is not allowed.
	332	U_ASSERT(minBytes[length]<maxBytes[length]);
	333	} else /* lowerEnd<upperStart */ {
	334	if(incWeight(lowerEnd, length)==upperStart) {
	335	// Merge adjacent ranges.
	336	lower[length].end=upper[length].end;
	337	lower[length].count+=upper[length].count; // might be >countBytes
	338	merged=TRUE;
	339	}
	340	}
	341	if(merged) {
	342	// Remove all shorter ranges.
	343	// There was no room available for them between the ranges we just merged.
57a6839d A	344	upper[length].count=0;
	345	while(--length>middleLength) {
	346	lower[length].count=upper[length].count=0;
	347	}
	348	break;
	349	}
	350	}
	351	}
	352	}
	353
	354	#ifdef UCOL_DEBUG
	355	/* print ranges */
	356	for(int32_t length=4; length>=2; --length) {
	357	if(lower[length].count>0) {
	358	printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
	359	}
	360	}
	361	if(middle.count>0) {
	362	printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
	363	}
	364	for(int32_t length=2; length<=4; ++length) {
	365	if(upper[length].count>0) {
	366	printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
	367	}
	368	}
	369	#endif
	370
	371	/* copy the ranges, shortest first, into the result array */
	372	rangeCount=0;
	373	if(middle.count>0) {
	374	uprv_memcpy(ranges, &middle, sizeof(WeightRange));
	375	rangeCount=1;
	376	}
	377	for(int32_t length=middleLength+1; length<=4; ++length) {
	378	/* copy upper first so that later the middle range is more likely the first one to use */
	379	if(upper[length].count>0) {
	380	uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
	381	++rangeCount;
	382	}
	383	if(lower[length].count>0) {
	384	uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
	385	++rangeCount;
	386	}
	387	}
	388	return rangeCount>0;
	389	}
	390
	391	UBool
	392	CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
	393	// See if the first few minLength and minLength+1 ranges have enough weights.
	394	for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
	395	if(n <= ranges[i].count) {
	396	// Use the first few minLength and minLength+1 ranges.
	397	if(ranges[i].length > minLength) {
	398	// Reduce the number of weights from the last minLength+1 range
	399	// which might sort before some minLength ranges,
	400	// so that we use all weights in the minLength ranges.
	401	ranges[i].count = n;
	402	}
	403	rangeCount = i + 1;
	404	#ifdef UCOL_DEBUG
	405	printf("take first %ld ranges\n", rangeCount);
	406	#endif
	407
408	if(rangeCount>1) {
409	/* sort the ranges by weight values */
410	UErrorCode errorCode=U_ZERO_ERROR;
411	uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
412	compareRanges, NULL, FALSE, &errorCode);
413	/* ignore error code: we know that the internal sort function will not fail here */
414	}
415	return TRUE;
416	}
417	n -= ranges[i].count; // still >0
418	}
419	return FALSE;
420	}
421
422	UBool
423	CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
424	// See if the minLength ranges have enough weights
425	// when we split one and lengthen the following ones.
426	int32_t count = 0;
427	int32_t minLengthRangeCount;
428	for(minLengthRangeCount = 0;
429	minLengthRangeCount < rangeCount &&
430	ranges[minLengthRangeCount].length == minLength;
431	++minLengthRangeCount) {
432	count += ranges[minLengthRangeCount].count;
433	}
434
435	int32_t nextCountBytes = countBytes(minLength + 1);
436	if(n > count * nextCountBytes) { return FALSE; }
437
438	// Use the minLength ranges. Merge them, and then split again as necessary.
439	uint32_t start = ranges[0].start;
440	uint32_t end = ranges[0].end;
441	for(int32_t i = 1; i < minLengthRangeCount; ++i) {
442	if(ranges[i].start < start) { start = ranges[i].start; }
443	if(ranges[i].end > end) { end = ranges[i].end; }
444	}
445
446	// Calculate how to split the range between minLength (count1) and minLength+1 (count2).
447	// Goal:
448	// count1 + count2 * nextCountBytes = n
449	// count1 + count2 = count
450	// These turn into
451	// (count - count2) + count2 * nextCountBytes = n
452	// and then into the following count1 & count2 computations.
453	int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
454	int32_t count1 = count - count2; // number of minLength weights
455	if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {
456	// round up
457	++count2;
458	--count1;
459	U_ASSERT((count1 + count2 * nextCountBytes) >= n);
460	}
461
462	ranges[0].start = start;
463
464	if(count1 == 0) {
465	// Make one long range.
466	ranges[0].end = end;
467	ranges[0].count = count;
468	lengthenRange(ranges[0]);
469	rangeCount = 1;
470	} else {
471	// Split the range, lengthen the second part.
472	#ifdef UCOL_DEBUG
473	printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
474	splitRange, rangeCount, count1, count2);
475	#endif
476
477	// Next start = start + count1. First end = 1 before that.
478	ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
479	ranges[0].count = count1;
480
481	ranges[1].start = incWeight(ranges[0].end, minLength);
482	ranges[1].end = end;
483	ranges[1].length = minLength; // +1 when lengthened
484	ranges[1].count = count2; // *countBytes when lengthened
485	lengthenRange(ranges[1]);
486	rangeCount = 2;
487	}
488	return TRUE;
489	}
490
491	/*
492	* call getWeightRanges and then determine heuristically
493	* which ranges to use for a given number of weights between (excluding)
494	* two limits
495	*/
496	UBool
497	CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
498	#ifdef UCOL_DEBUG
499	puts("");
500	#endif
501
502	if(!getWeightRanges(lowerLimit, upperLimit)) {
503	#ifdef UCOL_DEBUG
504	printf("error: unable to get Weight ranges\n");
505	#endif
506	return FALSE;
507	}
508
509	/* try until we find suitably large ranges */
510	for(;;) {
511	/* get the smallest number of bytes in a range */
512	int32_t minLength=ranges[0].length;
513
514	if(allocWeightsInShortRanges(n, minLength)) { break; }
515
516	if(minLength == 4) {
517	#ifdef UCOL_DEBUG
518	printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
519	minLengthCount, n);
520	#endif
521	return FALSE;
522	}
523
524	if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
525
526	/* no good match, lengthen all minLength ranges and iterate */
527	#ifdef UCOL_DEBUG
528	printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
529	#endif
530	for(int32_t i=0; ranges[i].length==minLength; ++i) {
531	lengthenRange(ranges[i]);
532	}
533	}
534
535	#ifdef UCOL_DEBUG
536	puts("final ranges:");
537	for(int32_t i=0; i<rangeCount; ++i) {
538	printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
539	i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
540	}
541	#endif
542
543	rangeIndex = 0;
544	return TRUE;
545	}
546
547	uint32_t
548	CollationWeights::nextWeight() {
549	if(rangeIndex >= rangeCount) {
550	return 0xffffffff;
551	} else {
552	/* get the next weight */
553	WeightRange &range = ranges[rangeIndex];
554	uint32_t weight = range.start;
555	if(--range.count == 0) {
556	/* this range is finished */
557	++rangeIndex;
558	} else {
559	/* increment the weight for the next value */
560	range.start = incWeight(weight, range.length);
561	U_ASSERT(range.start <= range.end);
562	}
563
564	return weight;
565	}
566	}
567
568	U_NAMESPACE_END
569
570	#endif /* #if !UCONFIG_NO_COLLATION */