git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/collationweights.cpp

... / ...

Commit	Line	Data
	1	/*
	2	*******************************************************************************
	3	*
	4	* Copyright (C) 1999-2015, International Business Machines
	5	* Corporation and others. All Rights Reserved.
	6	*
	7	*******************************************************************************
	8	* file name: collationweights.cpp
	9	* encoding: US-ASCII
	10	* tab size: 8 (not used)
	11	* indentation:4
	12	*
	13	* created on: 2001mar08 as ucol_wgt.cpp
	14	* created by: Markus W. Scherer
	15	*
	16	* This file contains code for allocating n collation element weights
	17	* between two exclusive limits.
	18	* It is used only internally by the collation tailoring builder.
	19	*/
	20
	21	#include "unicode/utypes.h"
	22
	23	#if !UCONFIG_NO_COLLATION
	24
	25	#include "cmemory.h"
	26	#include "collation.h"
	27	#include "collationweights.h"
	28	#include "uarrsort.h"
	29	#include "uassert.h"
	30
	31	#ifdef UCOL_DEBUG
	32	# include <stdio.h>
	33	#endif
	34
	35	U_NAMESPACE_BEGIN
	36
	37	/* collation element weight allocation -------------------------------------- */
	38
	39	/* helper functions for CE weights */
	40
	41	static inline uint32_t
	42	getWeightTrail(uint32_t weight, int32_t length) {
	43	return (uint32_t)(weight>>(8*(4-length)))&0xff;
	44	}
	45
	46	static inline uint32_t
	47	setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
	48	length=8*(4-length);
	49	return (uint32_t)((weight&(0xffffff00<<length))\|(trail<<length));
	50	}
	51
	52	static inline uint32_t
	53	getWeightByte(uint32_t weight, int32_t idx) {
	54	return getWeightTrail(weight, idx); /* same calculation */
	55	}
	56
	57	static inline uint32_t
	58	setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
	59	uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
	60
	61	idx*=8;
	62	if(idx<32) {
	63	mask=((uint32_t)0xffffffff)>>idx;
	64	} else {
	65	// Do not use uint32_t>>32 because on some platforms that does not shift at all
	66	// while we need it to become 0.
	67	// PowerPC: 0xffffffff>>32 = 0 (wanted)
	68	// x86: 0xffffffff>>32 = 0xffffffff (not wanted)
	69	//
	70	// ANSI C99 6.5.7 Bitwise shift operators:
	71	// "If the value of the right operand is negative
	72	// or is greater than or equal to the width of the promoted left operand,
	73	// the behavior is undefined."
	74	mask=0;
	75	}
	76	idx=32-idx;
	77	mask\|=0xffffff00<<idx;
	78	return (uint32_t)((weight&mask)\|(byte<<idx));
	79	}
	80
	81	static inline uint32_t
	82	truncateWeight(uint32_t weight, int32_t length) {
	83	return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
	84	}
	85
	86	static inline uint32_t
	87	incWeightTrail(uint32_t weight, int32_t length) {
	88	return (uint32_t)(weight+(1UL<<(8*(4-length))));
	89	}
	90
	91	static inline uint32_t
	92	decWeightTrail(uint32_t weight, int32_t length) {
	93	return (uint32_t)(weight-(1UL<<(8*(4-length))));
	94	}
	95
	96	CollationWeights::CollationWeights()
	97	: middleLength(0), rangeIndex(0), rangeCount(0) {
	98	for(int32_t i = 0; i < 5; ++i) {
	99	minBytes[i] = maxBytes[i] = 0;
	100	}
	101	}
	102
	103	void
	104	CollationWeights::initForPrimary(UBool compressible) {
	105	middleLength=1;
	106	minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
	107	maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
	108	if(compressible) {
	109	minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
	110	maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
	111	} else {
	112	minBytes[2] = 2;
	113	maxBytes[2] = 0xff;
	114	}
	115	minBytes[3] = 2;
	116	maxBytes[3] = 0xff;
	117	minBytes[4] = 2;
	118	maxBytes[4] = 0xff;
	119	}
	120
	121	void
	122	CollationWeights::initForSecondary() {
	123	// We use only the lower 16 bits for secondary weights.
	124	middleLength=3;
	125	minBytes[1] = 0;
	126	maxBytes[1] = 0;
	127	minBytes[2] = 0;
	128	maxBytes[2] = 0;
	129	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
	130	maxBytes[3] = 0xff;
	131	minBytes[4] = 2;
	132	maxBytes[4] = 0xff;
	133	}
	134
	135	void
	136	CollationWeights::initForTertiary() {
	137	// We use only the lower 16 bits for tertiary weights.
	138	middleLength=3;
	139	minBytes[1] = 0;
	140	maxBytes[1] = 0;
	141	minBytes[2] = 0;
	142	maxBytes[2] = 0;
	143	// We use only 6 bits per byte.
	144	// The other bits are used for case & quaternary weights.
	145	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
	146	maxBytes[3] = 0x3f;
	147	minBytes[4] = 2;
	148	maxBytes[4] = 0x3f;
	149	}
	150
	151	uint32_t
	152	CollationWeights::incWeight(uint32_t weight, int32_t length) const {
	153	for(;;) {
	154	uint32_t byte=getWeightByte(weight, length);
	155	if(byte<maxBytes[length]) {
	156	return setWeightByte(weight, length, byte+1);
	157	} else {
	158	// Roll over, set this byte to the minimum and increment the previous one.
	159	weight=setWeightByte(weight, length, minBytes[length]);
	160	--length;
	161	U_ASSERT(length > 0);
	162	}
	163	}
	164	}
	165
	166	uint32_t
	167	CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
	168	for(;;) {
	169	offset += getWeightByte(weight, length);
	170	if((uint32_t)offset <= maxBytes[length]) {
	171	return setWeightByte(weight, length, offset);
	172	} else {
	173	// Split the offset between this byte and the previous one.
	174	offset -= minBytes[length];
	175	weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
	176	offset /= countBytes(length);
	177	--length;
	178	U_ASSERT(length > 0);
	179	}
	180	}
	181	}
	182
	183	void
	184	CollationWeights::lengthenRange(WeightRange &range) const {
	185	int32_t length=range.length+1;
	186	range.start=setWeightTrail(range.start, length, minBytes[length]);
	187	range.end=setWeightTrail(range.end, length, maxBytes[length]);
	188	range.count*=countBytes(length);
	189	range.length=length;
	190	}
	191
	192	/* for uprv_sortArray: sort ranges in weight order */
	193	static int32_t U_CALLCONV
	194	compareRanges(const void * /context/, const void left, const void right) {
	195	uint32_t l, r;
	196
	197	l=((const CollationWeights::WeightRange *)left)->start;
	198	r=((const CollationWeights::WeightRange *)right)->start;
	199	if(l<r) {
	200	return -1;
	201	} else if(l>r) {
	202	return 1;
	203	} else {
	204	return 0;
	205	}
	206	}
	207
	208	UBool
	209	CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
	210	U_ASSERT(lowerLimit != 0);
	211	U_ASSERT(upperLimit != 0);
	212
	213	/* get the lengths of the limits */
	214	int32_t lowerLength=lengthOfWeight(lowerLimit);
	215	int32_t upperLength=lengthOfWeight(upperLimit);
	216
	217	#ifdef UCOL_DEBUG
	218	printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
	219	printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
	220	#endif
	221	U_ASSERT(lowerLength>=middleLength);
	222	// Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
	223
	224	if(lowerLimit>=upperLimit) {
	225	#ifdef UCOL_DEBUG
	226	printf("error: no space between lower & upper limits\n");
	227	#endif
	228	return FALSE;
	229	}
	230
	231	/* check that neither is a prefix of the other */
	232	if(lowerLength<upperLength) {
	233	if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
	234	#ifdef UCOL_DEBUG
	235	printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
	236	#endif
	237	return FALSE;
	238	}
	239	}
	240	/* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
	241
	242	WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
	243	uprv_memset(lower, 0, sizeof(lower));
	244	uprv_memset(&middle, 0, sizeof(middle));
	245	uprv_memset(upper, 0, sizeof(upper));
	246
	247	/*
	248	* With the limit lengths of 1..4, there are up to 7 ranges for allocation:
	249	* range minimum length
	250	* lower[4] 4
	251	* lower[3] 3
	252	* lower[2] 2
	253	* middle 1
	254	* upper[2] 2
	255	* upper[3] 3
	256	* upper[4] 4
	257	*
	258	* We are now going to calculate up to 7 ranges.
	259	* Some of them will typically overlap, so we will then have to merge and eliminate ranges.
	260	*/
	261	uint32_t weight=lowerLimit;
	262	for(int32_t length=lowerLength; length>middleLength; --length) {
	263	uint32_t trail=getWeightTrail(weight, length);
	264	if(trail<maxBytes[length]) {
	265	lower[length].start=incWeightTrail(weight, length);
	266	lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
	267	lower[length].length=length;
	268	lower[length].count=maxBytes[length]-trail;
	269	}
	270	weight=truncateWeight(weight, length-1);
	271	}
	272	if(weight<0xff000000) {
	273	middle.start=incWeightTrail(weight, middleLength);
	274	} else {
	275	// Prevent overflow for primary lead byte FF
	276	// which would yield a middle range starting at 0.
	277	middle.start=0xffffffff; // no middle range
	278	}
	279
	280	weight=upperLimit;
	281	for(int32_t length=upperLength; length>middleLength; --length) {
	282	uint32_t trail=getWeightTrail(weight, length);
	283	if(trail>minBytes[length]) {
	284	upper[length].start=setWeightTrail(weight, length, minBytes[length]);
	285	upper[length].end=decWeightTrail(weight, length);
	286	upper[length].length=length;
	287	upper[length].count=trail-minBytes[length];
	288	}
	289	weight=truncateWeight(weight, length-1);
	290	}
	291	middle.end=decWeightTrail(weight, middleLength);
	292
	293	/* set the middle range */
	294	middle.length=middleLength;
	295	if(middle.end>=middle.start) {
	296	middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
	297	} else {
	298	/* no middle range, eliminate overlaps */
	299	for(int32_t length=4; length>middleLength; --length) {
	300	if(lower[length].count>0 && upper[length].count>0) {
	301	// Note: The lowerEnd and upperStart weights are versions of
	302	// lowerLimit and upperLimit (which are lowerLimit<upperLimit),
	303	// truncated (still less-or-equal)
	304	// and then with their last bytes changed to the
	305	// maxByte (for lowerEnd) or minByte (for upperStart).
	306	const uint32_t lowerEnd=lower[length].end;
	307	const uint32_t upperStart=upper[length].start;
	308	UBool merged=FALSE;
	309
	310	if(lowerEnd>upperStart) {
	311	// These two lower and upper ranges collide.
	312	// Since lowerLimit<upperLimit and lowerEnd and upperStart
	313	// are versions with only their last bytes modified
	314	// (and following ones removed/reset to 0),
	315	// lowerEnd>upperStart is only possible
	316	// if the leading bytes are equal
	317	// and lastByte(lowerEnd)>lastByte(upperStart).
	318	U_ASSERT(truncateWeight(lowerEnd, length-1)==
	319	truncateWeight(upperStart, length-1));
	320	// Intersect these two ranges.
	321	lower[length].end=upper[length].end;
	322	lower[length].count=
	323	(int32_t)getWeightTrail(lower[length].end, length)-
	324	(int32_t)getWeightTrail(lower[length].start, length)+1;
	325	// count might be <=0 in which case there is no room,
	326	// and the range-collecting code below will ignore this range.
	327	merged=TRUE;
	328	} else if(lowerEnd==upperStart) {
	329	// Not possible, unless minByte==maxByte which is not allowed.
	330	U_ASSERT(minBytes[length]<maxBytes[length]);
	331	} else /* lowerEnd<upperStart */ {
	332	if(incWeight(lowerEnd, length)==upperStart) {
	333	// Merge adjacent ranges.
	334	lower[length].end=upper[length].end;
	335	lower[length].count+=upper[length].count; // might be >countBytes
	336	merged=TRUE;
	337	}
	338	}
	339	if(merged) {
	340	// Remove all shorter ranges.
	341	// There was no room available for them between the ranges we just merged.
	342	upper[length].count=0;
	343	while(--length>middleLength) {
	344	lower[length].count=upper[length].count=0;
	345	}
	346	break;
	347	}
	348	}
	349	}
	350	}
	351
	352	#ifdef UCOL_DEBUG
	353	/* print ranges */
	354	for(int32_t length=4; length>=2; --length) {
	355	if(lower[length].count>0) {
	356	printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
	357	}
	358	}
	359	if(middle.count>0) {
	360	printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
	361	}
	362	for(int32_t length=2; length<=4; ++length) {
	363	if(upper[length].count>0) {
	364	printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
	365	}
	366	}
	367	#endif
	368
	369	/* copy the ranges, shortest first, into the result array */
	370	rangeCount=0;
	371	if(middle.count>0) {
	372	uprv_memcpy(ranges, &middle, sizeof(WeightRange));
	373	rangeCount=1;
	374	}
	375	for(int32_t length=middleLength+1; length<=4; ++length) {
	376	/* copy upper first so that later the middle range is more likely the first one to use */
	377	if(upper[length].count>0) {
	378	uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
	379	++rangeCount;
	380	}
	381	if(lower[length].count>0) {
	382	uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
	383	++rangeCount;
	384	}
	385	}
	386	return rangeCount>0;
	387	}
	388
	389	UBool
	390	CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
	391	// See if the first few minLength and minLength+1 ranges have enough weights.
	392	for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
	393	if(n <= ranges[i].count) {
	394	// Use the first few minLength and minLength+1 ranges.
	395	if(ranges[i].length > minLength) {
	396	// Reduce the number of weights from the last minLength+1 range
	397	// which might sort before some minLength ranges,
	398	// so that we use all weights in the minLength ranges.
	399	ranges[i].count = n;
	400	}
	401	rangeCount = i + 1;
	402	#ifdef UCOL_DEBUG
	403	printf("take first %ld ranges\n", rangeCount);
	404	#endif
	405
	406	if(rangeCount>1) {
	407	/* sort the ranges by weight values */
	408	UErrorCode errorCode=U_ZERO_ERROR;
	409	uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
	410	compareRanges, NULL, FALSE, &errorCode);
	411	/* ignore error code: we know that the internal sort function will not fail here */
	412	}
	413	return TRUE;
	414	}
	415	n -= ranges[i].count; // still >0
	416	}
	417	return FALSE;
	418	}
	419
	420	UBool
	421	CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
	422	// See if the minLength ranges have enough weights
	423	// when we split one and lengthen the following ones.
	424	int32_t count = 0;
	425	int32_t minLengthRangeCount;
	426	for(minLengthRangeCount = 0;
	427	minLengthRangeCount < rangeCount &&
	428	ranges[minLengthRangeCount].length == minLength;
	429	++minLengthRangeCount) {
	430	count += ranges[minLengthRangeCount].count;
	431	}
	432
	433	int32_t nextCountBytes = countBytes(minLength + 1);
	434	if(n > count * nextCountBytes) { return FALSE; }
	435
	436	// Use the minLength ranges. Merge them, and then split again as necessary.
	437	uint32_t start = ranges[0].start;
	438	uint32_t end = ranges[0].end;
	439	for(int32_t i = 1; i < minLengthRangeCount; ++i) {
	440	if(ranges[i].start < start) { start = ranges[i].start; }
	441	if(ranges[i].end > end) { end = ranges[i].end; }
	442	}
	443
	444	// Calculate how to split the range between minLength (count1) and minLength+1 (count2).
	445	// Goal:
	446	// count1 + count2 * nextCountBytes = n
	447	// count1 + count2 = count
	448	// These turn into
	449	// (count - count2) + count2 * nextCountBytes = n
	450	// and then into the following count1 & count2 computations.
	451	int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
	452	int32_t count1 = count - count2; // number of minLength weights
	453	if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {
	454	// round up
	455	++count2;
	456	--count1;
	457	U_ASSERT((count1 + count2 * nextCountBytes) >= n);
	458	}
	459
	460	ranges[0].start = start;
	461
	462	if(count1 == 0) {
	463	// Make one long range.
	464	ranges[0].end = end;
	465	ranges[0].count = count;
	466	lengthenRange(ranges[0]);
	467	rangeCount = 1;
	468	} else {
	469	// Split the range, lengthen the second part.
	470	#ifdef UCOL_DEBUG
	471	printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
	472	splitRange, rangeCount, count1, count2);
	473	#endif
	474
	475	// Next start = start + count1. First end = 1 before that.
	476	ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
	477	ranges[0].count = count1;
	478
	479	ranges[1].start = incWeight(ranges[0].end, minLength);
	480	ranges[1].end = end;
	481	ranges[1].length = minLength; // +1 when lengthened
	482	ranges[1].count = count2; // *countBytes when lengthened
	483	lengthenRange(ranges[1]);
	484	rangeCount = 2;
	485	}
	486	return TRUE;
	487	}
	488
	489	/*
	490	* call getWeightRanges and then determine heuristically
	491	* which ranges to use for a given number of weights between (excluding)
	492	* two limits
	493	*/
	494	UBool
	495	CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
	496	#ifdef UCOL_DEBUG
	497	puts("");
	498	#endif
	499
	500	if(!getWeightRanges(lowerLimit, upperLimit)) {
	501	#ifdef UCOL_DEBUG
	502	printf("error: unable to get Weight ranges\n");
	503	#endif
	504	return FALSE;
	505	}
	506
	507	/* try until we find suitably large ranges */
	508	for(;;) {
	509	/* get the smallest number of bytes in a range */
	510	int32_t minLength=ranges[0].length;
	511
	512	if(allocWeightsInShortRanges(n, minLength)) { break; }
	513
	514	if(minLength == 4) {
	515	#ifdef UCOL_DEBUG
	516	printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
	517	minLengthCount, n);
	518	#endif
	519	return FALSE;
	520	}
	521
	522	if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
	523
	524	/* no good match, lengthen all minLength ranges and iterate */
	525	#ifdef UCOL_DEBUG
	526	printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
	527	#endif
	528	for(int32_t i=0; ranges[i].length==minLength; ++i) {
	529	lengthenRange(ranges[i]);
	530	}
	531	}
	532
	533	#ifdef UCOL_DEBUG
	534	puts("final ranges:");
	535	for(int32_t i=0; i<rangeCount; ++i) {
	536	printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
	537	i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
	538	}
	539	#endif
	540
	541	rangeIndex = 0;
	542	return TRUE;
	543	}
	544
	545	uint32_t
	546	CollationWeights::nextWeight() {
	547	if(rangeIndex >= rangeCount) {
	548	return 0xffffffff;
	549	} else {
	550	/* get the next weight */
	551	WeightRange &range = ranges[rangeIndex];
	552	uint32_t weight = range.start;
	553	if(--range.count == 0) {
	554	/* this range is finished */
	555	++rangeIndex;
	556	} else {
	557	/* increment the weight for the next value */
	558	range.start = incWeight(weight, range.length);
	559	U_ASSERT(range.start <= range.end);
	560	}
	561
	562	return weight;
	563	}
	564	}
	565
	566	U_NAMESPACE_END
	567
	568	#endif /* #if !UCONFIG_NO_COLLATION */