git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/collationweights.cpp

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	*******************************************************************************
	5	*
	6	* Copyright (C) 1999-2015, International Business Machines
	7	* Corporation and others. All Rights Reserved.
	8	*
	9	*******************************************************************************
	10	* file name: collationweights.cpp
	11	* encoding: UTF-8
	12	* tab size: 8 (not used)
	13	* indentation:4
	14	*
	15	* created on: 2001mar08 as ucol_wgt.cpp
	16	* created by: Markus W. Scherer
	17	*
	18	* This file contains code for allocating n collation element weights
	19	* between two exclusive limits.
	20	* It is used only internally by the collation tailoring builder.
	21	*/
	22
	23	#include "unicode/utypes.h"
	24
	25	#if !UCONFIG_NO_COLLATION
	26
	27	#include "cmemory.h"
	28	#include "collation.h"
	29	#include "collationweights.h"
	30	#include "uarrsort.h"
	31	#include "uassert.h"
	32
	33	#ifdef UCOL_DEBUG
	34	# include <stdio.h>
	35	#endif
	36
	37	U_NAMESPACE_BEGIN
	38
	39	/* collation element weight allocation -------------------------------------- */
	40
	41	/* helper functions for CE weights */
	42
	43	static inline uint32_t
	44	getWeightTrail(uint32_t weight, int32_t length) {
	45	return (uint32_t)(weight>>(8*(4-length)))&0xff;
	46	}
	47
	48	static inline uint32_t
	49	setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
	50	length=8*(4-length);
	51	return (uint32_t)((weight&(0xffffff00<<length))\|(trail<<length));
	52	}
	53
	54	static inline uint32_t
	55	getWeightByte(uint32_t weight, int32_t idx) {
	56	return getWeightTrail(weight, idx); /* same calculation */
	57	}
	58
	59	static inline uint32_t
	60	setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
	61	uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
	62
	63	idx*=8;
	64	if(idx<32) {
	65	mask=((uint32_t)0xffffffff)>>idx;
	66	} else {
	67	// Do not use uint32_t>>32 because on some platforms that does not shift at all
	68	// while we need it to become 0.
	69	// PowerPC: 0xffffffff>>32 = 0 (wanted)
	70	// x86: 0xffffffff>>32 = 0xffffffff (not wanted)
	71	//
	72	// ANSI C99 6.5.7 Bitwise shift operators:
	73	// "If the value of the right operand is negative
	74	// or is greater than or equal to the width of the promoted left operand,
	75	// the behavior is undefined."
	76	mask=0;
	77	}
	78	idx=32-idx;
	79	mask\|=0xffffff00<<idx;
	80	return (uint32_t)((weight&mask)\|(byte<<idx));
	81	}
	82
	83	static inline uint32_t
	84	truncateWeight(uint32_t weight, int32_t length) {
	85	return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
	86	}
	87
	88	static inline uint32_t
	89	incWeightTrail(uint32_t weight, int32_t length) {
	90	return (uint32_t)(weight+(1UL<<(8*(4-length))));
	91	}
	92
	93	static inline uint32_t
	94	decWeightTrail(uint32_t weight, int32_t length) {
	95	return (uint32_t)(weight-(1UL<<(8*(4-length))));
	96	}
	97
	98	CollationWeights::CollationWeights()
	99	: middleLength(0), rangeIndex(0), rangeCount(0) {
	100	for(int32_t i = 0; i < 5; ++i) {
	101	minBytes[i] = maxBytes[i] = 0;
	102	}
	103	}
	104
	105	void
	106	CollationWeights::initForPrimary(UBool compressible) {
	107	middleLength=1;
	108	minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
	109	maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
	110	if(compressible) {
	111	minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
	112	maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
	113	} else {
	114	minBytes[2] = 2;
	115	maxBytes[2] = 0xff;
	116	}
	117	minBytes[3] = 2;
	118	maxBytes[3] = 0xff;
	119	minBytes[4] = 2;
	120	maxBytes[4] = 0xff;
	121	}
	122
	123	void
	124	CollationWeights::initForSecondary() {
	125	// We use only the lower 16 bits for secondary weights.
	126	middleLength=3;
	127	minBytes[1] = 0;
	128	maxBytes[1] = 0;
	129	minBytes[2] = 0;
	130	maxBytes[2] = 0;
	131	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
	132	maxBytes[3] = 0xff;
	133	minBytes[4] = 2;
	134	maxBytes[4] = 0xff;
	135	}
	136
	137	void
	138	CollationWeights::initForTertiary() {
	139	// We use only the lower 16 bits for tertiary weights.
	140	middleLength=3;
	141	minBytes[1] = 0;
	142	maxBytes[1] = 0;
	143	minBytes[2] = 0;
	144	maxBytes[2] = 0;
	145	// We use only 6 bits per byte.
	146	// The other bits are used for case & quaternary weights.
	147	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
	148	maxBytes[3] = 0x3f;
	149	minBytes[4] = 2;
	150	maxBytes[4] = 0x3f;
	151	}
	152
	153	uint32_t
	154	CollationWeights::incWeight(uint32_t weight, int32_t length) const {
	155	for(;;) {
	156	uint32_t byte=getWeightByte(weight, length);
	157	if(byte<maxBytes[length]) {
	158	return setWeightByte(weight, length, byte+1);
	159	} else {
	160	// Roll over, set this byte to the minimum and increment the previous one.
	161	weight=setWeightByte(weight, length, minBytes[length]);
	162	--length;
	163	U_ASSERT(length > 0);
	164	}
	165	}
	166	}
	167
	168	uint32_t
	169	CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
	170	for(;;) {
	171	offset += getWeightByte(weight, length);
	172	if((uint32_t)offset <= maxBytes[length]) {
	173	return setWeightByte(weight, length, offset);
	174	} else {
	175	// Split the offset between this byte and the previous one.
	176	offset -= minBytes[length];
	177	weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
	178	offset /= countBytes(length);
	179	--length;
	180	U_ASSERT(length > 0);
	181	}
	182	}
	183	}
	184
	185	void
	186	CollationWeights::lengthenRange(WeightRange &range) const {
	187	int32_t length=range.length+1;
	188	range.start=setWeightTrail(range.start, length, minBytes[length]);
	189	range.end=setWeightTrail(range.end, length, maxBytes[length]);
	190	range.count*=countBytes(length);
	191	range.length=length;
	192	}
	193
	194	/* for uprv_sortArray: sort ranges in weight order */
	195	static int32_t U_CALLCONV
	196	compareRanges(const void * /context/, const void left, const void right) {
	197	uint32_t l, r;
	198
	199	l=((const CollationWeights::WeightRange *)left)->start;
	200	r=((const CollationWeights::WeightRange *)right)->start;
	201	if(l<r) {
	202	return -1;
	203	} else if(l>r) {
	204	return 1;
	205	} else {
	206	return 0;
	207	}
	208	}
	209
	210	UBool
	211	CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
	212	U_ASSERT(lowerLimit != 0);
	213	U_ASSERT(upperLimit != 0);
	214
	215	/* get the lengths of the limits */
	216	int32_t lowerLength=lengthOfWeight(lowerLimit);
	217	int32_t upperLength=lengthOfWeight(upperLimit);
	218
	219	#ifdef UCOL_DEBUG
	220	printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
	221	printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
	222	#endif
	223	U_ASSERT(lowerLength>=middleLength);
	224	// Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
	225
	226	if(lowerLimit>=upperLimit) {
	227	#ifdef UCOL_DEBUG
	228	printf("error: no space between lower & upper limits\n");
	229	#endif
	230	return FALSE;
	231	}
	232
	233	/* check that neither is a prefix of the other */
	234	if(lowerLength<upperLength) {
	235	if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
	236	#ifdef UCOL_DEBUG
	237	printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
	238	#endif
	239	return FALSE;
	240	}
	241	}
	242	/* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
	243
	244	WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
	245	uprv_memset(lower, 0, sizeof(lower));
	246	uprv_memset(&middle, 0, sizeof(middle));
	247	uprv_memset(upper, 0, sizeof(upper));
	248
	249	/*
	250	* With the limit lengths of 1..4, there are up to 7 ranges for allocation:
	251	* range minimum length
	252	* lower[4] 4
	253	* lower[3] 3
	254	* lower[2] 2
	255	* middle 1
	256	* upper[2] 2
	257	* upper[3] 3
	258	* upper[4] 4
	259	*
	260	* We are now going to calculate up to 7 ranges.
	261	* Some of them will typically overlap, so we will then have to merge and eliminate ranges.
	262	*/
	263	uint32_t weight=lowerLimit;
	264	for(int32_t length=lowerLength; length>middleLength; --length) {
	265	uint32_t trail=getWeightTrail(weight, length);
	266	if(trail<maxBytes[length]) {
	267	lower[length].start=incWeightTrail(weight, length);
	268	lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
	269	lower[length].length=length;
	270	lower[length].count=maxBytes[length]-trail;
	271	}
	272	weight=truncateWeight(weight, length-1);
	273	}
	274	if(weight<0xff000000) {
	275	middle.start=incWeightTrail(weight, middleLength);
	276	} else {
	277	// Prevent overflow for primary lead byte FF
	278	// which would yield a middle range starting at 0.
	279	middle.start=0xffffffff; // no middle range
	280	}
	281
	282	weight=upperLimit;
	283	for(int32_t length=upperLength; length>middleLength; --length) {
	284	uint32_t trail=getWeightTrail(weight, length);
	285	if(trail>minBytes[length]) {
	286	upper[length].start=setWeightTrail(weight, length, minBytes[length]);
	287	upper[length].end=decWeightTrail(weight, length);
	288	upper[length].length=length;
	289	upper[length].count=trail-minBytes[length];
	290	}
	291	weight=truncateWeight(weight, length-1);
	292	}
	293	middle.end=decWeightTrail(weight, middleLength);
	294
	295	/* set the middle range */
	296	middle.length=middleLength;
	297	if(middle.end>=middle.start) {
	298	middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
	299	} else {
	300	/* no middle range, eliminate overlaps */
	301	for(int32_t length=4; length>middleLength; --length) {
	302	if(lower[length].count>0 && upper[length].count>0) {
	303	// Note: The lowerEnd and upperStart weights are versions of
	304	// lowerLimit and upperLimit (which are lowerLimit<upperLimit),
	305	// truncated (still less-or-equal)
	306	// and then with their last bytes changed to the
	307	// maxByte (for lowerEnd) or minByte (for upperStart).
	308	const uint32_t lowerEnd=lower[length].end;
	309	const uint32_t upperStart=upper[length].start;
	310	UBool merged=FALSE;
	311
	312	if(lowerEnd>upperStart) {
	313	// These two lower and upper ranges collide.
	314	// Since lowerLimit<upperLimit and lowerEnd and upperStart
	315	// are versions with only their last bytes modified
	316	// (and following ones removed/reset to 0),
	317	// lowerEnd>upperStart is only possible
	318	// if the leading bytes are equal
	319	// and lastByte(lowerEnd)>lastByte(upperStart).
	320	U_ASSERT(truncateWeight(lowerEnd, length-1)==
	321	truncateWeight(upperStart, length-1));
	322	// Intersect these two ranges.
	323	lower[length].end=upper[length].end;
	324	lower[length].count=
	325	(int32_t)getWeightTrail(lower[length].end, length)-
	326	(int32_t)getWeightTrail(lower[length].start, length)+1;
	327	// count might be <=0 in which case there is no room,
	328	// and the range-collecting code below will ignore this range.
	329	merged=TRUE;
	330	} else if(lowerEnd==upperStart) {
	331	// Not possible, unless minByte==maxByte which is not allowed.
	332	U_ASSERT(minBytes[length]<maxBytes[length]);
	333	} else /* lowerEnd<upperStart */ {
	334	if(incWeight(lowerEnd, length)==upperStart) {
	335	// Merge adjacent ranges.
	336	lower[length].end=upper[length].end;
	337	lower[length].count+=upper[length].count; // might be >countBytes
	338	merged=TRUE;
	339	}
	340	}
	341	if(merged) {
	342	// Remove all shorter ranges.
	343	// There was no room available for them between the ranges we just merged.
	344	upper[length].count=0;
	345	while(--length>middleLength) {
	346	lower[length].count=upper[length].count=0;
	347	}
	348	break;
	349	}
	350	}
	351	}
	352	}
	353
	354	#ifdef UCOL_DEBUG
	355	/* print ranges */
	356	for(int32_t length=4; length>=2; --length) {
	357	if(lower[length].count>0) {
	358	printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
	359	}
	360	}
	361	if(middle.count>0) {
	362	printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
	363	}
	364	for(int32_t length=2; length<=4; ++length) {
	365	if(upper[length].count>0) {
	366	printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
	367	}
	368	}
	369	#endif
	370
	371	/* copy the ranges, shortest first, into the result array */
	372	rangeCount=0;
	373	if(middle.count>0) {
	374	uprv_memcpy(ranges, &middle, sizeof(WeightRange));
	375	rangeCount=1;
	376	}
	377	for(int32_t length=middleLength+1; length<=4; ++length) {
	378	/* copy upper first so that later the middle range is more likely the first one to use */
	379	if(upper[length].count>0) {
	380	uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
	381	++rangeCount;
	382	}
	383	if(lower[length].count>0) {
	384	uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
	385	++rangeCount;
	386	}
	387	}
	388	return rangeCount>0;
	389	}
	390
	391	UBool
	392	CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
	393	// See if the first few minLength and minLength+1 ranges have enough weights.
	394	for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
	395	if(n <= ranges[i].count) {
	396	// Use the first few minLength and minLength+1 ranges.
	397	if(ranges[i].length > minLength) {
	398	// Reduce the number of weights from the last minLength+1 range
	399	// which might sort before some minLength ranges,
	400	// so that we use all weights in the minLength ranges.
	401	ranges[i].count = n;
	402	}
	403	rangeCount = i + 1;
	404	#ifdef UCOL_DEBUG
	405	printf("take first %ld ranges\n", rangeCount);
	406	#endif
	407
	408	if(rangeCount>1) {
	409	/* sort the ranges by weight values */
	410	UErrorCode errorCode=U_ZERO_ERROR;
	411	uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
	412	compareRanges, NULL, FALSE, &errorCode);
	413	/* ignore error code: we know that the internal sort function will not fail here */
	414	}
	415	return TRUE;
	416	}
	417	n -= ranges[i].count; // still >0
	418	}
	419	return FALSE;
	420	}
	421
	422	UBool
	423	CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
	424	// See if the minLength ranges have enough weights
	425	// when we split one and lengthen the following ones.
	426	int32_t count = 0;
	427	int32_t minLengthRangeCount;
	428	for(minLengthRangeCount = 0;
	429	minLengthRangeCount < rangeCount &&
	430	ranges[minLengthRangeCount].length == minLength;
	431	++minLengthRangeCount) {
	432	count += ranges[minLengthRangeCount].count;
	433	}
	434
	435	int32_t nextCountBytes = countBytes(minLength + 1);
	436	if(n > count * nextCountBytes) { return FALSE; }
	437
	438	// Use the minLength ranges. Merge them, and then split again as necessary.
	439	uint32_t start = ranges[0].start;
	440	uint32_t end = ranges[0].end;
	441	for(int32_t i = 1; i < minLengthRangeCount; ++i) {
	442	if(ranges[i].start < start) { start = ranges[i].start; }
	443	if(ranges[i].end > end) { end = ranges[i].end; }
	444	}
	445
	446	// Calculate how to split the range between minLength (count1) and minLength+1 (count2).
	447	// Goal:
	448	// count1 + count2 * nextCountBytes = n
	449	// count1 + count2 = count
	450	// These turn into
	451	// (count - count2) + count2 * nextCountBytes = n
	452	// and then into the following count1 & count2 computations.
	453	int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
	454	int32_t count1 = count - count2; // number of minLength weights
	455	if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {
	456	// round up
	457	++count2;
	458	--count1;
	459	U_ASSERT((count1 + count2 * nextCountBytes) >= n);
	460	}
	461
	462	ranges[0].start = start;
	463
	464	if(count1 == 0) {
	465	// Make one long range.
	466	ranges[0].end = end;
	467	ranges[0].count = count;
	468	lengthenRange(ranges[0]);
	469	rangeCount = 1;
	470	} else {
	471	// Split the range, lengthen the second part.
	472	#ifdef UCOL_DEBUG
	473	printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
	474	splitRange, rangeCount, count1, count2);
	475	#endif
	476
	477	// Next start = start + count1. First end = 1 before that.
	478	ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
	479	ranges[0].count = count1;
	480
	481	ranges[1].start = incWeight(ranges[0].end, minLength);
	482	ranges[1].end = end;
	483	ranges[1].length = minLength; // +1 when lengthened
	484	ranges[1].count = count2; // *countBytes when lengthened
	485	lengthenRange(ranges[1]);
	486	rangeCount = 2;
	487	}
	488	return TRUE;
	489	}
	490
	491	/*
	492	* call getWeightRanges and then determine heuristically
	493	* which ranges to use for a given number of weights between (excluding)
	494	* two limits
	495	*/
	496	UBool
	497	CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
	498	#ifdef UCOL_DEBUG
	499	puts("");
	500	#endif
	501
	502	if(!getWeightRanges(lowerLimit, upperLimit)) {
	503	#ifdef UCOL_DEBUG
	504	printf("error: unable to get Weight ranges\n");
	505	#endif
	506	return FALSE;
	507	}
	508
	509	/* try until we find suitably large ranges */
	510	for(;;) {
	511	/* get the smallest number of bytes in a range */
	512	int32_t minLength=ranges[0].length;
	513
	514	if(allocWeightsInShortRanges(n, minLength)) { break; }
	515
	516	if(minLength == 4) {
	517	#ifdef UCOL_DEBUG
	518	printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
	519	minLengthCount, n);
	520	#endif
	521	return FALSE;
	522	}
	523
	524	if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
	525
	526	/* no good match, lengthen all minLength ranges and iterate */
	527	#ifdef UCOL_DEBUG
	528	printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
	529	#endif
	530	for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
	531	lengthenRange(ranges[i]);
	532	}
	533	}
	534
	535	#ifdef UCOL_DEBUG
	536	puts("final ranges:");
	537	for(int32_t i=0; i<rangeCount; ++i) {
	538	printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
	539	i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
	540	}
	541	#endif
	542
	543	rangeIndex = 0;
	544	return TRUE;
	545	}
	546
	547	uint32_t
	548	CollationWeights::nextWeight() {
	549	if(rangeIndex >= rangeCount) {
	550	return 0xffffffff;
	551	} else {
	552	/* get the next weight */
	553	WeightRange &range = ranges[rangeIndex];
	554	uint32_t weight = range.start;
	555	if(--range.count == 0) {
	556	/* this range is finished */
	557	++rangeIndex;
	558	} else {
	559	/* increment the weight for the next value */
	560	range.start = incWeight(weight, range.length);
	561	U_ASSERT(range.start <= range.end);
	562	}
	563
	564	return weight;
	565	}
	566	}
	567
	568	U_NAMESPACE_END
	569
	570	#endif /* #if !UCONFIG_NO_COLLATION */