[apple/icu.git] / icuSources / i18n / sortkey.cpp

/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
//===============================================================================
//
// File sortkey.cpp
//
//
//
// Created by: Helena Shih
//
// Modification History:
//
//  Date         Name          Description
//
//  6/20/97      helena        Java class name change.
//  6/23/97      helena        Added comments to make code more readable.
//  6/26/98      erm           Canged to use byte arrays instead of UnicodeString
//  7/31/98      erm           hashCode: minimum inc should be 2 not 1,
//                             Cleaned up operator=
// 07/12/99      helena        HPUX 11 CC port.
// 03/06/01      synwee        Modified compareTo, to handle the result of
//                             2 string similar in contents, but one is longer
//                             than the other
//===============================================================================

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/sortkey.h"
#include "cmemory.h"
#include "uelement.h"
#include "ustr_imp.h"

U_NAMESPACE_BEGIN

// A hash code of kInvalidHashCode indicates that the has code needs
// to be computed. A hash code of kEmptyHashCode is used for empty keys
// and for any key whose computed hash code is kInvalidHashCode.
#define kInvalidHashCode ((int32_t)0)
#define kEmptyHashCode ((int32_t)1)

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)

CollationKey::CollationKey()
    : UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
      fHashCode(kEmptyHashCode), fBytes(NULL)
{
}

// Create a collation key from a bit array.
CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
    : UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
      fHashCode(kInvalidHashCode)
{
    fBytes = (uint8_t *)uprv_malloc(count);

    if (fBytes == NULL)
    {
        setToBogus();
        return;
    }

    uprv_memcpy(fBytes, newValues, fCount);
}

CollationKey::CollationKey(const CollationKey& other)
: UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
  fHashCode(other.fHashCode), fBytes(NULL)
{
    if (other.fBogus)
    {
        setToBogus();
        return;
    }

    fBytes = (uint8_t *)uprv_malloc(fCapacity);

    if (fBytes == NULL)
    {
        setToBogus();
        return;
    }

    uprv_memcpy(fBytes, other.fBytes, other.fCount);
    if(fCapacity>fCount) {
        uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
    }
}

CollationKey::~CollationKey()
{
        uprv_free(fBytes);
}

void CollationKey::adopt(uint8_t *values, int32_t capacity, int32_t count) {
    if(fBytes != NULL) {
        uprv_free(fBytes);
    }
    fBytes = values;
    fCapacity = capacity;
    setLength(count);
}

void CollationKey::setLength(int32_t newLength) {
    fBogus = FALSE;
    fCount = newLength;
    fHashCode = kInvalidHashCode;
}

// set the key to an empty state
CollationKey&
CollationKey::reset()
{
    fCount = 0;
    fBogus = FALSE;
    fHashCode = kEmptyHashCode;

    return *this;
}

// set the key to a "bogus" or invalid state
CollationKey&
CollationKey::setToBogus()
{
    uprv_free(fBytes);
    fBytes = NULL;

    fCapacity = 0;
    fCount = 0;
    fHashCode = kInvalidHashCode;

    return *this;
}

UBool
CollationKey::operator==(const CollationKey& source) const
{
    return (this->fCount == source.fCount &&
            (this->fBytes == source.fBytes ||
             uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
}

const CollationKey&
CollationKey::operator=(const CollationKey& other)
{
    if (this != &other)
    {
        if (other.isBogus())
        {
            return setToBogus();
        }

        if (other.fBytes != NULL)
        {
            ensureCapacity(other.fCount);

            if (isBogus())
            {
                return *this;
            }

            fHashCode = other.fHashCode;
            uprv_memcpy(fBytes, other.fBytes, fCount);
        }
        else
        {
            fCount = 0;
            fBogus = FALSE;
            fHashCode = kEmptyHashCode;
        }
    }

    return *this;
}

// Bitwise comparison for the collation keys.
// NOTE: this is somewhat messy 'cause we can't count
// on memcmp returning the exact values which match
// Collator::EComparisonResult
Collator::EComparisonResult
CollationKey::compareTo(const CollationKey& target) const
{
    uint8_t *src = this->fBytes;
    uint8_t *tgt = target.fBytes;

    // are we comparing the same string
    if (src == tgt)
        return  Collator::EQUAL;

        /*
        int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
        if (count == 0)
        {
        // If count is 0, at least one of the keys is empty.
        // An empty key is always LESS than a non-empty one
        // and EQUAL to another empty
        if (this->fCount < target.fCount)
        {
        return Collator::LESS;
        }

          if (this->fCount > target.fCount)
          {
          return Collator::GREATER;
          }
          return Collator::EQUAL;
          }
    */

    int                         minLength;
    Collator::EComparisonResult result;

    // are we comparing different lengths?
    if (this->fCount != target.fCount) {
        if (this->fCount < target.fCount) {
            minLength = this->fCount;
            result    =  Collator::LESS;
        }
        else {
            minLength = target.fCount;
            result    =  Collator::GREATER;
        }
    }
    else {
        minLength = target.fCount;
        result    =  Collator::EQUAL;
    }

    if (minLength > 0) {
        int diff = uprv_memcmp(src, tgt, minLength);
        if (diff > 0) {
            return Collator::GREATER;
        }
        else
            if (diff < 0) {
                return Collator::LESS;
            }
    }

    return result;
    /*
    if (result < 0)
    {
    return Collator::LESS;
    }

      if (result > 0)
      {
      return Collator::GREATER;
      }
      return Collator::EQUAL;
    */
}

// Bitwise comparison for the collation keys.
UCollationResult
CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
{
  if(U_SUCCESS(status)) {
    uint8_t *src = this->fBytes;
    uint8_t *tgt = target.fBytes;

    // are we comparing the same string
    if (src == tgt)
        return  UCOL_EQUAL;

    int                         minLength;
    UCollationResult result;

    // are we comparing different lengths?
    if (this->fCount != target.fCount) {
        if (this->fCount < target.fCount) {
            minLength = this->fCount;
            result    =  UCOL_LESS;
        }
        else {
            minLength = target.fCount;
            result    =  UCOL_GREATER;
        }
    }
    else {
        minLength = target.fCount;
        result    =  UCOL_EQUAL;
    }

    if (minLength > 0) {
        int diff = uprv_memcmp(src, tgt, minLength);
        if (diff > 0) {
            return UCOL_GREATER;
        }
        else
            if (diff < 0) {
                return UCOL_LESS;
            }
    }

    return result;
  } else {
    return UCOL_EQUAL;
  }
}

CollationKey&
CollationKey::ensureCapacity(int32_t newSize)
{
    if (fCapacity < newSize)
    {
        uprv_free(fBytes);

        fBytes = (uint8_t *)uprv_malloc(newSize);

        if (fBytes == NULL)
        {
            return setToBogus();
        }

        uprv_memset(fBytes, 0, fCapacity);
        fCapacity = newSize;
    }

    fBogus = FALSE;
    fCount = newSize;
    fHashCode = kInvalidHashCode;

    return *this;
}

#ifdef U_USE_COLLATION_KEY_DEPRECATES
// Create a copy of the byte array.
uint8_t*
CollationKey::toByteArray(int32_t& count) const
{
    uint8_t *result = (uint8_t*) uprv_malloc( sizeof(uint8_t) * fCount );

    if (result == NULL)
    {
        count = 0;
    }
    else
    {
        count = fCount;
        uprv_memcpy(result, fBytes, fCount);
    }

    return result;
}
#endif

int32_t
CollationKey::hashCode() const
{
    // (Cribbed from UnicodeString)
    // We cache the hashCode; when it becomes invalid, due to any change to the
    // string, we note this by setting it to kInvalidHashCode. [LIU]

    // Note: This method is semantically const, but physically non-const.

    if (fHashCode == kInvalidHashCode)
    {
        const char *s = reinterpret_cast<const char *>(fBytes);
        ((CollationKey *)this)->fHashCode = s == NULL ? 0 : ustr_hashCharsN(s, fCount);
#if 0
        // We compute the hash by iterating sparsely over 64 (at most) characters
        // spaced evenly through the string.  For each character, we multiply the
        // previous hash value by a prime number and add the new character in,
        // in the manner of a additive linear congruential random number generator,
        // thus producing a pseudorandom deterministic value which should be well
        // distributed over the output range. [LIU]
        const uint8_t   *p = fBytes, *limit = fBytes + fCount;
        int32_t         inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
        int32_t         hash = 0;

        while (p < limit)
        {
            hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
            p += inc;
        }

        // If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
        if (hash == kInvalidHashCode)
        {
            hash = kEmptyHashCode;
        }

        ((CollationKey *)this)->fHashCode = hash; // cast away const
#endif
    }

    return fHashCode;
}

U_NAMESPACE_END

U_CAPI int32_t U_EXPORT2
ucol_keyHashCode(const uint8_t *key, 
                       int32_t  length)
{
    icu::CollationKey newKey(key, length);
    return newKey.hashCode();
}

#endif /* #if !UCONFIG_NO_COLLATION */
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	*******************************************************************************
4388f060	3	* Copyright (C) 1996-2011, International Business Machines Corporation and *
b75a7d8f A	4	* others. All Rights Reserved. *
	5	*******************************************************************************
	6	*/
	7	//===============================================================================
	8	//
	9	// File sortkey.cpp
	10	//
	11	//
	12	//
	13	// Created by: Helena Shih
	14	//
	15	// Modification History:
	16	//
	17	// Date Name Description
	18	//
	19	// 6/20/97 helena Java class name change.
	20	// 6/23/97 helena Added comments to make code more readable.
	21	// 6/26/98 erm Canged to use byte arrays instead of UnicodeString
	22	// 7/31/98 erm hashCode: minimum inc should be 2 not 1,
	23	// Cleaned up operator=
	24	// 07/12/99 helena HPUX 11 CC port.
	25	// 03/06/01 synwee Modified compareTo, to handle the result of
	26	// 2 string similar in contents, but one is longer
	27	// than the other
	28	//===============================================================================
	29
	30	#include "unicode/utypes.h"
	31
	32	#if !UCONFIG_NO_COLLATION
	33
	34	#include "unicode/sortkey.h"
	35	#include "cmemory.h"
4388f060 A	36	#include "uelement.h"
4388f060 A	37	#include "ustr_imp.h"
b75a7d8f A	38
	39	U_NAMESPACE_BEGIN
	40
	41	// A hash code of kInvalidHashCode indicates that the has code needs
	42	// to be computed. A hash code of kEmptyHashCode is used for empty keys
	43	// and for any key whose computed hash code is kInvalidHashCode.
	44	#define kInvalidHashCode ((int32_t)0)
	45	#define kEmptyHashCode ((int32_t)1)
	46
374ca955	47	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)
b75a7d8f A	48
	49	CollationKey::CollationKey()
	50	: UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
	51	fHashCode(kEmptyHashCode), fBytes(NULL)
	52	{
	53	}
	54
	55	// Create a collation key from a bit array.
	56	CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
	57	: UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
	58	fHashCode(kInvalidHashCode)
	59	{
	60	fBytes = (uint8_t *)uprv_malloc(count);
	61
	62	if (fBytes == NULL)
	63	{
	64	setToBogus();
	65	return;
	66	}
	67
	68	uprv_memcpy(fBytes, newValues, fCount);
	69	}
	70
	71	CollationKey::CollationKey(const CollationKey& other)
	72	: UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
	73	fHashCode(other.fHashCode), fBytes(NULL)
	74	{
	75	if (other.fBogus)
	76	{
	77	setToBogus();
	78	return;
	79	}
	80
	81	fBytes = (uint8_t *)uprv_malloc(fCapacity);
	82
	83	if (fBytes == NULL)
	84	{
	85	setToBogus();
	86	return;
	87	}
	88
	89	uprv_memcpy(fBytes, other.fBytes, other.fCount);
	90	if(fCapacity>fCount) {
	91	uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
	92	}
	93	}
	94
	95	CollationKey::~CollationKey()
	96	{
	97	uprv_free(fBytes);
	98	}
	99
4388f060	100	void CollationKey::adopt(uint8_t *values, int32_t capacity, int32_t count) {
b75a7d8f A	101	if(fBytes != NULL) {
	102	uprv_free(fBytes);
	103	}
b75a7d8f	104	fBytes = values;
4388f060 A	105	fCapacity = capacity;
	106	setLength(count);
	107	}
	108
	109	void CollationKey::setLength(int32_t newLength) {
	110	fBogus = FALSE;
	111	fCount = newLength;
b75a7d8f A	112	fHashCode = kInvalidHashCode;
	113	}
	114
	115	// set the key to an empty state
	116	CollationKey&
	117	CollationKey::reset()
	118	{
	119	fCount = 0;
	120	fBogus = FALSE;
	121	fHashCode = kEmptyHashCode;
	122
	123	return *this;
	124	}
	125
	126	// set the key to a "bogus" or invalid state
	127	CollationKey&
	128	CollationKey::setToBogus()
	129	{
	130	uprv_free(fBytes);
	131	fBytes = NULL;
	132
	133	fCapacity = 0;
	134	fCount = 0;
	135	fHashCode = kInvalidHashCode;
	136
	137	return *this;
	138	}
	139
	140	UBool
	141	CollationKey::operator==(const CollationKey& source) const
	142	{
	143	return (this->fCount == source.fCount &&
	144	(this->fBytes == source.fBytes \|\|
	145	uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
	146	}
	147
	148	const CollationKey&
	149	CollationKey::operator=(const CollationKey& other)
	150	{
	151	if (this != &other)
	152	{
	153	if (other.isBogus())
	154	{
	155	return setToBogus();
	156	}
	157
	158	if (other.fBytes != NULL)
	159	{
	160	ensureCapacity(other.fCount);
	161
	162	if (isBogus())
	163	{
	164	return *this;
	165	}
	166
	167	fHashCode = other.fHashCode;
	168	uprv_memcpy(fBytes, other.fBytes, fCount);
	169	}
	170	else
	171	{
	172	fCount = 0;
	173	fBogus = FALSE;
	174	fHashCode = kEmptyHashCode;
	175	}
176	}
177
178	return *this;
179	}
180
181	// Bitwise comparison for the collation keys.
182	// NOTE: this is somewhat messy 'cause we can't count
183	// on memcmp returning the exact values which match
184	// Collator::EComparisonResult
185	Collator::EComparisonResult
186	CollationKey::compareTo(const CollationKey& target) const
187	{
188	uint8_t *src = this->fBytes;
189	uint8_t *tgt = target.fBytes;
190
191	// are we comparing the same string
192	if (src == tgt)
193	return Collator::EQUAL;
194
195	/*
196	int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
197	if (count == 0)
198	{
199	// If count is 0, at least one of the keys is empty.
200	// An empty key is always LESS than a non-empty one
201	// and EQUAL to another empty
202	if (this->fCount < target.fCount)
203	{
204	return Collator::LESS;
205	}
206
207	if (this->fCount > target.fCount)
208	{
209	return Collator::GREATER;
210	}
211	return Collator::EQUAL;
212	}
213	*/
214
215	int minLength;
216	Collator::EComparisonResult result;
217
218	// are we comparing different lengths?
219	if (this->fCount != target.fCount) {
220	if (this->fCount < target.fCount) {
221	minLength = this->fCount;
222	result = Collator::LESS;
223	}
224	else {
225	minLength = target.fCount;
226	result = Collator::GREATER;
227	}
228	}
229	else {
230	minLength = target.fCount;
231	result = Collator::EQUAL;
232	}
233
234	if (minLength > 0) {
235	int diff = uprv_memcmp(src, tgt, minLength);
236	if (diff > 0) {
237	return Collator::GREATER;
238	}
239	else
240	if (diff < 0) {
241	return Collator::LESS;
242	}
243	}
244
245	return result;
246	/*
247	if (result < 0)
248	{
249	return Collator::LESS;
250	}
251
252	if (result > 0)
253	{
254	return Collator::GREATER;
255	}
256	return Collator::EQUAL;
257	*/
258	}
259
260	// Bitwise comparison for the collation keys.
261	UCollationResult
262	CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
263	{
264	if(U_SUCCESS(status)) {
265	uint8_t *src = this->fBytes;
266	uint8_t *tgt = target.fBytes;
267
268	// are we comparing the same string
269	if (src == tgt)
270	return UCOL_EQUAL;
271
272	int minLength;
273	UCollationResult result;
274
275	// are we comparing different lengths?
276	if (this->fCount != target.fCount) {
277	if (this->fCount < target.fCount) {
278	minLength = this->fCount;
279	result = UCOL_LESS;
280	}
281	else {
282	minLength = target.fCount;
283	result = UCOL_GREATER;
284	}
285	}
286	else {
287	minLength = target.fCount;
288	result = UCOL_EQUAL;
289	}
290
291	if (minLength > 0) {
292	int diff = uprv_memcmp(src, tgt, minLength);
293	if (diff > 0) {
294	return UCOL_GREATER;
295	}
296	else
297	if (diff < 0) {
298	return UCOL_LESS;
299	}
300	}
301
302	return result;
303	} else {
304	return UCOL_EQUAL;
305	}
306	}
307
308	CollationKey&
309	CollationKey::ensureCapacity(int32_t newSize)
310	{
311	if (fCapacity < newSize)
312	{
313	uprv_free(fBytes);
314
315	fBytes = (uint8_t *)uprv_malloc(newSize);
316
317	if (fBytes == NULL)
318	{
319	return setToBogus();
320	}
321
322	uprv_memset(fBytes, 0, fCapacity);
323	fCapacity = newSize;
324	}
325
326	fBogus = FALSE;
327	fCount = newSize;
328	fHashCode = kInvalidHashCode;
329
330	return *this;
331	}
332
333	#ifdef U_USE_COLLATION_KEY_DEPRECATES
334	// Create a copy of the byte array.
335	uint8_t*
336	CollationKey::toByteArray(int32_t& count) const
337	{
338	uint8_t result = (uint8_t) uprv_malloc( sizeof(uint8_t) * fCount );
339
340	if (result == NULL)
341	{
342	count = 0;
343	}
344	else
345	{
346	count = fCount;
347	uprv_memcpy(result, fBytes, fCount);
348	}
349
350	return result;
351	}
352	#endif
353
354	int32_t
355	CollationKey::hashCode() const
356	{
357	// (Cribbed from UnicodeString)
358	// We cache the hashCode; when it becomes invalid, due to any change to the
359	// string, we note this by setting it to kInvalidHashCode. [LIU]
360
361	// Note: This method is semantically const, but physically non-const.
362
363	if (fHashCode == kInvalidHashCode)
364	{
4388f060 A	365	const char s = reinterpret_cast<const char >(fBytes);
4388f060 A	366	((CollationKey *)this)->fHashCode = s == NULL ? 0 : ustr_hashCharsN(s, fCount);
b75a7d8f A	367	#if 0
	368	// We compute the hash by iterating sparsely over 64 (at most) characters
	369	// spaced evenly through the string. For each character, we multiply the
	370	// previous hash value by a prime number and add the new character in,
	371	// in the manner of a additive linear congruential random number generator,
	372	// thus producing a pseudorandom deterministic value which should be well
	373	// distributed over the output range. [LIU]
	374	const uint8_t p = fBytes, limit = fBytes + fCount;
	375	int32_t inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
	376	int32_t hash = 0;
	377
	378	while (p < limit)
	379	{
	380	hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
	381	p += inc;
	382	}
	383
	384	// If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
	385	if (hash == kInvalidHashCode)
	386	{
	387	hash = kEmptyHashCode;
	388	}
	389
	390	((CollationKey *)this)->fHashCode = hash; // cast away const
	391	#endif
	392	}
	393
	394	return fHashCode;
	395	}
	396
	397	U_NAMESPACE_END
	398
73c04bcf A	399	U_CAPI int32_t U_EXPORT2
	400	ucol_keyHashCode(const uint8_t *key,
	401	int32_t length)
	402	{
4388f060	403	icu::CollationKey newKey(key, length);
73c04bcf A	404	return newKey.hashCode();
	405	}
	406
b75a7d8f	407	#endif /* #if !UCONFIG_NO_COLLATION */