[apple/icu.git] / icuSources / i18n / sortkey.cpp

/*
*******************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/
//===============================================================================
//
// File sortkey.cpp
//
//
//
// Created by: Helena Shih
//
// Modification History:
//
//  Date         Name          Description
//
//  6/20/97      helena        Java class name change.
//  6/23/97      helena        Added comments to make code more readable.
//  6/26/98      erm           Canged to use byte arrays instead of UnicodeString
//  7/31/98      erm           hashCode: minimum inc should be 2 not 1,
//                             Cleaned up operator=
// 07/12/99      helena        HPUX 11 CC port.
// 03/06/01      synwee        Modified compareTo, to handle the result of
//                             2 string similar in contents, but one is longer
//                             than the other
//===============================================================================

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/sortkey.h"
#include "cmemory.h"
#include "uhash.h"

U_NAMESPACE_BEGIN

// A hash code of kInvalidHashCode indicates that the has code needs
// to be computed. A hash code of kEmptyHashCode is used for empty keys
// and for any key whose computed hash code is kInvalidHashCode.
#define kInvalidHashCode ((int32_t)0)
#define kEmptyHashCode ((int32_t)1)

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)

CollationKey::CollationKey()
    : UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
      fHashCode(kEmptyHashCode), fBytes(NULL)
{
}

// Create a collation key from a bit array.
CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
    : UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
      fHashCode(kInvalidHashCode)
{
    fBytes = (uint8_t *)uprv_malloc(count);

    if (fBytes == NULL)
    {
        setToBogus();
        return;
    }

    uprv_memcpy(fBytes, newValues, fCount);
}

CollationKey::CollationKey(const CollationKey& other)
: UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
  fHashCode(other.fHashCode), fBytes(NULL)
{
    if (other.fBogus)
    {
        setToBogus();
        return;
    }

    fBytes = (uint8_t *)uprv_malloc(fCapacity);

    if (fBytes == NULL)
    {
        setToBogus();
        return;
    }

    uprv_memcpy(fBytes, other.fBytes, other.fCount);
    if(fCapacity>fCount) {
        uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
    }
}

CollationKey::~CollationKey()
{
        uprv_free(fBytes);
}

void CollationKey::adopt(uint8_t *values, int32_t count) {
    if(fBytes != NULL) {
        uprv_free(fBytes);
    }
    fBogus = FALSE;
    fBytes = values;
    fCount = count;
    fCapacity = count;
    fHashCode = kInvalidHashCode;
}

// set the key to an empty state
CollationKey&
CollationKey::reset()
{
    fCount = 0;
    fBogus = FALSE;
    fHashCode = kEmptyHashCode;

    return *this;
}

// set the key to a "bogus" or invalid state
CollationKey&
CollationKey::setToBogus()
{
    uprv_free(fBytes);
    fBytes = NULL;

    fCapacity = 0;
    fCount = 0;
    fHashCode = kInvalidHashCode;

    return *this;
}

UBool
CollationKey::operator==(const CollationKey& source) const
{
    return (this->fCount == source.fCount &&
            (this->fBytes == source.fBytes ||
             uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
}

const CollationKey&
CollationKey::operator=(const CollationKey& other)
{
    if (this != &other)
    {
        if (other.isBogus())
        {
            return setToBogus();
        }

        if (other.fBytes != NULL)
        {
            ensureCapacity(other.fCount);

            if (isBogus())
            {
                return *this;
            }

            fHashCode = other.fHashCode;
            uprv_memcpy(fBytes, other.fBytes, fCount);
        }
        else
        {
            fCount = 0;
            fBogus = FALSE;
            fHashCode = kEmptyHashCode;
        }
    }

    return *this;
}

// Bitwise comparison for the collation keys.
// NOTE: this is somewhat messy 'cause we can't count
// on memcmp returning the exact values which match
// Collator::EComparisonResult
Collator::EComparisonResult
CollationKey::compareTo(const CollationKey& target) const
{
    uint8_t *src = this->fBytes;
    uint8_t *tgt = target.fBytes;

    // are we comparing the same string
    if (src == tgt)
        return  Collator::EQUAL;

        /*
        int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
        if (count == 0)
        {
        // If count is 0, at least one of the keys is empty.
        // An empty key is always LESS than a non-empty one
        // and EQUAL to another empty
        if (this->fCount < target.fCount)
        {
        return Collator::LESS;
        }

          if (this->fCount > target.fCount)
          {
          return Collator::GREATER;
          }
          return Collator::EQUAL;
          }
    */

    int                         minLength;
    Collator::EComparisonResult result;

    // are we comparing different lengths?
    if (this->fCount != target.fCount) {
        if (this->fCount < target.fCount) {
            minLength = this->fCount;
            result    =  Collator::LESS;
        }
        else {
            minLength = target.fCount;
            result    =  Collator::GREATER;
        }
    }
    else {
        minLength = target.fCount;
        result    =  Collator::EQUAL;
    }

    if (minLength > 0) {
        int diff = uprv_memcmp(src, tgt, minLength);
        if (diff > 0) {
            return Collator::GREATER;
        }
        else
            if (diff < 0) {
                return Collator::LESS;
            }
    }

    return result;
    /*
    if (result < 0)
    {
    return Collator::LESS;
    }

      if (result > 0)
      {
      return Collator::GREATER;
      }
      return Collator::EQUAL;
    */
}

// Bitwise comparison for the collation keys.
UCollationResult
CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
{
  if(U_SUCCESS(status)) {
    uint8_t *src = this->fBytes;
    uint8_t *tgt = target.fBytes;

    // are we comparing the same string
    if (src == tgt)
        return  UCOL_EQUAL;

    int                         minLength;
    UCollationResult result;

    // are we comparing different lengths?
    if (this->fCount != target.fCount) {
        if (this->fCount < target.fCount) {
            minLength = this->fCount;
            result    =  UCOL_LESS;
        }
        else {
            minLength = target.fCount;
            result    =  UCOL_GREATER;
        }
    }
    else {
        minLength = target.fCount;
        result    =  UCOL_EQUAL;
    }

    if (minLength > 0) {
        int diff = uprv_memcmp(src, tgt, minLength);
        if (diff > 0) {
            return UCOL_GREATER;
        }
        else
            if (diff < 0) {
                return UCOL_LESS;
            }
    }

    return result;
  } else {
    return UCOL_EQUAL;
  }
}

CollationKey&
CollationKey::ensureCapacity(int32_t newSize)
{
    if (fCapacity < newSize)
    {
        uprv_free(fBytes);

        fBytes = (uint8_t *)uprv_malloc(newSize);

        if (fBytes == NULL)
        {
            return setToBogus();
        }

        uprv_memset(fBytes, 0, fCapacity);
        fCapacity = newSize;
    }

    fBogus = FALSE;
    fCount = newSize;
    fHashCode = kInvalidHashCode;

    return *this;
}

#ifdef U_USE_COLLATION_KEY_DEPRECATES
// Create a copy of the byte array.
uint8_t*
CollationKey::toByteArray(int32_t& count) const
{
    uint8_t *result = (uint8_t*) uprv_malloc( sizeof(uint8_t) * fCount );

    if (result == NULL)
    {
        count = 0;
    }
    else
    {
        count = fCount;
        uprv_memcpy(result, fBytes, fCount);
    }

    return result;
}
#endif

int32_t
CollationKey::hashCode() const
{
    // (Cribbed from UnicodeString)
    // We cache the hashCode; when it becomes invalid, due to any change to the
    // string, we note this by setting it to kInvalidHashCode. [LIU]

    // Note: This method is semantically const, but physically non-const.

    if (fHashCode == kInvalidHashCode)
    {
        UHashTok key;
        key.pointer = fBytes;
        ((CollationKey *)this)->fHashCode = uhash_hashChars(key);
#if 0
        // We compute the hash by iterating sparsely over 64 (at most) characters
        // spaced evenly through the string.  For each character, we multiply the
        // previous hash value by a prime number and add the new character in,
        // in the manner of a additive linear congruential random number generator,
        // thus producing a pseudorandom deterministic value which should be well
        // distributed over the output range. [LIU]
        const uint8_t   *p = fBytes, *limit = fBytes + fCount;
        int32_t         inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
        int32_t         hash = 0;

        while (p < limit)
        {
            hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
            p += inc;
        }

        // If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
        if (hash == kInvalidHashCode)
        {
            hash = kEmptyHashCode;
        }

        ((CollationKey *)this)->fHashCode = hash; // cast away const
#endif
    }

    return fHashCode;
}

U_NAMESPACE_END

U_CAPI int32_t U_EXPORT2
ucol_keyHashCode(const uint8_t *key, 
                       int32_t  length)
{
    U_NAMESPACE_QUALIFIER CollationKey newKey(key, length);
    return newKey.hashCode();
}

#endif /* #if !UCONFIG_NO_COLLATION */
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	*******************************************************************************
73c04bcf	3	* Copyright (C) 1996-2006, International Business Machines Corporation and *
b75a7d8f A	4	* others. All Rights Reserved. *
	5	*******************************************************************************
	6	*/
	7	//===============================================================================
	8	//
	9	// File sortkey.cpp
	10	//
	11	//
	12	//
	13	// Created by: Helena Shih
	14	//
	15	// Modification History:
	16	//
	17	// Date Name Description
	18	//
	19	// 6/20/97 helena Java class name change.
	20	// 6/23/97 helena Added comments to make code more readable.
	21	// 6/26/98 erm Canged to use byte arrays instead of UnicodeString
	22	// 7/31/98 erm hashCode: minimum inc should be 2 not 1,
	23	// Cleaned up operator=
	24	// 07/12/99 helena HPUX 11 CC port.
	25	// 03/06/01 synwee Modified compareTo, to handle the result of
	26	// 2 string similar in contents, but one is longer
	27	// than the other
	28	//===============================================================================
	29
	30	#include "unicode/utypes.h"
	31
	32	#if !UCONFIG_NO_COLLATION
	33
	34	#include "unicode/sortkey.h"
	35	#include "cmemory.h"
	36	#include "uhash.h"
	37
	38	U_NAMESPACE_BEGIN
	39
	40	// A hash code of kInvalidHashCode indicates that the has code needs
	41	// to be computed. A hash code of kEmptyHashCode is used for empty keys
	42	// and for any key whose computed hash code is kInvalidHashCode.
	43	#define kInvalidHashCode ((int32_t)0)
	44	#define kEmptyHashCode ((int32_t)1)
	45
374ca955	46	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)
b75a7d8f A	47
	48	CollationKey::CollationKey()
	49	: UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
	50	fHashCode(kEmptyHashCode), fBytes(NULL)
	51	{
	52	}
	53
	54	// Create a collation key from a bit array.
	55	CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
	56	: UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
	57	fHashCode(kInvalidHashCode)
	58	{
	59	fBytes = (uint8_t *)uprv_malloc(count);
	60
	61	if (fBytes == NULL)
	62	{
	63	setToBogus();
	64	return;
	65	}
	66
	67	uprv_memcpy(fBytes, newValues, fCount);
	68	}
	69
	70	CollationKey::CollationKey(const CollationKey& other)
	71	: UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
	72	fHashCode(other.fHashCode), fBytes(NULL)
	73	{
	74	if (other.fBogus)
	75	{
	76	setToBogus();
	77	return;
	78	}
	79
	80	fBytes = (uint8_t *)uprv_malloc(fCapacity);
	81
	82	if (fBytes == NULL)
	83	{
	84	setToBogus();
	85	return;
	86	}
	87
	88	uprv_memcpy(fBytes, other.fBytes, other.fCount);
	89	if(fCapacity>fCount) {
	90	uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
	91	}
	92	}
	93
	94	CollationKey::~CollationKey()
	95	{
	96	uprv_free(fBytes);
	97	}
	98
	99	void CollationKey::adopt(uint8_t *values, int32_t count) {
	100	if(fBytes != NULL) {
	101	uprv_free(fBytes);
	102	}
	103	fBogus = FALSE;
	104	fBytes = values;
	105	fCount = count;
	106	fCapacity = count;
	107	fHashCode = kInvalidHashCode;
	108	}
	109
	110	// set the key to an empty state
111	CollationKey&
112	CollationKey::reset()
113	{
114	fCount = 0;
115	fBogus = FALSE;
116	fHashCode = kEmptyHashCode;
117
118	return *this;
119	}
120
121	// set the key to a "bogus" or invalid state
122	CollationKey&
123	CollationKey::setToBogus()
124	{
125	uprv_free(fBytes);
126	fBytes = NULL;
127
128	fCapacity = 0;
129	fCount = 0;
130	fHashCode = kInvalidHashCode;
131
132	return *this;
133	}
134
135	UBool
136	CollationKey::operator==(const CollationKey& source) const
137	{
138	return (this->fCount == source.fCount &&
139	(this->fBytes == source.fBytes \|\|
140	uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
141	}
142
143	const CollationKey&
144	CollationKey::operator=(const CollationKey& other)
145	{
146	if (this != &other)
147	{
148	if (other.isBogus())
149	{
150	return setToBogus();
151	}
152
153	if (other.fBytes != NULL)
154	{
155	ensureCapacity(other.fCount);
156
157	if (isBogus())
158	{
159	return *this;
160	}
161
162	fHashCode = other.fHashCode;
163	uprv_memcpy(fBytes, other.fBytes, fCount);
164	}
165	else
166	{
167	fCount = 0;
168	fBogus = FALSE;
169	fHashCode = kEmptyHashCode;
170	}
171	}
172
173	return *this;
174	}
175
176	// Bitwise comparison for the collation keys.
177	// NOTE: this is somewhat messy 'cause we can't count
178	// on memcmp returning the exact values which match
179	// Collator::EComparisonResult
180	Collator::EComparisonResult
181	CollationKey::compareTo(const CollationKey& target) const
182	{
183	uint8_t *src = this->fBytes;
184	uint8_t *tgt = target.fBytes;
185
186	// are we comparing the same string
187	if (src == tgt)
188	return Collator::EQUAL;
189
190	/*
191	int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
192	if (count == 0)
193	{
194	// If count is 0, at least one of the keys is empty.
195	// An empty key is always LESS than a non-empty one
196	// and EQUAL to another empty
197	if (this->fCount < target.fCount)
198	{
199	return Collator::LESS;
200	}
201
202	if (this->fCount > target.fCount)
203	{
204	return Collator::GREATER;
205	}
206	return Collator::EQUAL;
207	}
208	*/
209
210	int minLength;
211	Collator::EComparisonResult result;
212
213	// are we comparing different lengths?
214	if (this->fCount != target.fCount) {
215	if (this->fCount < target.fCount) {
216	minLength = this->fCount;
217	result = Collator::LESS;
218	}
219	else {
220	minLength = target.fCount;
221	result = Collator::GREATER;
222	}
223	}
224	else {
225	minLength = target.fCount;
226	result = Collator::EQUAL;
227	}
228
229	if (minLength > 0) {
230	int diff = uprv_memcmp(src, tgt, minLength);
231	if (diff > 0) {
232	return Collator::GREATER;
233	}
234	else
235	if (diff < 0) {
236	return Collator::LESS;
237	}
238	}
239
240	return result;
241	/*
242	if (result < 0)
243	{
244	return Collator::LESS;
245	}
246
247	if (result > 0)
248	{
249	return Collator::GREATER;
250	}
251	return Collator::EQUAL;
252	*/
253	}
254
255	// Bitwise comparison for the collation keys.
256	UCollationResult
257	CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
258	{
259	if(U_SUCCESS(status)) {
260	uint8_t *src = this->fBytes;
261	uint8_t *tgt = target.fBytes;
262
263	// are we comparing the same string
264	if (src == tgt)
265	return UCOL_EQUAL;
266
267	int minLength;
268	UCollationResult result;
269
270	// are we comparing different lengths?
271	if (this->fCount != target.fCount) {
272	if (this->fCount < target.fCount) {
273	minLength = this->fCount;
274	result = UCOL_LESS;
275	}
276	else {
277	minLength = target.fCount;
278	result = UCOL_GREATER;
279	}
280	}
281	else {
282	minLength = target.fCount;
283	result = UCOL_EQUAL;
284	}
285
286	if (minLength > 0) {
287	int diff = uprv_memcmp(src, tgt, minLength);
288	if (diff > 0) {
289	return UCOL_GREATER;
290	}
291	else
292	if (diff < 0) {
293	return UCOL_LESS;
294	}
295	}
296
297	return result;
298	} else {
299	return UCOL_EQUAL;
300	}
301	}
302
303	CollationKey&
304	CollationKey::ensureCapacity(int32_t newSize)
305	{
306	if (fCapacity < newSize)
307	{
308	uprv_free(fBytes);
309
310	fBytes = (uint8_t *)uprv_malloc(newSize);
311
312	if (fBytes == NULL)
313	{
314	return setToBogus();
315	}
316
317	uprv_memset(fBytes, 0, fCapacity);
318	fCapacity = newSize;
319	}
320
321	fBogus = FALSE;
322	fCount = newSize;
323	fHashCode = kInvalidHashCode;
324
325	return *this;
326	}
327
328	#ifdef U_USE_COLLATION_KEY_DEPRECATES
329	// Create a copy of the byte array.
330	uint8_t*
331	CollationKey::toByteArray(int32_t& count) const
332	{
333	uint8_t result = (uint8_t) uprv_malloc( sizeof(uint8_t) * fCount );
334
335	if (result == NULL)
336	{
337	count = 0;
338	}
339	else
340	{
341	count = fCount;
342	uprv_memcpy(result, fBytes, fCount);
343	}
344
345	return result;
346	}
347	#endif
348
349	int32_t
350	CollationKey::hashCode() const
351	{
352	// (Cribbed from UnicodeString)
353	// We cache the hashCode; when it becomes invalid, due to any change to the
354	// string, we note this by setting it to kInvalidHashCode. [LIU]
355
356	// Note: This method is semantically const, but physically non-const.
357
358	if (fHashCode == kInvalidHashCode)
359	{
360	UHashTok key;
361	key.pointer = fBytes;
362	((CollationKey *)this)->fHashCode = uhash_hashChars(key);
363	#if 0
364	// We compute the hash by iterating sparsely over 64 (at most) characters
365	// spaced evenly through the string. For each character, we multiply the
366	// previous hash value by a prime number and add the new character in,
367	// in the manner of a additive linear congruential random number generator,
368	// thus producing a pseudorandom deterministic value which should be well
369	// distributed over the output range. [LIU]
370	const uint8_t p = fBytes, limit = fBytes + fCount;
371	int32_t inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
372	int32_t hash = 0;
373
374	while (p < limit)
375	{
376	hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
377	p += inc;
378	}
379
380	// If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
381	if (hash == kInvalidHashCode)
382	{
383	hash = kEmptyHashCode;
384	}
385
386	((CollationKey *)this)->fHashCode = hash; // cast away const
387	#endif
388	}
389
390	return fHashCode;
391	}
392
393	U_NAMESPACE_END
394
73c04bcf A	395	U_CAPI int32_t U_EXPORT2
	396	ucol_keyHashCode(const uint8_t *key,
	397	int32_t length)
	398	{
46f4442e	399	U_NAMESPACE_QUALIFIER CollationKey newKey(key, length);
73c04bcf A	400	return newKey.hashCode();
	401	}
	402
b75a7d8f	403	#endif /* #if !UCONFIG_NO_COLLATION */