icuSources/i18n/sortkey.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 1996-2011, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7 //===============================================================================
   8 //
   9 // File sortkey.cpp
  10 //
  11 //
  12 //
  13 // Created by: Helena Shih
  14 //
  15 // Modification History:
  16 //
  17 //  Date         Name          Description
  18 //
  19 //  6/20/97      helena        Java class name change.
  20 //  6/23/97      helena        Added comments to make code more readable.
  21 //  6/26/98      erm           Canged to use byte arrays instead of UnicodeString
  22 //  7/31/98      erm           hashCode: minimum inc should be 2 not 1,
  23 //                             Cleaned up operator=
  24 // 07/12/99      helena        HPUX 11 CC port.
  25 // 03/06/01      synwee        Modified compareTo, to handle the result of
  26 //                             2 string similar in contents, but one is longer
  27 //                             than the other
  28 //===============================================================================
  29
  30 #include "unicode/utypes.h"
  31
  32 #if !UCONFIG_NO_COLLATION
  33
  34 #include "unicode/sortkey.h"
  35 #include "cmemory.h"
  36 #include "uelement.h"
  37 #include "ustr_imp.h"
  38
  39 U_NAMESPACE_BEGIN
  40
  41 // A hash code of kInvalidHashCode indicates that the has code needs
  42 // to be computed. A hash code of kEmptyHashCode is used for empty keys
  43 // and for any key whose computed hash code is kInvalidHashCode.
  44 #define kInvalidHashCode ((int32_t)0)
  45 #define kEmptyHashCode ((int32_t)1)
  46
  47 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)
  48
  49 CollationKey::CollationKey()
  50     : UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
  51       fHashCode(kEmptyHashCode), fBytes(NULL)
  52 {
  53 }
  54
  55 // Create a collation key from a bit array.
  56 CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
  57     : UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
  58       fHashCode(kInvalidHashCode)
  59 {
  60     fBytes = (uint8_t *)uprv_malloc(count);
  61
  62     if (fBytes == NULL)
  63     {
  64         setToBogus();
  65         return;
  66     }
  67
  68     uprv_memcpy(fBytes, newValues, fCount);
  69 }
  70
  71 CollationKey::CollationKey(const CollationKey& other)
  72 : UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
  73   fHashCode(other.fHashCode), fBytes(NULL)
  74 {
  75     if (other.fBogus)
  76     {
  77         setToBogus();
  78         return;
  79     }
  80
  81     fBytes = (uint8_t *)uprv_malloc(fCapacity);
  82
  83     if (fBytes == NULL)
  84     {
  85         setToBogus();
  86         return;
  87     }
  88
  89     uprv_memcpy(fBytes, other.fBytes, other.fCount);
  90     if(fCapacity>fCount) {
  91         uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
  92     }
  93 }
  94
  95 CollationKey::~CollationKey()
  96 {
  97         uprv_free(fBytes);
  98 }
  99
 100 void CollationKey::adopt(uint8_t *values, int32_t capacity, int32_t count) {
 101     if(fBytes != NULL) {
 102         uprv_free(fBytes);
 103     }
 104     fBytes = values;
 105     fCapacity = capacity;
 106     setLength(count);
 107 }
 108
 109 void CollationKey::setLength(int32_t newLength) {
 110     fBogus = FALSE;
 111     fCount = newLength;
 112     fHashCode = kInvalidHashCode;
 113 }
 114
 115 // set the key to an empty state
 116 CollationKey&
 117 CollationKey::reset()
 118 {
 119     fCount = 0;
 120     fBogus = FALSE;
 121     fHashCode = kEmptyHashCode;
 122
 123     return *this;
 124 }
 125
 126 // set the key to a "bogus" or invalid state
 127 CollationKey&
 128 CollationKey::setToBogus()
 129 {
 130     uprv_free(fBytes);
 131     fBytes = NULL;
 132
 133     fCapacity = 0;
 134     fCount = 0;
 135     fHashCode = kInvalidHashCode;
 136
 137     return *this;
 138 }
 139
 140 UBool
 141 CollationKey::operator==(const CollationKey& source) const
 142 {
 143     return (this->fCount == source.fCount &&
 144             (this->fBytes == source.fBytes ||
 145              uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
 146 }
 147
 148 const CollationKey&
 149 CollationKey::operator=(const CollationKey& other)
 150 {
 151     if (this != &other)
 152     {
 153         if (other.isBogus())
 154         {
 155             return setToBogus();
 156         }
 157
 158         if (other.fBytes != NULL)
 159         {
 160             ensureCapacity(other.fCount);
 161
 162             if (isBogus())
 163             {
 164                 return *this;
 165             }
 166
 167             fHashCode = other.fHashCode;
 168             uprv_memcpy(fBytes, other.fBytes, fCount);
 169         }
 170         else
 171         {
 172             fCount = 0;
 173             fBogus = FALSE;
 174             fHashCode = kEmptyHashCode;
 175         }
 176     }
 177
 178     return *this;
 179 }
 180
 181 // Bitwise comparison for the collation keys.
 182 // NOTE: this is somewhat messy 'cause we can't count
 183 // on memcmp returning the exact values which match
 184 // Collator::EComparisonResult
 185 Collator::EComparisonResult
 186 CollationKey::compareTo(const CollationKey& target) const
 187 {
 188     uint8_t *src = this->fBytes;
 189     uint8_t *tgt = target.fBytes;
 190
 191     // are we comparing the same string
 192     if (src == tgt)
 193         return  Collator::EQUAL;
 194
 195         /*
 196         int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
 197         if (count == 0)
 198         {
 199         // If count is 0, at least one of the keys is empty.
 200         // An empty key is always LESS than a non-empty one
 201         // and EQUAL to another empty
 202         if (this->fCount < target.fCount)
 203         {
 204         return Collator::LESS;
 205         }
 206
 207           if (this->fCount > target.fCount)
 208           {
 209           return Collator::GREATER;
 210           }
 211           return Collator::EQUAL;
 212           }
 213     */
 214
 215     int                         minLength;
 216     Collator::EComparisonResult result;
 217
 218     // are we comparing different lengths?
 219     if (this->fCount != target.fCount) {
 220         if (this->fCount < target.fCount) {
 221             minLength = this->fCount;
 222             result    =  Collator::LESS;
 223         }
 224         else {
 225             minLength = target.fCount;
 226             result    =  Collator::GREATER;
 227         }
 228     }
 229     else {
 230         minLength = target.fCount;
 231         result    =  Collator::EQUAL;
 232     }
 233
 234     if (minLength > 0) {
 235         int diff = uprv_memcmp(src, tgt, minLength);
 236         if (diff > 0) {
 237             return Collator::GREATER;
 238         }
 239         else
 240             if (diff < 0) {
 241                 return Collator::LESS;
 242             }
 243     }
 244
 245     return result;
 246     /*
 247     if (result < 0)
 248     {
 249     return Collator::LESS;
 250     }
 251
 252       if (result > 0)
 253       {
 254       return Collator::GREATER;
 255       }
 256       return Collator::EQUAL;
 257     */
 258 }
 259
 260 // Bitwise comparison for the collation keys.
 261 UCollationResult
 262 CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
 263 {
 264   if(U_SUCCESS(status)) {
 265     uint8_t *src = this->fBytes;
 266     uint8_t *tgt = target.fBytes;
 267
 268     // are we comparing the same string
 269     if (src == tgt)
 270         return  UCOL_EQUAL;
 271
 272     int                         minLength;
 273     UCollationResult result;
 274
 275     // are we comparing different lengths?
 276     if (this->fCount != target.fCount) {
 277         if (this->fCount < target.fCount) {
 278             minLength = this->fCount;
 279             result    =  UCOL_LESS;
 280         }
 281         else {
 282             minLength = target.fCount;
 283             result    =  UCOL_GREATER;
 284         }
 285     }
 286     else {
 287         minLength = target.fCount;
 288         result    =  UCOL_EQUAL;
 289     }
 290
 291     if (minLength > 0) {
 292         int diff = uprv_memcmp(src, tgt, minLength);
 293         if (diff > 0) {
 294             return UCOL_GREATER;
 295         }
 296         else
 297             if (diff < 0) {
 298                 return UCOL_LESS;
 299             }
 300     }
 301
 302     return result;
 303   } else {
 304     return UCOL_EQUAL;
 305   }
 306 }
 307
 308 CollationKey&
 309 CollationKey::ensureCapacity(int32_t newSize)
 310 {
 311     if (fCapacity < newSize)
 312     {
 313         uprv_free(fBytes);
 314
 315         fBytes = (uint8_t *)uprv_malloc(newSize);
 316
 317         if (fBytes == NULL)
 318         {
 319             return setToBogus();
 320         }
 321
 322         uprv_memset(fBytes, 0, fCapacity);
 323         fCapacity = newSize;
 324     }
 325
 326     fBogus = FALSE;
 327     fCount = newSize;
 328     fHashCode = kInvalidHashCode;
 329
 330     return *this;
 331 }
 332
 333 #ifdef U_USE_COLLATION_KEY_DEPRECATES
 334 // Create a copy of the byte array.
 335 uint8_t*
 336 CollationKey::toByteArray(int32_t& count) const
 337 {
 338     uint8_t *result = (uint8_t*) uprv_malloc( sizeof(uint8_t) * fCount );
 339
 340     if (result == NULL)
 341     {
 342         count = 0;
 343     }
 344     else
 345     {
 346         count = fCount;
 347         uprv_memcpy(result, fBytes, fCount);
 348     }
 349
 350     return result;
 351 }
 352 #endif
 353
 354 int32_t
 355 CollationKey::hashCode() const
 356 {
 357     // (Cribbed from UnicodeString)
 358     // We cache the hashCode; when it becomes invalid, due to any change to the
 359     // string, we note this by setting it to kInvalidHashCode. [LIU]
 360
 361     // Note: This method is semantically const, but physically non-const.
 362
 363     if (fHashCode == kInvalidHashCode)
 364     {
 365         const char *s = reinterpret_cast<const char *>(fBytes);
 366         ((CollationKey *)this)->fHashCode = s == NULL ? 0 : ustr_hashCharsN(s, fCount);
 367 #if 0
 368         // We compute the hash by iterating sparsely over 64 (at most) characters
 369         // spaced evenly through the string.  For each character, we multiply the
 370         // previous hash value by a prime number and add the new character in,
 371         // in the manner of a additive linear congruential random number generator,
 372         // thus producing a pseudorandom deterministic value which should be well
 373         // distributed over the output range. [LIU]
 374         const uint8_t   *p = fBytes, *limit = fBytes + fCount;
 375         int32_t         inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
 376         int32_t         hash = 0;
 377
 378         while (p < limit)
 379         {
 380             hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
 381             p += inc;
 382         }
 383
 384         // If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
 385         if (hash == kInvalidHashCode)
 386         {
 387             hash = kEmptyHashCode;
 388         }
 389
 390         ((CollationKey *)this)->fHashCode = hash; // cast away const
 391 #endif
 392     }
 393
 394     return fHashCode;
 395 }
 396
 397 U_NAMESPACE_END
 398
 399 U_CAPI int32_t U_EXPORT2
 400 ucol_keyHashCode(const uint8_t *key,
 401                        int32_t  length)
 402 {
 403     icu::CollationKey newKey(key, length);
 404     return newKey.hashCode();
 405 }
 406
 407 #endif /* #if !UCONFIG_NO_COLLATION */