icuSources/i18n/sortkey.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 1996-2003, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 */
   7 //===============================================================================
   8 //
   9 // File sortkey.cpp
  10 //
  11 //
  12 //
  13 // Created by: Helena Shih
  14 //
  15 // Modification History:
  16 //
  17 //  Date         Name          Description
  18 //
  19 //  6/20/97      helena        Java class name change.
  20 //  6/23/97      helena        Added comments to make code more readable.
  21 //  6/26/98      erm           Canged to use byte arrays instead of UnicodeString
  22 //  7/31/98      erm           hashCode: minimum inc should be 2 not 1,
  23 //                             Cleaned up operator=
  24 // 07/12/99      helena        HPUX 11 CC port.
  25 // 03/06/01      synwee        Modified compareTo, to handle the result of
  26 //                             2 string similar in contents, but one is longer
  27 //                             than the other
  28 //===============================================================================
  29
  30 #include "unicode/utypes.h"
  31
  32 #if !UCONFIG_NO_COLLATION
  33
  34 #include "unicode/sortkey.h"
  35 #include "cmemory.h"
  36 #include "uhash.h"
  37
  38 U_NAMESPACE_BEGIN
  39
  40 // A hash code of kInvalidHashCode indicates that the has code needs
  41 // to be computed. A hash code of kEmptyHashCode is used for empty keys
  42 // and for any key whose computed hash code is kInvalidHashCode.
  43 #define kInvalidHashCode ((int32_t)0)
  44 #define kEmptyHashCode ((int32_t)1)
  45
  46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)
  47
  48 CollationKey::CollationKey()
  49     : UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
  50       fHashCode(kEmptyHashCode), fBytes(NULL)
  51 {
  52 }
  53
  54 // Create a collation key from a bit array.
  55 CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
  56     : UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
  57       fHashCode(kInvalidHashCode)
  58 {
  59     fBytes = (uint8_t *)uprv_malloc(count);
  60
  61     if (fBytes == NULL)
  62     {
  63         setToBogus();
  64         return;
  65     }
  66
  67     uprv_memcpy(fBytes, newValues, fCount);
  68 }
  69
  70 CollationKey::CollationKey(const CollationKey& other)
  71 : UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
  72   fHashCode(other.fHashCode), fBytes(NULL)
  73 {
  74     if (other.fBogus)
  75     {
  76         setToBogus();
  77         return;
  78     }
  79
  80     fBytes = (uint8_t *)uprv_malloc(fCapacity);
  81
  82     if (fBytes == NULL)
  83     {
  84         setToBogus();
  85         return;
  86     }
  87
  88     uprv_memcpy(fBytes, other.fBytes, other.fCount);
  89     if(fCapacity>fCount) {
  90         uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
  91     }
  92 }
  93
  94 CollationKey::~CollationKey()
  95 {
  96         uprv_free(fBytes);
  97 }
  98
  99 void CollationKey::adopt(uint8_t *values, int32_t count) {
 100     if(fBytes != NULL) {
 101         uprv_free(fBytes);
 102     }
 103     fBogus = FALSE;
 104     fBytes = values;
 105     fCount = count;
 106     fCapacity = count;
 107     fHashCode = kInvalidHashCode;
 108 }
 109
 110 // set the key to an empty state
 111 CollationKey&
 112 CollationKey::reset()
 113 {
 114     fCount = 0;
 115     fBogus = FALSE;
 116     fHashCode = kEmptyHashCode;
 117
 118     return *this;
 119 }
 120
 121 // set the key to a "bogus" or invalid state
 122 CollationKey&
 123 CollationKey::setToBogus()
 124 {
 125     uprv_free(fBytes);
 126     fBytes = NULL;
 127
 128     fCapacity = 0;
 129     fCount = 0;
 130     fHashCode = kInvalidHashCode;
 131
 132     return *this;
 133 }
 134
 135 UBool
 136 CollationKey::operator==(const CollationKey& source) const
 137 {
 138     return (this->fCount == source.fCount &&
 139             (this->fBytes == source.fBytes ||
 140              uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
 141 }
 142
 143 const CollationKey&
 144 CollationKey::operator=(const CollationKey& other)
 145 {
 146     if (this != &other)
 147     {
 148         if (other.isBogus())
 149         {
 150             return setToBogus();
 151         }
 152
 153         if (other.fBytes != NULL)
 154         {
 155             ensureCapacity(other.fCount);
 156
 157             if (isBogus())
 158             {
 159                 return *this;
 160             }
 161
 162             fHashCode = other.fHashCode;
 163             uprv_memcpy(fBytes, other.fBytes, fCount);
 164         }
 165         else
 166         {
 167             fCount = 0;
 168             fBogus = FALSE;
 169             fHashCode = kEmptyHashCode;
 170         }
 171     }
 172
 173     return *this;
 174 }
 175
 176 // Bitwise comparison for the collation keys.
 177 // NOTE: this is somewhat messy 'cause we can't count
 178 // on memcmp returning the exact values which match
 179 // Collator::EComparisonResult
 180 Collator::EComparisonResult
 181 CollationKey::compareTo(const CollationKey& target) const
 182 {
 183     uint8_t *src = this->fBytes;
 184     uint8_t *tgt = target.fBytes;
 185
 186     // are we comparing the same string
 187     if (src == tgt)
 188         return  Collator::EQUAL;
 189
 190         /*
 191         int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
 192         if (count == 0)
 193         {
 194         // If count is 0, at least one of the keys is empty.
 195         // An empty key is always LESS than a non-empty one
 196         // and EQUAL to another empty
 197         if (this->fCount < target.fCount)
 198         {
 199         return Collator::LESS;
 200         }
 201
 202           if (this->fCount > target.fCount)
 203           {
 204           return Collator::GREATER;
 205           }
 206           return Collator::EQUAL;
 207           }
 208     */
 209
 210     int                         minLength;
 211     Collator::EComparisonResult result;
 212
 213     // are we comparing different lengths?
 214     if (this->fCount != target.fCount) {
 215         if (this->fCount < target.fCount) {
 216             minLength = this->fCount;
 217             result    =  Collator::LESS;
 218         }
 219         else {
 220             minLength = target.fCount;
 221             result    =  Collator::GREATER;
 222         }
 223     }
 224     else {
 225         minLength = target.fCount;
 226         result    =  Collator::EQUAL;
 227     }
 228
 229     if (minLength > 0) {
 230         int diff = uprv_memcmp(src, tgt, minLength);
 231         if (diff > 0) {
 232             return Collator::GREATER;
 233         }
 234         else
 235             if (diff < 0) {
 236                 return Collator::LESS;
 237             }
 238     }
 239
 240     return result;
 241     /*
 242     if (result < 0)
 243     {
 244     return Collator::LESS;
 245     }
 246
 247       if (result > 0)
 248       {
 249       return Collator::GREATER;
 250       }
 251       return Collator::EQUAL;
 252     */
 253 }
 254
 255 // Bitwise comparison for the collation keys.
 256 UCollationResult
 257 CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
 258 {
 259   if(U_SUCCESS(status)) {
 260     uint8_t *src = this->fBytes;
 261     uint8_t *tgt = target.fBytes;
 262
 263     // are we comparing the same string
 264     if (src == tgt)
 265         return  UCOL_EQUAL;
 266
 267     int                         minLength;
 268     UCollationResult result;
 269
 270     // are we comparing different lengths?
 271     if (this->fCount != target.fCount) {
 272         if (this->fCount < target.fCount) {
 273             minLength = this->fCount;
 274             result    =  UCOL_LESS;
 275         }
 276         else {
 277             minLength = target.fCount;
 278             result    =  UCOL_GREATER;
 279         }
 280     }
 281     else {
 282         minLength = target.fCount;
 283         result    =  UCOL_EQUAL;
 284     }
 285
 286     if (minLength > 0) {
 287         int diff = uprv_memcmp(src, tgt, minLength);
 288         if (diff > 0) {
 289             return UCOL_GREATER;
 290         }
 291         else
 292             if (diff < 0) {
 293                 return UCOL_LESS;
 294             }
 295     }
 296
 297     return result;
 298   } else {
 299     return UCOL_EQUAL;
 300   }
 301 }
 302
 303 CollationKey&
 304 CollationKey::ensureCapacity(int32_t newSize)
 305 {
 306     if (fCapacity < newSize)
 307     {
 308         uprv_free(fBytes);
 309
 310         fBytes = (uint8_t *)uprv_malloc(newSize);
 311
 312         if (fBytes == NULL)
 313         {
 314             return setToBogus();
 315         }
 316
 317         uprv_memset(fBytes, 0, fCapacity);
 318         fCapacity = newSize;
 319     }
 320
 321     fBogus = FALSE;
 322     fCount = newSize;
 323     fHashCode = kInvalidHashCode;
 324
 325     return *this;
 326 }
 327
 328 #ifdef U_USE_COLLATION_KEY_DEPRECATES
 329 // Create a copy of the byte array.
 330 uint8_t*
 331 CollationKey::toByteArray(int32_t& count) const
 332 {
 333     uint8_t *result = (uint8_t*) uprv_malloc( sizeof(uint8_t) * fCount );
 334
 335     if (result == NULL)
 336     {
 337         count = 0;
 338     }
 339     else
 340     {
 341         count = fCount;
 342         uprv_memcpy(result, fBytes, fCount);
 343     }
 344
 345     return result;
 346 }
 347 #endif
 348
 349 int32_t
 350 CollationKey::hashCode() const
 351 {
 352     // (Cribbed from UnicodeString)
 353     // We cache the hashCode; when it becomes invalid, due to any change to the
 354     // string, we note this by setting it to kInvalidHashCode. [LIU]
 355
 356     // Note: This method is semantically const, but physically non-const.
 357
 358     if (fHashCode == kInvalidHashCode)
 359     {
 360         UHashTok key;
 361         key.pointer = fBytes;
 362         ((CollationKey *)this)->fHashCode = uhash_hashChars(key);
 363 #if 0
 364         // We compute the hash by iterating sparsely over 64 (at most) characters
 365         // spaced evenly through the string.  For each character, we multiply the
 366         // previous hash value by a prime number and add the new character in,
 367         // in the manner of a additive linear congruential random number generator,
 368         // thus producing a pseudorandom deterministic value which should be well
 369         // distributed over the output range. [LIU]
 370         const uint8_t   *p = fBytes, *limit = fBytes + fCount;
 371         int32_t         inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
 372         int32_t         hash = 0;
 373
 374         while (p < limit)
 375         {
 376             hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
 377             p += inc;
 378         }
 379
 380         // If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
 381         if (hash == kInvalidHashCode)
 382         {
 383             hash = kEmptyHashCode;
 384         }
 385
 386         ((CollationKey *)this)->fHashCode = hash; // cast away const
 387 #endif
 388     }
 389
 390     return fHashCode;
 391 }
 392
 393 U_NAMESPACE_END
 394
 395 #endif /* #if !UCONFIG_NO_COLLATION */