[apple/javascriptcore.git] / wtf / text / StringHash.h

/*
 * Copyright (C) 2006, 2007, 2008 Apple Inc. All rights reserved
 * Copyright (C) Research In Motion Limited 2009. All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#ifndef StringHash_h
#define StringHash_h

#include "AtomicString.h"
#include "WTFString.h"
#include <wtf/HashTraits.h>
#include <wtf/StringHashFunctions.h>
#include <wtf/unicode/Unicode.h>

// FIXME: This is a temporary layering violation while we move string code to WTF.
// Landing the file moves in one patch, will follow on with patches to change the namespaces.
namespace WebCore {

    // The hash() functions on StringHash and CaseFoldingHash do not support
    // null strings. get(), contains(), and add() on HashMap<String,..., StringHash>
    // cause a null-pointer dereference when passed null strings.

    // FIXME: We should really figure out a way to put the computeHash function that's
    // currently a member function of StringImpl into this file so we can be a little
    // closer to having all the nearly-identical hash functions in one place.

    struct StringHash {
        static unsigned hash(StringImpl* key) { return key->hash(); }
        static bool equal(const StringImpl* a, const StringImpl* b)
        {
            if (a == b)
                return true;
            if (!a || !b)
                return false;

            unsigned aLength = a->length();
            unsigned bLength = b->length();
            if (aLength != bLength)
                return false;

            // FIXME: perhaps we should have a more abstract macro that indicates when
            // going 4 bytes at a time is unsafe
#if CPU(ARM) || CPU(SH4)
            const UChar* aChars = a->characters();
            const UChar* bChars = b->characters();
            for (unsigned i = 0; i != aLength; ++i) {
                if (*aChars++ != *bChars++)
                    return false;
            }
            return true;
#else
            /* Do it 4-bytes-at-a-time on architectures where it's safe */
            const uint32_t* aChars = reinterpret_cast<const uint32_t*>(a->characters());
            const uint32_t* bChars = reinterpret_cast<const uint32_t*>(b->characters());

            unsigned halfLength = aLength >> 1;
            for (unsigned i = 0; i != halfLength; ++i)
                if (*aChars++ != *bChars++)
                    return false;

            if (aLength & 1 && *reinterpret_cast<const uint16_t*>(aChars) != *reinterpret_cast<const uint16_t*>(bChars))
                return false;

            return true;
#endif
        }

        static unsigned hash(const RefPtr<StringImpl>& key) { return key->hash(); }
        static bool equal(const RefPtr<StringImpl>& a, const RefPtr<StringImpl>& b)
        {
            return equal(a.get(), b.get());
        }

        static unsigned hash(const String& key) { return key.impl()->hash(); }
        static bool equal(const String& a, const String& b)
        {
            return equal(a.impl(), b.impl());
        }

        static const bool safeToCompareToEmptyOrDeleted = false;
    };

    class CaseFoldingHash {
    public:
        // Paul Hsieh's SuperFastHash
        // http://www.azillionmonkeys.com/qed/hash.html
        static unsigned hash(const UChar* data, unsigned length)
        {
            unsigned l = length;
            const UChar* s = data;
            uint32_t hash = WTF::stringHashingStartValue;
            uint32_t tmp;
            
            int rem = l & 1;
            l >>= 1;
            
            // Main loop.
            for (; l > 0; l--) {
                hash += WTF::Unicode::foldCase(s[0]);
                tmp = (WTF::Unicode::foldCase(s[1]) << 11) ^ hash;
                hash = (hash << 16) ^ tmp;
                s += 2;
                hash += hash >> 11;
            }
            
            // Handle end case.
            if (rem) {
                hash += WTF::Unicode::foldCase(s[0]);
                hash ^= hash << 11;
                hash += hash >> 17;
            }
            
            // Force "avalanching" of final 127 bits.
            hash ^= hash << 3;
            hash += hash >> 5;
            hash ^= hash << 2;
            hash += hash >> 15;
            hash ^= hash << 10;
            
            // This avoids ever returning a hash code of 0, since that is used to
            // signal "hash not computed yet", using a value that is likely to be
            // effectively the same as 0 when the low bits are masked.
            hash |= !hash << 31;
            
            return hash;
        }

        static unsigned hash(StringImpl* str)
        {
            return hash(str->characters(), str->length());
        }
        
        static unsigned hash(const char* str, unsigned length)
        {
            // This hash is designed to work on 16-bit chunks at a time. But since the normal case
            // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
            // were 16-bit chunks, which will give matching results.

            unsigned l = length;
            const char* s = str;
            uint32_t hash = WTF::stringHashingStartValue;
            uint32_t tmp;
            
            int rem = l & 1;
            l >>= 1;
            
            // Main loop
            for (; l > 0; l--) {
                hash += WTF::Unicode::foldCase(s[0]);
                tmp = (WTF::Unicode::foldCase(s[1]) << 11) ^ hash;
                hash = (hash << 16) ^ tmp;
                s += 2;
                hash += hash >> 11;
            }
            
            // Handle end case
            if (rem) {
                hash += WTF::Unicode::foldCase(s[0]);
                hash ^= hash << 11;
                hash += hash >> 17;
            }
            
            // Force "avalanching" of final 127 bits
            hash ^= hash << 3;
            hash += hash >> 5;
            hash ^= hash << 2;
            hash += hash >> 15;
            hash ^= hash << 10;
            
            // this avoids ever returning a hash code of 0, since that is used to
            // signal "hash not computed yet", using a value that is likely to be
            // effectively the same as 0 when the low bits are masked
            hash |= !hash << 31;
            
            return hash;
        }
        
        static bool equal(const StringImpl* a, const StringImpl* b)
        {
            if (a == b)
                return true;
            if (!a || !b)
                return false;
            unsigned length = a->length();
            if (length != b->length())
                return false;
            return WTF::Unicode::umemcasecmp(a->characters(), b->characters(), length) == 0;
        }

        static unsigned hash(const RefPtr<StringImpl>& key) 
        {
            return hash(key.get());
        }

        static bool equal(const RefPtr<StringImpl>& a, const RefPtr<StringImpl>& b)
        {
            return equal(a.get(), b.get());
        }

        static unsigned hash(const String& key)
        {
            return hash(key.impl());
        }
        static unsigned hash(const AtomicString& key)
        {
            return hash(key.impl());
        }
        static bool equal(const String& a, const String& b)
        {
            return equal(a.impl(), b.impl());
        }
        static bool equal(const AtomicString& a, const AtomicString& b)
        {
            return (a == b) || equal(a.impl(), b.impl());
        }

        static const bool safeToCompareToEmptyOrDeleted = false;
    };

    // This hash can be used in cases where the key is a hash of a string, but we don't
    // want to store the string. It's not really specific to string hashing, but all our
    // current uses of it are for strings.
    struct AlreadyHashed : IntHash<unsigned> {
        static unsigned hash(unsigned key) { return key; }

        // To use a hash value as a key for a hash table, we need to eliminate the
        // "deleted" value, which is negative one. That could be done by changing
        // the string hash function to never generate negative one, but this works
        // and is still relatively efficient.
        static unsigned avoidDeletedValue(unsigned hash)
        {
            ASSERT(hash);
            unsigned newHash = hash | (!(hash + 1) << 31);
            ASSERT(newHash);
            ASSERT(newHash != 0xFFFFFFFF);
            return newHash;
        }
    };

}

namespace WTF {

    template<> struct HashTraits<WebCore::String> : GenericHashTraits<WebCore::String> {
        static const bool emptyValueIsZero = true;
        static void constructDeletedValue(WebCore::String& slot) { new (&slot) WebCore::String(HashTableDeletedValue); }
        static bool isDeletedValue(const WebCore::String& slot) { return slot.isHashTableDeletedValue(); }
    };

}

#endif
Commit	Line	Data
4e4e5a6f A	1	/*
	2	* Copyright (C) 2006, 2007, 2008 Apple Inc. All rights reserved
	3	* Copyright (C) Research In Motion Limited 2009. All rights reserved.
	4	*
	5	* This library is free software; you can redistribute it and/or
	6	* modify it under the terms of the GNU Library General Public
	7	* License as published by the Free Software Foundation; either
	8	* version 2 of the License, or (at your option) any later version.
	9	*
	10	* This library is distributed in the hope that it will be useful,
	11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	* Library General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU Library General Public License
	16	* along with this library; see the file COPYING.LIB. If not, write to
	17	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	18	* Boston, MA 02110-1301, USA.
	19	*
	20	*/
	21
	22	#ifndef StringHash_h
	23	#define StringHash_h
	24
	25	#include "AtomicString.h"
	26	#include "WTFString.h"
	27	#include <wtf/HashTraits.h>
	28	#include <wtf/StringHashFunctions.h>
	29	#include <wtf/unicode/Unicode.h>
	30
	31	// FIXME: This is a temporary layering violation while we move string code to WTF.
	32	// Landing the file moves in one patch, will follow on with patches to change the namespaces.
	33	namespace WebCore {
	34
	35	// The hash() functions on StringHash and CaseFoldingHash do not support
	36	// null strings. get(), contains(), and add() on HashMap<String,..., StringHash>
	37	// cause a null-pointer dereference when passed null strings.
	38
	39	// FIXME: We should really figure out a way to put the computeHash function that's
	40	// currently a member function of StringImpl into this file so we can be a little
	41	// closer to having all the nearly-identical hash functions in one place.
	42
	43	struct StringHash {
	44	static unsigned hash(StringImpl* key) { return key->hash(); }
	45	static bool equal(const StringImpl* a, const StringImpl* b)
	46	{
	47	if (a == b)
	48	return true;
	49	if (!a \|\| !b)
	50	return false;
	51
	52	unsigned aLength = a->length();
	53	unsigned bLength = b->length();
	54	if (aLength != bLength)
	55	return false;
	56
	57	// FIXME: perhaps we should have a more abstract macro that indicates when
	58	// going 4 bytes at a time is unsafe
	59	#if CPU(ARM) \|\| CPU(SH4)
	60	const UChar* aChars = a->characters();
	61	const UChar* bChars = b->characters();
	62	for (unsigned i = 0; i != aLength; ++i) {
	63	if (aChars++ != bChars++)
	64	return false;
65	}
66	return true;
67	#else
68	/* Do it 4-bytes-at-a-time on architectures where it's safe */
69	const uint32_t* aChars = reinterpret_cast<const uint32_t*>(a->characters());
70	const uint32_t* bChars = reinterpret_cast<const uint32_t*>(b->characters());
71
72	unsigned halfLength = aLength >> 1;
73	for (unsigned i = 0; i != halfLength; ++i)
74	if (aChars++ != bChars++)
75	return false;
76
77	if (aLength & 1 && reinterpret_cast<const uint16_t>(aChars) != reinterpret_cast<const uint16_t>(bChars))
78	return false;
79
80	return true;
81	#endif
82	}
83
84	static unsigned hash(const RefPtr<StringImpl>& key) { return key->hash(); }
85	static bool equal(const RefPtr<StringImpl>& a, const RefPtr<StringImpl>& b)
86	{
87	return equal(a.get(), b.get());
88	}
89
90	static unsigned hash(const String& key) { return key.impl()->hash(); }
91	static bool equal(const String& a, const String& b)
92	{
93	return equal(a.impl(), b.impl());
94	}
95
96	static const bool safeToCompareToEmptyOrDeleted = false;
97	};
98
99	class CaseFoldingHash {
100	public:
101	// Paul Hsieh's SuperFastHash
102	// http://www.azillionmonkeys.com/qed/hash.html
103	static unsigned hash(const UChar* data, unsigned length)
104	{
105	unsigned l = length;
106	const UChar* s = data;
107	uint32_t hash = WTF::stringHashingStartValue;
108	uint32_t tmp;
109
110	int rem = l & 1;
111	l >>= 1;
112
113	// Main loop.
114	for (; l > 0; l--) {
115	hash += WTF::Unicode::foldCase(s[0]);
116	tmp = (WTF::Unicode::foldCase(s[1]) << 11) ^ hash;
117	hash = (hash << 16) ^ tmp;
118	s += 2;
119	hash += hash >> 11;
120	}
121
122	// Handle end case.
123	if (rem) {
124	hash += WTF::Unicode::foldCase(s[0]);
125	hash ^= hash << 11;
126	hash += hash >> 17;
127	}
128
129	// Force "avalanching" of final 127 bits.
130	hash ^= hash << 3;
131	hash += hash >> 5;
132	hash ^= hash << 2;
133	hash += hash >> 15;
134	hash ^= hash << 10;
135
136	// This avoids ever returning a hash code of 0, since that is used to
137	// signal "hash not computed yet", using a value that is likely to be
138	// effectively the same as 0 when the low bits are masked.
139	hash \|= !hash << 31;
140
141	return hash;
142	}
143
144	static unsigned hash(StringImpl* str)
145	{
146	return hash(str->characters(), str->length());
147	}
148
149	static unsigned hash(const char* str, unsigned length)
150	{
151	// This hash is designed to work on 16-bit chunks at a time. But since the normal case
152	// (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
153	// were 16-bit chunks, which will give matching results.
154
155	unsigned l = length;
156	const char* s = str;
157	uint32_t hash = WTF::stringHashingStartValue;
158	uint32_t tmp;
159
160	int rem = l & 1;
161	l >>= 1;
162
163	// Main loop
164	for (; l > 0; l--) {
165	hash += WTF::Unicode::foldCase(s[0]);
166	tmp = (WTF::Unicode::foldCase(s[1]) << 11) ^ hash;
167	hash = (hash << 16) ^ tmp;
168	s += 2;
169	hash += hash >> 11;
170	}
171
172	// Handle end case
173	if (rem) {
174	hash += WTF::Unicode::foldCase(s[0]);
175	hash ^= hash << 11;
176	hash += hash >> 17;
177	}
178
179	// Force "avalanching" of final 127 bits
180	hash ^= hash << 3;
181	hash += hash >> 5;
182	hash ^= hash << 2;
183	hash += hash >> 15;
184	hash ^= hash << 10;
185
186	// this avoids ever returning a hash code of 0, since that is used to
187	// signal "hash not computed yet", using a value that is likely to be
188	// effectively the same as 0 when the low bits are masked
189	hash \|= !hash << 31;
190
191	return hash;
192	}
193
194	static bool equal(const StringImpl* a, const StringImpl* b)
195	{
196	if (a == b)
197	return true;
198	if (!a \|\| !b)
199	return false;
200	unsigned length = a->length();
201	if (length != b->length())
202	return false;
203	return WTF::Unicode::umemcasecmp(a->characters(), b->characters(), length) == 0;
204	}
205
206	static unsigned hash(const RefPtr<StringImpl>& key)
207	{
208	return hash(key.get());
209	}
210
211	static bool equal(const RefPtr<StringImpl>& a, const RefPtr<StringImpl>& b)
212	{
213	return equal(a.get(), b.get());
214	}
215
216	static unsigned hash(const String& key)
217	{
218	return hash(key.impl());
219	}
220	static unsigned hash(const AtomicString& key)
221	{
222	return hash(key.impl());
223	}
224	static bool equal(const String& a, const String& b)
225	{
226	return equal(a.impl(), b.impl());
227	}
228	static bool equal(const AtomicString& a, const AtomicString& b)
229	{
230	return (a == b) \|\| equal(a.impl(), b.impl());
231	}
232
233	static const bool safeToCompareToEmptyOrDeleted = false;
234	};
235
236	// This hash can be used in cases where the key is a hash of a string, but we don't
237	// want to store the string. It's not really specific to string hashing, but all our
238	// current uses of it are for strings.
239	struct AlreadyHashed : IntHash<unsigned> {
240	static unsigned hash(unsigned key) { return key; }
241
242	// To use a hash value as a key for a hash table, we need to eliminate the
243	// "deleted" value, which is negative one. That could be done by changing
244	// the string hash function to never generate negative one, but this works
245	// and is still relatively efficient.
246	static unsigned avoidDeletedValue(unsigned hash)
247	{
248	ASSERT(hash);
249	unsigned newHash = hash \| (!(hash + 1) << 31);
250	ASSERT(newHash);
251	ASSERT(newHash != 0xFFFFFFFF);
252	return newHash;
253	}
254	};
255
256	}
257
258	namespace WTF {
259
260	template<> struct HashTraits<WebCore::String> : GenericHashTraits<WebCore::String> {
261	static const bool emptyValueIsZero = true;
262	static void constructDeletedValue(WebCore::String& slot) { new (&slot) WebCore::String(HashTableDeletedValue); }
263	static bool isDeletedValue(const WebCore::String& slot) { return slot.isHashTableDeletedValue(); }
264	};
265
266	}
267
268	#endif