third_party/WebKit/Source/wtf/StringHasher.h

   1 /*
   2  * Copyright (C) 2005, 2006, 2008, 2010, 2013 Apple Inc. All rights reserved.
   3  * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public License
  16  * along with this library; see the file COPYING.LIB.  If not, write to
  17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  18  * Boston, MA 02110-1301, USA.
  19  *
  20  */
  21
  22 #ifndef WTF_StringHasher_h
  23 #define WTF_StringHasher_h
  24
  25 #include "wtf/text/Unicode.h"
  26
  27 namespace WTF {
  28
  29 // Paul Hsieh's SuperFastHash
  30 // http://www.azillionmonkeys.com/qed/hash.html
  31
  32 // LChar data is interpreted as Latin-1-encoded (zero extended to 16 bits).
  33
  34 // NOTE: The hash computation here must stay in sync with the create_hash_table script in
  35 // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore.
  36
  37 // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero.
  38 static const unsigned stringHashingStartValue = 0x9E3779B9U;
  39
  40 class StringHasher {
  41 public:
  42     static const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as flags.
  43
  44     StringHasher()
  45         : m_hash(stringHashingStartValue)
  46         , m_hasPendingCharacter(false)
  47         , m_pendingCharacter(0)
  48     {
  49     }
  50
  51     // The hasher hashes two characters at a time, and thus an "aligned" hasher is one
  52     // where an even number of characters have been added. Callers that always add
  53     // characters two at a time can use the "assuming aligned" functions.
  54     void addCharactersAssumingAligned(UChar a, UChar b)
  55     {
  56         ASSERT(!m_hasPendingCharacter);
  57         m_hash += a;
  58         m_hash = (m_hash << 16) ^ ((b << 11) ^ m_hash);
  59         m_hash += m_hash >> 11;
  60     }
  61
  62     void addCharacter(UChar character)
  63     {
  64         if (m_hasPendingCharacter) {
  65             m_hasPendingCharacter = false;
  66             addCharactersAssumingAligned(m_pendingCharacter, character);
  67             return;
  68         }
  69
  70         m_pendingCharacter = character;
  71         m_hasPendingCharacter = true;
  72     }
  73
  74     void addCharacters(UChar a, UChar b)
  75     {
  76         if (m_hasPendingCharacter) {
  77 #if ENABLE(ASSERT)
  78             m_hasPendingCharacter = false;
  79 #endif
  80             addCharactersAssumingAligned(m_pendingCharacter, a);
  81             m_pendingCharacter = b;
  82 #if ENABLE(ASSERT)
  83             m_hasPendingCharacter = true;
  84 #endif
  85             return;
  86         }
  87
  88         addCharactersAssumingAligned(a, b);
  89     }
  90
  91     template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(const T* data, unsigned length)
  92     {
  93         ASSERT(!m_hasPendingCharacter);
  94
  95         bool remainder = length & 1;
  96         length >>= 1;
  97
  98         while (length--) {
  99             addCharactersAssumingAligned(Converter(data[0]), Converter(data[1]));
 100             data += 2;
 101         }
 102
 103         if (remainder)
 104             addCharacter(Converter(*data));
 105     }
 106
 107     template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length)
 108     {
 109         addCharactersAssumingAligned<T, defaultConverter>(data, length);
 110     }
 111
 112     template<typename T, UChar Converter(T)> void addCharacters(const T* data, unsigned length)
 113     {
 114         if (m_hasPendingCharacter && length) {
 115             m_hasPendingCharacter = false;
 116             addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++));
 117             --length;
 118         }
 119         addCharactersAssumingAligned<T, Converter>(data, length);
 120     }
 121
 122     template<typename T> void addCharacters(const T* data, unsigned length)
 123     {
 124         addCharacters<T, defaultConverter>(data, length);
 125     }
 126
 127     unsigned hashWithTop8BitsMasked() const
 128     {
 129         unsigned result = avalancheBits();
 130
 131         // Reserving space from the high bits for flags preserves most of the hash's
 132         // value, since hash lookup typically masks out the high bits anyway.
 133         result &= (1U << (sizeof(result) * 8 - flagCount)) - 1;
 134
 135         // This avoids ever returning a hash code of 0, since that is used to
 136         // signal "hash not computed yet". Setting the high bit maintains
 137         // reasonable fidelity to a hash code of 0 because it is likely to yield
 138         // exactly 0 when hash lookup masks out the high bits.
 139         if (!result)
 140             result = 0x80000000 >> flagCount;
 141
 142         return result;
 143     }
 144
 145     unsigned hash() const
 146     {
 147         unsigned result = avalancheBits();
 148
 149         // This avoids ever returning a hash code of 0, since that is used to
 150         // signal "hash not computed yet". Setting the high bit maintains
 151         // reasonable fidelity to a hash code of 0 because it is likely to yield
 152         // exactly 0 when hash lookup masks out the high bits.
 153         if (!result)
 154             result = 0x80000000;
 155
 156         return result;
 157     }
 158
 159     template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length)
 160     {
 161         StringHasher hasher;
 162         hasher.addCharactersAssumingAligned<T, Converter>(data, length);
 163         return hasher.hashWithTop8BitsMasked();
 164     }
 165
 166     template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length)
 167     {
 168         return computeHashAndMaskTop8Bits<T, defaultConverter>(data, length);
 169     }
 170
 171     template<typename T, UChar Converter(T)> static unsigned computeHash(const T* data, unsigned length)
 172     {
 173         StringHasher hasher;
 174         hasher.addCharactersAssumingAligned<T, Converter>(data, length);
 175         return hasher.hash();
 176     }
 177
 178     template<typename T> static unsigned computeHash(const T* data, unsigned length)
 179     {
 180         return computeHash<T, defaultConverter>(data, length);
 181     }
 182
 183     static unsigned hashMemory(const void* data, unsigned length)
 184     {
 185         // FIXME: Why does this function use the version of the hash that drops the top 8 bits?
 186         // We want that for all string hashing so we can use those bits in StringImpl and hash
 187         // strings consistently, but I don't see why we'd want that for general memory hashing.
 188         ASSERT(!(length % 2));
 189         return computeHashAndMaskTop8Bits<UChar>(static_cast<const UChar*>(data), length / sizeof(UChar));
 190     }
 191
 192     template<size_t length> static unsigned hashMemory(const void* data)
 193     {
 194         static_assert(!(length % 2), "length must be a multiple of two");
 195         return hashMemory(data, length);
 196     }
 197
 198 private:
 199     static UChar defaultConverter(UChar character)
 200     {
 201         return character;
 202     }
 203
 204     static UChar defaultConverter(LChar character)
 205     {
 206         return character;
 207     }
 208
 209     unsigned avalancheBits() const
 210     {
 211         unsigned result = m_hash;
 212
 213         // Handle end case.
 214         if (m_hasPendingCharacter) {
 215             result += m_pendingCharacter;
 216             result ^= result << 11;
 217             result += result >> 17;
 218         }
 219
 220         // Force "avalanching" of final 31 bits.
 221         result ^= result << 3;
 222         result += result >> 5;
 223         result ^= result << 2;
 224         result += result >> 15;
 225         result ^= result << 10;
 226
 227         return result;
 228     }
 229
 230     unsigned m_hash;
 231     bool m_hasPendingCharacter;
 232     UChar m_pendingCharacter;
 233 };
 234
 235 } // namespace WTF
 236
 237 using WTF::StringHasher;
 238
 239 #endif // WTF_StringHasher_h