hbase-common/src/main/java/org/apache/hadoop/hbase/util/JenkinsHash.java

   1 /**
   2  *
   3  * Licensed to the Apache Software Foundation (ASF) under one
   4  * or more contributor license agreements.  See the NOTICE file
   5  * distributed with this work for additional information
   6  * regarding copyright ownership.  The ASF licenses this file
   7  * to you under the Apache License, Version 2.0 (the
   8  * "License"); you may not use this file except in compliance
   9  * with the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 package org.apache.hadoop.hbase.util;
  21
  22 import static java.lang.Integer.rotateLeft;
  23
  24 import java.io.FileInputStream;
  25 import java.io.IOException;
  26
  27 import org.apache.yetus.audience.InterfaceAudience;
  28 import org.apache.yetus.audience.InterfaceStability;
  29
  30 /**
  31  * Produces 32-bit hash for hash table lookup.
  32  *
  33  * <pre>lookup3.c, by Bob Jenkins, May 2006, Public Domain.
  34  *
  35  * You can use this free for any purpose.  It's in the public domain.
  36  * It has no warranty.
  37  * </pre>
  38  *
  39  * @see <a href="http://burtleburtle.net/bob/c/lookup3.c">lookup3.c</a>
  40  * @see <a href="http://www.ddj.com/184410284">Hash Functions (and how this
  41  * function compares to others such as CRC, MD?, etc</a>
  42  * @see <a href="http://burtleburtle.net/bob/hash/doobs.html">Has update on the
  43  * Dr. Dobbs Article</a>
  44  */
  45 @InterfaceAudience.Private
  46 @InterfaceStability.Stable
  47 public class JenkinsHash extends Hash {
  48   private static final int BYTE_MASK = 0xff;
  49
  50   private static JenkinsHash _instance = new JenkinsHash();
  51
  52   public static Hash getInstance() {
  53     return _instance;
  54   }
  55
  56   /**
  57    * Compute the hash of the specified file
  58    * @param args name of file to compute hash of.
  59    * @throws IOException e
  60    */
  61   public static void main(String[] args) throws IOException {
  62     if (args.length != 1) {
  63       System.err.println("Usage: JenkinsHash filename");
  64       System.exit(-1);
  65     }
  66     FileInputStream in = new FileInputStream(args[0]);
  67     byte[] bytes = new byte[512];
  68     int value = 0;
  69     JenkinsHash hash = new JenkinsHash();
  70     try {
  71       for (int length = in.read(bytes); length > 0; length = in.read(bytes)) {
  72         value = hash.hash(new ByteArrayHashKey(bytes, 0, length), value);
  73       }
  74     } finally {
  75       in.close();
  76     }
  77     System.out.println(Math.abs(value));
  78   }
  79
  80   /**
  81    * taken from  hashlittle() -- hash a variable-length key into a 32-bit value
  82    *
  83    * @param hashKey the key to extract the  bytes for hash algo
  84    * @param initval can be any integer value
  85    * @return a 32-bit value.  Every bit of the key affects every bit of the
  86    * return value.  Two keys differing by one or two bits will have totally
  87    * different hash values.
  88    *
  89    * <p>The best hash table sizes are powers of 2.  There is no need to do mod
  90    * a prime (mod is sooo slow!).  If you need less than 32 bits, use a bitmask.
  91    * For example, if you need only 10 bits, do
  92    * <code>h = (h &amp; hashmask(10));</code>
  93    * In which case, the hash table should have hashsize(10) elements.
  94    *
  95    * <p>If you are hashing n strings byte[][] k, do it like this:
  96    * for (int i = 0, h = 0; i &lt; n; ++i) h = hash( k[i], h);
  97    *
  98    * <p>By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
  99    * code any way you wish, private, educational, or commercial.  It's free.
 100    *
 101    * <p>Use for hash table lookup, or anything where one collision in 2^^32 is
 102    * acceptable.  Do NOT use for cryptographic purposes.
 103   */
 104   @SuppressWarnings({"fallthrough", "MissingDefault"})
 105   @Override
 106   public <T> int hash(HashKey<T> hashKey, int initval) {
 107     int length = hashKey.length();
 108     int a, b, c;
 109     a = b = c = 0xdeadbeef + length + initval;
 110     int offset = 0;
 111     for (; length > 12; offset += 12, length -= 12) {
 112       a += (hashKey.get(offset) & BYTE_MASK);
 113       a += ((hashKey.get(offset + 1) & BYTE_MASK) <<  8);
 114       a += ((hashKey.get(offset + 2) & BYTE_MASK) << 16);
 115       a += ((hashKey.get(offset + 3) & BYTE_MASK) << 24);
 116       b += (hashKey.get(offset + 4) & BYTE_MASK);
 117       b += ((hashKey.get(offset + 5) & BYTE_MASK) <<  8);
 118       b += ((hashKey.get(offset + 6) & BYTE_MASK) << 16);
 119       b += ((hashKey.get(offset + 7) & BYTE_MASK) << 24);
 120       c += (hashKey.get(offset + 8) & BYTE_MASK);
 121       c += ((hashKey.get(offset + 9) & BYTE_MASK) <<  8);
 122       c += ((hashKey.get(offset + 10) & BYTE_MASK) << 16);
 123       c += ((hashKey.get(offset + 11) & BYTE_MASK) << 24);
 124
 125       /*
 126        * mix -- mix 3 32-bit values reversibly.
 127        * This is reversible, so any information in (a,b,c) before mix() is
 128        * still in (a,b,c) after mix().
 129        *
 130        * If four pairs of (a,b,c) inputs are run through mix(), or through
 131        * mix() in reverse, there are at least 32 bits of the output that
 132        * are sometimes the same for one pair and different for another pair.
 133        *
 134        * This was tested for:
 135        * - pairs that differed by one bit, by two bits, in any combination
 136        *   of top bits of (a,b,c), or in any combination of bottom bits of
 137        *   (a,b,c).
 138        * - "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 139        *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 140        *    is commonly produced by subtraction) look like a single 1-bit
 141        *    difference.
 142        * - the base values were pseudorandom, all zero but one bit set, or
 143        *   all zero plus a counter that starts at zero.
 144        *
 145        * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
 146        * satisfy this are
 147        *     4  6  8 16 19  4
 148        *     9 15  3 18 27 15
 149        *    14  9  3  7 17  3
 150        * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for
 151        * "differ" defined as + with a one-bit base and a two-bit delta.  I
 152        * used http://burtleburtle.net/bob/hash/avalanche.html to choose
 153        * the operations, constants, and arrangements of the variables.
 154        *
 155        * This does not achieve avalanche.  There are input bits of (a,b,c)
 156        * that fail to affect some output bits of (a,b,c), especially of a.
 157        * The most thoroughly mixed value is c, but it doesn't really even
 158        * achieve avalanche in c.
 159        *
 160        * This allows some parallelism.  Read-after-writes are good at doubling
 161        * the number of bits affected, so the goal of mixing pulls in the
 162        * opposite direction as the goal of parallelism.  I did what I could.
 163        * Rotates seem to cost as much as shifts on every machine I could lay
 164        * my hands on, and rotates are much kinder to the top and bottom bits,
 165        * so I used rotates.
 166        *
 167        * #define mix(a,b,c) \
 168        * { \
 169        *   a -= c;  a ^= rot(c, 4);  c += b; \
 170        *   b -= a;  b ^= rot(a, 6);  a += c; \
 171        *   c -= b;  c ^= rot(b, 8);  b += a; \
 172        *   a -= c;  a ^= rot(c,16);  c += b; \
 173        *   b -= a;  b ^= rot(a,19);  a += c; \
 174        *   c -= b;  c ^= rot(b, 4);  b += a; \
 175        * }
 176        *
 177        * mix(a,b,c);
 178        */
 179       a -= c; a ^= rotateLeft(c, 4); c += b;
 180       b -= a; b ^= rotateLeft(a, 6); a += c;
 181       c -= b; c ^= rotateLeft(b, 8); b += a;
 182       a -= c; a ^= rotateLeft(c, 16); c += b;
 183       b -= a; b ^= rotateLeft(a, 19); a += c;
 184       c -= b; c ^= rotateLeft(b, 4); b += a;
 185     }
 186
 187     //-------------------------------- last block: affect all 32 bits of (c)
 188     switch (length) {                   // all the case statements fall through
 189     case 12:
 190       c += ((hashKey.get(offset + 11) & BYTE_MASK) << 24);
 191     case 11:
 192       c += ((hashKey.get(offset + 10) & BYTE_MASK) << 16);
 193     case 10:
 194       c += ((hashKey.get(offset + 9) & BYTE_MASK) <<  8);
 195     case  9:
 196       c += (hashKey.get(offset + 8) & BYTE_MASK);
 197     case  8:
 198       b += ((hashKey.get(offset + 7) & BYTE_MASK) << 24);
 199     case  7:
 200       b += ((hashKey.get(offset + 6) & BYTE_MASK) << 16);
 201     case  6:
 202       b += ((hashKey.get(offset + 5) & BYTE_MASK) <<  8);
 203     case  5:
 204       b += (hashKey.get(offset + 4) & BYTE_MASK);
 205     case  4:
 206       a += ((hashKey.get(offset + 3) & BYTE_MASK) << 24);
 207     case  3:
 208       a += ((hashKey.get(offset + 2) & BYTE_MASK) << 16);
 209     case  2:
 210       a += ((hashKey.get(offset + 1) & BYTE_MASK) <<  8);
 211     case  1:
 212       //noinspection PointlessArithmeticExpression
 213       a += (hashKey.get(offset + 0) & BYTE_MASK);
 214       break;
 215     case  0:
 216       return c;
 217     }
 218     /*
 219      * final -- final mixing of 3 32-bit values (a,b,c) into c
 220      *
 221      * Pairs of (a,b,c) values differing in only a few bits will usually
 222      * produce values of c that look totally different.  This was tested for
 223      * - pairs that differed by one bit, by two bits, in any combination
 224      *   of top bits of (a,b,c), or in any combination of bottom bits of
 225      *   (a,b,c).
 226      *
 227      * - "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 228      *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 229      *   is commonly produced by subtraction) look like a single 1-bit
 230      *   difference.
 231      *
 232      * - the base values were pseudorandom, all zero but one bit set, or
 233      *   all zero plus a counter that starts at zero.
 234      *
 235      * These constants passed:
 236      *   14 11 25 16 4 14 24
 237      *   12 14 25 16 4 14 24
 238      * and these came close:
 239      *    4  8 15 26 3 22 24
 240      *   10  8 15 26 3 22 24
 241      *   11  8 15 26 3 22 24
 242      *
 243      * #define final(a,b,c) \
 244      * {
 245      *   c ^= b; c -= rot(b,14); \
 246      *   a ^= c; a -= rot(c,11); \
 247      *   b ^= a; b -= rot(a,25); \
 248      *   c ^= b; c -= rot(b,16); \
 249      *   a ^= c; a -= rot(c,4);  \
 250      *   b ^= a; b -= rot(a,14); \
 251      *   c ^= b; c -= rot(b,24); \
 252      * }
 253      *
 254      */
 255     c ^= b; c -= rotateLeft(b, 14);
 256     a ^= c; a -= rotateLeft(c, 11);
 257     b ^= a; b -= rotateLeft(a, 25);
 258     c ^= b; c -= rotateLeft(b, 16);
 259     a ^= c; a -= rotateLeft(c, 4);
 260     b ^= a; b -= rotateLeft(a, 14);
 261     c ^= b; c -= rotateLeft(b, 24);
 262     return c;
 263   }
 264 }