/* Commit: removed some of the debug logging and added author details
 * [httpd-crcsyncproxy.git] / modules / cache / cache_hash.c
 * blob: 202cf9f70c40665bddbafcd2eb9ad459bd23c13a
 */
/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "apr_general.h"

#include "mod_cache.h"
#include "cache_hash.h"

#if APR_HAVE_STDLIB_H
#include <stdlib.h>
#endif
#if APR_HAVE_STRING_H
#include <string.h>
#endif
31 * The internal form of a hash table.
33 * The table is an array indexed by the hash of the key; collisions
34 * are resolved by hanging a linked list of hash entries off each
35 * element of the array. Although this is a really simple design it
36 * isn't too bad given that pools have a low allocation overhead.
39 typedef struct cache_hash_entry_t cache_hash_entry_t;
41 struct cache_hash_entry_t {
42 cache_hash_entry_t *next;
43 unsigned int hash;
44 const void *key;
45 apr_ssize_t klen;
46 const void *val;
50 * Data structure for iterating through a hash table.
52 * We keep a pointer to the next hash entry here to allow the current
53 * hash entry to be freed or otherwise mangled between calls to
54 * cache_hash_next().
56 struct cache_hash_index_t {
57 cache_hash_t *ht;
58 cache_hash_entry_t *this, *next;
59 int index;
63 * The size of the array is always a power of two. We use the maximum
64 * index rather than the size so that we can use bitwise-AND for
65 * modular arithmetic.
66 * The count of hash entries may be greater depending on the chosen
67 * collision rate.
69 struct cache_hash_t {
70 cache_hash_entry_t **array;
71 cache_hash_index_t iterator; /* For cache_hash_first(NULL, ...) */
72 int count, max;
76 * Hash creation functions.
78 static cache_hash_entry_t **alloc_array(cache_hash_t *ht, int max)
80 return calloc(1, sizeof(*ht->array) * (max + 1));
83 cache_hash_t* cache_hash_make(apr_size_t size)
85 cache_hash_t *ht;
86 ht = malloc(sizeof(cache_hash_t));
87 if (!ht) {
88 return NULL;
90 ht->count = 0;
91 ht->max = size;
92 ht->array = alloc_array(ht, ht->max);
93 if (!ht->array) {
94 free(ht);
95 return NULL;
97 return ht;
100 void cache_hash_free(cache_hash_t *ht)
102 if (ht) {
103 if (ht->array) {
104 free (ht->array);
106 free (ht);
110 * Hash iteration functions.
113 cache_hash_index_t* cache_hash_next(cache_hash_index_t *hi)
115 hi->this = hi->next;
116 while (!hi->this) {
117 if (hi->index > hi->ht->max)
118 return NULL;
119 hi->this = hi->ht->array[hi->index++];
121 hi->next = hi->this->next;
122 return hi;
125 cache_hash_index_t* cache_hash_first(cache_hash_t *ht)
127 cache_hash_index_t *hi;
129 hi = &ht->iterator;
130 hi->ht = ht;
131 hi->index = 0;
132 hi->this = NULL;
133 hi->next = NULL;
134 return cache_hash_next(hi);
137 void cache_hash_this(cache_hash_index_t *hi,
138 const void **key,
139 apr_ssize_t *klen,
140 void **val)
142 if (key) *key = hi->this->key;
143 if (klen) *klen = hi->this->klen;
144 if (val) *val = (void *)hi->this->val;
149 * This is where we keep the details of the hash function and control
150 * the maximum collision rate.
152 * If val is non-NULL it creates and initializes a new hash entry if
153 * there isn't already one there; it returns an updatable pointer so
154 * that hash entries can be removed.
157 static cache_hash_entry_t **find_entry(cache_hash_t *ht,
158 const void *key,
159 apr_ssize_t klen,
160 const void *val)
162 cache_hash_entry_t **hep, *he;
163 const unsigned char *p;
164 unsigned int hash;
165 apr_ssize_t i;
168 * This is the popular `times 33' hash algorithm which is used by
169 * perl and also appears in Berkeley DB. This is one of the best
170 * known hash functions for strings because it is both computed
171 * very fast and distributes very well.
173 * The originator may be Dan Bernstein but the code in Berkeley DB
174 * cites Chris Torek as the source. The best citation I have found
175 * is "Chris Torek, Hash function for text in C, Usenet message
176 * <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich
177 * Salz's USENIX 1992 paper about INN which can be found at
178 * <http://citeseer.nj.nec.com/salz92internetnews.html>.
180 * The magic of number 33, i.e. why it works better than many other
181 * constants, prime or not, has never been adequately explained by
182 * anyone. So I try an explanation: if one experimentally tests all
183 * multipliers between 1 and 256 (as I did while writing a low-level
184 * data structure library some time ago) one detects that even
185 * numbers are not useable at all. The remaining 128 odd numbers
186 * (except for the number 1) work more or less all equally well.
187 * They all distribute in an acceptable way and this way fill a hash
188 * table with an average percent of approx. 86%.
190 * If one compares the chi^2 values of the variants (see
191 * Bob Jenkins ``Hashing Frequently Asked Questions'' at
192 * http://burtleburtle.net/bob/hash/hashfaq.html for a description
193 * of chi^2), the number 33 not even has the best value. But the
194 * number 33 and a few other equally good numbers like 17, 31, 63,
195 * 127 and 129 have nevertheless a great advantage to the remaining
196 * numbers in the large set of possible multipliers: their multiply
197 * operation can be replaced by a faster operation based on just one
198 * shift plus either a single addition or subtraction operation. And
199 * because a hash function has to both distribute good _and_ has to
200 * be very fast to compute, those few numbers should be preferred.
202 * -- Ralf S. Engelschall <rse@engelschall.com>
204 hash = 0;
205 if (klen == CACHE_HASH_KEY_STRING) {
206 for (p = key; *p; p++) {
207 hash = hash * 33 + *p;
209 klen = p - (const unsigned char *)key;
211 else {
212 for (p = key, i = klen; i; i--, p++) {
213 hash = hash * 33 + *p;
217 /* scan linked list */
218 for (hep = &ht->array[hash % ht->max], he = *hep;
220 hep = &he->next, he = *hep) {
221 if (he->hash == hash &&
222 he->klen == klen &&
223 memcmp(he->key, key, klen) == 0)
224 break;
226 if (he || !val)
227 return hep;
228 /* add a new entry for non-NULL values */
229 he = malloc(sizeof(*he));
230 if (!he) {
231 return NULL;
233 he->next = NULL;
234 he->hash = hash;
235 he->key = key;
236 he->klen = klen;
237 he->val = val;
238 *hep = he;
239 ht->count++;
240 return hep;
243 void* cache_hash_get(cache_hash_t *ht,
244 const void *key,
245 apr_ssize_t klen)
247 cache_hash_entry_t *he;
248 he = *find_entry(ht, key, klen, NULL);
249 if (he)
250 return (void *)he->val;
251 else
252 return NULL;
255 void* cache_hash_set(cache_hash_t *ht,
256 const void *key,
257 apr_ssize_t klen,
258 const void *val)
260 cache_hash_entry_t **hep, *tmp;
261 const void *tval;
262 hep = find_entry(ht, key, klen, val);
263 /* If hep == NULL, then the malloc() in find_entry failed */
264 if (hep && *hep) {
265 if (!val) {
266 /* delete entry */
267 tval = (*hep)->val;
268 tmp = *hep;
269 *hep = (*hep)->next;
270 free(tmp);
271 --ht->count;
273 else {
274 /* replace entry */
275 tval = (*hep)->val;
276 (*hep)->val = val;
278 /* Return the object just removed from the cache to let the
279 * caller clean it up. Cast the constness away upon return.
281 return (void *) tval;
283 /* else key not present and val==NULL */
284 return NULL;
287 int cache_hash_count(cache_hash_t *ht)
289 return ht->count;