/*	$NetBSD: vfs_cache.c,v 1.83 2009/02/18 13:24:18 yamt Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cache.c	8.3 (Berkeley) 8/22/94
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.83 2009/02/18 13:24:18 yamt Exp $");

#include "opt_ddb.h"
#include "opt_revcache.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>

#define NAMECACHE_ENTER_REVERSE
/*
 * Name caching works as follows:
 *
 * Names found by directory scans are retained in a cache
 * for future reference.  It is managed LRU, so frequently
 * used names will hang around.  Cache is indexed by hash value
 * obtained from (dvp, name) where dvp refers to the directory
 * containing name.
 *
 * For simplicity (and economy of storage), names longer than
 * a maximum length of NCHNAMLEN are not cached; they occur
 * infrequently in any case, and are almost never of interest.
 *
 * Upon reaching the last segment of a path, if the reference
 * is for DELETE, or NOCACHE is set (rewrite), and the
 * name is located in the cache, it will be dropped.
 * The entry is dropped also when it was not possible to lock
 * the cached vnode, either because vget() failed or the generation
 * number has changed while waiting for the lock.
 */
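/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): a file system's lookup routine typically consults the cache
 * first and primes it after a real directory scan.  The helper names
 * below are hypothetical; only cache_lookup() and cache_enter() are
 * real interfaces from this file.
 */
#if 0
static int
example_fs_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int error;

	/* 0 = positive hit, ENOENT = negative hit, -1 = no answer. */
	error = cache_lookup(dvp, vpp, cnp);
	if (error >= 0)
		return error;

	/* Miss: scan the directory (fs-specific), then prime the cache. */
	error = example_fs_scan_directory(dvp, vpp, cnp);
	if (cnp->cn_flags & MAKEENTRY)
		cache_enter(dvp, (error == 0) ? *vpp : NULL, cnp);
	return error;
}
#endif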
/*
 * Per-cpu namecache data.
 */
struct nchcpu {
	kmutex_t	cpu_lock;
	struct nchstats	cpu_stats;
};

/*
 * Structures associated with name caching.
 */
LIST_HEAD(nchashhead, namecache) *nchashtbl;
u_long	nchash;				/* size of hash table - 1 */
#define	NCHASH(cnp, dvp)	\
	(((cnp)->cn_hash ^ ((uintptr_t)(dvp) >> 3)) & nchash)

LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl;
u_long	ncvhash;			/* size of hash table - 1 */
#define	NCVHASH(vp)		(((uintptr_t)(vp) >> 3) & ncvhash)
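/*
 * Illustrative expansion (editorial addition): with 1024 buckets
 * (nchash == 0x3ff), a precomputed name hash cn_hash of 0x5f3a and a
 * directory vnode at 0xc2a4d800, NCHASH() selects bucket
 * (0x5f3a ^ (0xc2a4d800 >> 3)) & 0x3ff == 0x3a.  The ">> 3" discards
 * low address bits that are identical for all vnodes due to allocator
 * alignment, so they add no distinguishing information.
 */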
long	numcache;			/* number of cache entries allocated */
static u_int	cache_gcpend;		/* number of entries pending GC */
static void	*cache_gcqueue;		/* garbage collection queue */

TAILQ_HEAD(, namecache) nclruhead =	/* LRU chain */
	TAILQ_HEAD_INITIALIZER(nclruhead);
#define	COUNT(c,x)	(c.x++)
struct	nchstats nchstats;		/* cache effectiveness statistics */

static pool_cache_t namecache_cache;

int cache_lowat = 95;
int cache_hiwat = 98;
int cache_hottime = 5;			/* number of seconds */
int doingcache = 1;			/* 1 => enable the cache */
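/*
 * Illustrative arithmetic (editorial addition): with desiredvnodes of,
 * say, 10000, cache_reclaim() below starts pruning once more than 9800
 * entries (cache_hiwat, 98%) are live and not yet queued for GC, and
 * prunes down to 9500 (cache_lowat, 95%), preferring entries that have
 * not scored a hit within cache_hottime (5) seconds.
 */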
static struct evcnt cache_ev_scan;
static struct evcnt cache_ev_gc;
static struct evcnt cache_ev_over;
static struct evcnt cache_ev_under;
static struct evcnt cache_ev_forced;

/* A single lock to serialize modifications. */
static kmutex_t *namecache_lock;

static void cache_invalidate(struct namecache *);
static inline struct namecache *cache_lookup_entry(
    const struct vnode *, const struct componentname *);
static void cache_thread(void *);
static void cache_disassociate(struct namecache *);
static void cache_reclaim(void);
static int cache_ctor(void *, void *, int);
static void cache_dtor(void *, void *);

/*
 * Invalidate a cache entry and enqueue it for garbage collection.
 */
static void
cache_invalidate(struct namecache *ncp)
{
	void *head;

	KASSERT(mutex_owned(&ncp->nc_lock));

	if (ncp->nc_dvp != NULL) {
		ncp->nc_vp = NULL;
		ncp->nc_dvp = NULL;
		do {
			head = cache_gcqueue;
			ncp->nc_gcqueue = head;
		} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
		atomic_inc_uint(&cache_gcpend);
	}
}
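/*
 * Editorial note (not in the original source): the CAS loop above is a
 * lock-free LIFO push.  cache_reclaim() drains the whole queue while
 * lookups are locked out, and a node is pushed at most once (nc_dvp is
 * cleared under nc_lock before the push), so the classic ABA hazard of
 * such stacks cannot arise here.
 */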
/*
 * Disassociate a namecache entry from any vnodes it is attached to,
 * and remove from the global LRU list.
 */
static void
cache_disassociate(struct namecache *ncp)
{

	KASSERT(mutex_owned(namecache_lock));
	KASSERT(ncp->nc_dvp == NULL);

	if (ncp->nc_lru.tqe_prev != NULL) {
		TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
		ncp->nc_lru.tqe_prev = NULL;
	}
	if (ncp->nc_vhash.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vhash);
		ncp->nc_vhash.le_prev = NULL;
	}
	if (ncp->nc_vlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_vlist);
		ncp->nc_vlist.le_prev = NULL;
	}
	if (ncp->nc_dvlist.le_prev != NULL) {
		LIST_REMOVE(ncp, nc_dvlist);
		ncp->nc_dvlist.le_prev = NULL;
	}
}
/*
 * Lock all CPUs to prevent any cache lookup activity.  Conceptually,
 * this locks out all "readers".
 */
static void
cache_lock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;
	long *s, *d, *m;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_enter(&cpup->cpu_lock);

		/* Collate statistics. */
		d = (long *)&nchstats;
		s = (long *)&cpup->cpu_stats;
		m = s + sizeof(nchstats) / sizeof(long);
		for (; s < m; s++, d++) {
			*d += *s;
			*s = 0;
		}
	}
}
/*
 * Release all CPU locks.
 */
static void
cache_unlock_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct nchcpu *cpup;

	for (CPU_INFO_FOREACH(cii, ci)) {
		cpup = ci->ci_data.cpu_nch;
		mutex_exit(&cpup->cpu_lock);
	}
}
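/*
 * Editorial note (not in the original source): writers follow a fixed
 * lock order, taking namecache_lock first and then every per-CPU lock
 * via cache_lock_cpus(); see nchreinit() and cache_reclaim() below for
 * the two callers that rely on this.
 */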
/*
 * Find a single cache entry and return it locked.  'namecache_lock' or
 * at least one of the per-CPU locks must be held.
 */
static struct namecache *
cache_lookup_entry(const struct vnode *dvp, const struct componentname *cnp)
{
	struct nchashhead *ncpp;
	struct namecache *ncp;

	KASSERT(dvp != NULL);
	ncpp = &nchashtbl[NCHASH(cnp, dvp)];

	LIST_FOREACH(ncp, ncpp, nc_hash) {
		if (ncp->nc_dvp != dvp ||
		    ncp->nc_nlen != cnp->cn_namelen ||
		    memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen))
			continue;
		mutex_enter(&ncp->nc_lock);
		if (__predict_true(ncp->nc_dvp == dvp)) {
			ncp->nc_hittime = hardclock_ticks;
			return ncp;
		}
		/* Raced: entry has been nullified. */
		mutex_exit(&ncp->nc_lock);
	}

	return NULL;
}
/*
 * Look for the name in the cache.  We don't do this
 * if the segment name is long, simply so the cache can avoid
 * holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with ni_dvp pointing to the directory to search,
 * ni_ptr pointing to the name of the entry being sought, ni_namelen
 * tells the length of the name, and ni_hash contains a hash of
 * the name.  If the lookup succeeds, the vnode is locked, stored in ni_vp
 * and a status of zero is returned.  If the locking fails for whatever
 * reason, the vnode is unlocked and the error is returned to caller.
 * If the lookup determines that the name does not exist (negative caching),
 * a status of ENOENT is returned.  If the lookup fails, a status of -1
 * is returned.
 */
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		*vpp = NULL;
		return -1;
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(cnp->cn_namelen > NCHNAMLEN)) {
		COUNT(cpup->cpu_stats, ncs_long);
		cnp->cn_flags &= ~MAKEENTRY;
		mutex_exit(&cpup->cpu_lock);
		*vpp = NULL;
		return -1;
	}
	ncp = cache_lookup_entry(dvp, cnp);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup->cpu_stats, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		*vpp = NULL;
		return -1;
	}
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		COUNT(cpup->cpu_stats, ncs_badhits);
		/*
		 * Last component and we are renaming or deleting,
		 * the cache entry is invalid, or otherwise don't
		 * want cache entry to exist.
		 */
		cache_invalidate(ncp);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		*vpp = NULL;
		return -1;
	} else if (ncp->nc_vp == NULL) {
		/*
		 * Restore the ISWHITEOUT flag saved earlier.
		 */
		KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
		cnp->cn_flags |= ncp->nc_flags;
		if (__predict_true(cnp->cn_nameiop != CREATE ||
		    (cnp->cn_flags & ISLASTCN) == 0)) {
			COUNT(cpup->cpu_stats, ncs_neghits);
			mutex_exit(&ncp->nc_lock);
			mutex_exit(&cpup->cpu_lock);
			return ENOENT;
		} else {
			COUNT(cpup->cpu_stats, ncs_badhits);
			/*
			 * Last component and we are renaming or
			 * deleting, the cache entry is invalid,
			 * or otherwise don't want cache entry to
			 * exist.
			 */
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			mutex_exit(&cpup->cpu_lock);
			*vpp = NULL;
			return -1;
		}
	}

	vp = ncp->nc_vp;
	if (vtryget(vp)) {
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
	} else {
		mutex_enter(&vp->v_interlock);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		error = vget(vp, LK_NOWAIT | LK_INTERLOCK);
		if (error) {
			KASSERT(error == EBUSY);
			/*
			 * This vnode is being cleaned out.
			 * XXX badhits?
			 */
			COUNT(cpup->cpu_stats, ncs_falsehits);
			*vpp = NULL;
			return -1;
		}
	}

#ifdef DEBUG
	/*
	 * since we released ncp->nc_lock,
	 * we can't use this pointer any more.
	 */
	ncp = NULL;
#endif /* DEBUG */

	if (vp == dvp) {	/* lookup on "." */
		error = 0;
	} else if (cnp->cn_flags & ISDOTDOT) {
		VOP_UNLOCK(dvp, 0);
		error = vn_lock(vp, LK_EXCLUSIVE);
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	} else {
		error = vn_lock(vp, LK_EXCLUSIVE);
	}

	/*
	 * Check that the lock succeeded.
	 */
	if (error) {
		/* Unlocked, but only for stats. */
		COUNT(cpup->cpu_stats, ncs_badhits);
		vrele(vp);
		*vpp = NULL;
		return -1;
	}

	/* Unlocked, but only for stats. */
	COUNT(cpup->cpu_stats, ncs_goodhits);
	*vpp = vp;
	return 0;
}
int
cache_lookup_raw(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	struct namecache *ncp;
	struct vnode *vp;
	struct nchcpu *cpup;
	int error;

	if (__predict_false(!doingcache)) {
		cnp->cn_flags &= ~MAKEENTRY;
		*vpp = NULL;
		return (-1);
	}

	cpup = curcpu()->ci_data.cpu_nch;
	mutex_enter(&cpup->cpu_lock);
	if (__predict_false(cnp->cn_namelen > NCHNAMLEN)) {
		COUNT(cpup->cpu_stats, ncs_long);
		cnp->cn_flags &= ~MAKEENTRY;
		mutex_exit(&cpup->cpu_lock);
		*vpp = NULL;
		return -1;
	}
	ncp = cache_lookup_entry(dvp, cnp);
	if (__predict_false(ncp == NULL)) {
		COUNT(cpup->cpu_stats, ncs_miss);
		mutex_exit(&cpup->cpu_lock);
		*vpp = NULL;
		return -1;
	}
	vp = ncp->nc_vp;
	if (vp == NULL) {
		/*
		 * Restore the ISWHITEOUT flag saved earlier.
		 */
		KASSERT((ncp->nc_flags & ~ISWHITEOUT) == 0);
		cnp->cn_flags |= ncp->nc_flags;
		COUNT(cpup->cpu_stats, ncs_neghits);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		return ENOENT;
	}
	if (vtryget(vp)) {
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
	} else {
		mutex_enter(&vp->v_interlock);
		mutex_exit(&ncp->nc_lock);
		mutex_exit(&cpup->cpu_lock);
		error = vget(vp, LK_NOWAIT | LK_INTERLOCK);
		if (error) {
			KASSERT(error == EBUSY);
			/*
			 * This vnode is being cleaned out.
			 * XXX badhits?
			 */
			COUNT(cpup->cpu_stats, ncs_falsehits);
			*vpp = NULL;
			return -1;
		}
	}

	/* Unlocked, but only for stats. */
	COUNT(cpup->cpu_stats, ncs_goodhits);	/* XXX can be "badhits" */
	*vpp = vp;
	return 0;
}
/*
 * Scan cache looking for name of directory entry pointing at vp.
 *
 * Fill in dvpp.
 *
 * If bufp is non-NULL, also place the name in the buffer which starts
 * at bufp, immediately before *bpp, and move bpp backwards to point
 * at the start of it.  (Yes, this is a little baroque, but it's done
 * this way to cater to the whims of getcwd).
 *
 * Returns 0 on success, -1 on cache miss, positive errno on failure.
 */
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
	struct namecache *ncp;
	struct vnode *dvp;
	struct ncvhashhead *nvcpp;
	char *bp;

	if (!doingcache)
		goto out;

	nvcpp = &ncvhashtbl[NCVHASH(vp)];

	mutex_enter(namecache_lock);
	LIST_FOREACH(ncp, nvcpp, nc_vhash) {
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_vp == vp &&
		    (dvp = ncp->nc_dvp) != NULL &&
		    dvp != vp) {		/* avoid pesky . entries.. */

#ifdef DIAGNOSTIC
			if (ncp->nc_nlen == 1 &&
			    ncp->nc_name[0] == '.')
				panic("cache_revlookup: found entry for .");

			if (ncp->nc_nlen == 2 &&
			    ncp->nc_name[0] == '.' &&
			    ncp->nc_name[1] == '.')
				panic("cache_revlookup: found entry for ..");
#endif
			COUNT(nchstats, ncs_revhits);

			if (bufp) {
				bp = *bpp;
				bp -= ncp->nc_nlen;
				if (bp <= bufp) {
					*dvpp = NULL;
					mutex_exit(&ncp->nc_lock);
					mutex_exit(namecache_lock);
					return (ERANGE);
				}
				memcpy(bp, ncp->nc_name, ncp->nc_nlen);
				*bpp = bp;
			}

			/* XXX MP: how do we know dvp won't evaporate? */
			*dvpp = dvp;
			mutex_exit(&ncp->nc_lock);
			mutex_exit(namecache_lock);
			return (0);
		}
		mutex_exit(&ncp->nc_lock);
	}
	COUNT(nchstats, ncs_revmiss);
	mutex_exit(namecache_lock);
 out:
	*dvpp = NULL;
	return (-1);
}
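/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): how a getcwd-style walk might consume the bpp/bufp protocol
 * above, assembling the path from the end of the buffer backwards and
 * falling back to reading ".." on a miss.
 */
#if 0
	char buf[MAXPATHLEN], *bp;
	struct vnode *dvp;

	bp = buf + MAXPATHLEN;		/* one past the end of the buffer */
	*--bp = '\0';
	while (vp != rootvnode) {
		if (cache_revlookup(vp, &dvp, &bp, buf) != 0)
			break;		/* miss or ERANGE: do it the hard way */
		*--bp = '/';		/* separate the next component */
		vp = dvp;		/* step up to the parent */
	}
	/* on success the assembled path now starts at bp */
#endif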
/*
 * Add an entry to the cache.
 */
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct namecache *oncp;
	struct nchashhead *ncpp;
	struct ncvhashhead *nvcpp;

#ifdef DIAGNOSTIC
	if (cnp->cn_namelen > NCHNAMLEN)
		panic("cache_enter: name too long");
#endif
	if (!doingcache)
		return;

	if (numcache > desiredvnodes) {
		mutex_enter(namecache_lock);
		cache_ev_forced.ev_count++;
		cache_reclaim();
		mutex_exit(namecache_lock);
	}

	ncp = pool_cache_get(namecache_cache, PR_WAITOK);
	mutex_enter(namecache_lock);
	numcache++;

	/*
	 * Concurrent lookups in the same directory may race for a
	 * cache entry.  If there's a duplicated entry, free it.
	 */
	oncp = cache_lookup_entry(dvp, cnp);
	if (oncp) {
		cache_invalidate(oncp);
		mutex_exit(&oncp->nc_lock);
	}

	/* Grab the vnode we just found. */
	mutex_enter(&ncp->nc_lock);
	ncp->nc_vp = vp;
	ncp->nc_flags = 0;
	ncp->nc_hittime = 0;
	ncp->nc_gcqueue = NULL;
	if (vp == NULL) {
		/*
		 * For negative hits, save the ISWHITEOUT flag so we can
		 * restore it later when the cache entry is used again.
		 */
		ncp->nc_flags = cnp->cn_flags & ISWHITEOUT;
	}
	/* Fill in cache info. */
	ncp->nc_dvp = dvp;
	LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
	if (vp)
		LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
	else {
		ncp->nc_vlist.le_prev = NULL;
		ncp->nc_vlist.le_next = NULL;
	}
	ncp->nc_nlen = cnp->cn_namelen;
	TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
	memcpy(ncp->nc_name, cnp->cn_nameptr, (unsigned)ncp->nc_nlen);
	ncpp = &nchashtbl[NCHASH(cnp, dvp)];

	/*
	 * Flush updates before making visible in table.  No need for a
	 * memory barrier on the other side: to see modifications the
	 * list must be followed, meaning a dependent pointer load.
	 * The below is LIST_INSERT_HEAD() inlined, with the memory
	 * barrier included in the correct place.
	 */
	if ((ncp->nc_hash.le_next = ncpp->lh_first) != NULL)
		ncpp->lh_first->nc_hash.le_prev = &ncp->nc_hash.le_next;
	ncp->nc_hash.le_prev = &ncpp->lh_first;
	membar_producer();
	ncpp->lh_first = ncp;

	ncp->nc_vhash.le_prev = NULL;
	ncp->nc_vhash.le_next = NULL;

	/*
	 * Create reverse-cache entries (used in getcwd) for directories.
	 * (and in linux procfs exe node)
	 */
	if (vp != NULL &&
	    vp != dvp &&
#ifndef NAMECACHE_ENTER_REVERSE
	    vp->v_type == VDIR &&
#endif
	    (ncp->nc_nlen > 2 ||
	    (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
	    (/* ncp->nc_nlen > 0 && */ ncp->nc_name[0] != '.'))) {
		nvcpp = &ncvhashtbl[NCVHASH(vp)];
		LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
	}
	mutex_exit(&ncp->nc_lock);
	mutex_exit(namecache_lock);
}
/*
 * Name cache initialization, from vfs_init() when we are booting.
 */
void
nchinit(void)
{
	int error;

	namecache_cache = pool_cache_init(sizeof(struct namecache),
	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
	    cache_dtor, NULL);
	KASSERT(namecache_cache != NULL);

	namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);

	nchashtbl = hashinit(desiredvnodes, HASH_LIST, true, &nchash);
	ncvhashtbl =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &ncvhash);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &ncvhash);
#endif

	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
	    NULL, NULL, "cachegc");
	if (error != 0)
		panic("nchinit %d", error);

	evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries scanned");
	evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
	    "namecache", "entries collected");
	evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
	    "namecache", "over scan target");
	evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
	    "namecache", "under scan target");
	evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
	    "namecache", "forced reclaims");
}
static int
cache_ctor(void *arg, void *obj, int flag)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);

	return 0;
}

static void
cache_dtor(void *arg, void *obj)
{
	struct namecache *ncp;

	ncp = obj;
	mutex_destroy(&ncp->nc_lock);
}
/*
 * Called once for each CPU in the system as attached.
 */
void
cache_cpu_init(struct cpu_info *ci)
{
	struct nchcpu *cpup;
	size_t sz;

	sz = roundup2(sizeof(*cpup), coherency_unit) + coherency_unit;
	cpup = kmem_zalloc(sz, KM_SLEEP);
	cpup = (void *)roundup2((uintptr_t)cpup, coherency_unit);
	mutex_init(&cpup->cpu_lock, MUTEX_DEFAULT, IPL_NONE);
	ci->ci_data.cpu_nch = cpup;
}
/*
 * Name cache reinitialization, for when the maximum number of vnodes increases.
 */
void
nchreinit(void)
{
	struct namecache *ncp;
	struct nchashhead *oldhash1, *hash1;
	struct ncvhashhead *oldhash2, *hash2;
	u_long i, oldmask1, oldmask2, mask1, mask2;

	hash1 = hashinit(desiredvnodes, HASH_LIST, true, &mask1);
	hash2 =
#ifdef NAMECACHE_ENTER_REVERSE
	    hashinit(desiredvnodes, HASH_LIST, true, &mask2);
#else
	    hashinit(desiredvnodes/8, HASH_LIST, true, &mask2);
#endif
	mutex_enter(namecache_lock);
	cache_lock_cpus();
	oldhash1 = nchashtbl;
	oldmask1 = nchash;
	nchashtbl = hash1;
	nchash = mask1;
	oldhash2 = ncvhashtbl;
	oldmask2 = ncvhash;
	ncvhashtbl = hash2;
	ncvhash = mask2;
	for (i = 0; i <= oldmask1; i++) {
		while ((ncp = LIST_FIRST(&oldhash1[i])) != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
	}
	for (i = 0; i <= oldmask2; i++) {
		while ((ncp = LIST_FIRST(&oldhash2[i])) != NULL) {
			LIST_REMOVE(ncp, nc_vhash);
			ncp->nc_vhash.le_prev = NULL;
		}
	}
	cache_unlock_cpus();
	mutex_exit(namecache_lock);
	hashdone(oldhash1, HASH_LIST, oldmask1);
	hashdone(oldhash2, HASH_LIST, oldmask2);
}
/*
 * Cache flush, a particular vnode; called when a vnode is renamed to
 * hide entries that would now be invalid.
 */
void
cache_purge1(struct vnode *vp, const struct componentname *cnp, int flags)
{
	struct namecache *ncp, *ncnext;

	mutex_enter(namecache_lock);
	if (flags & PURGE_PARENTS) {
		for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_vlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (flags & PURGE_CHILDREN) {
		for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL;
		    ncp = ncnext) {
			ncnext = LIST_NEXT(ncp, nc_dvlist);
			mutex_enter(&ncp->nc_lock);
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	if (cnp != NULL) {
		ncp = cache_lookup_entry(vp, cnp);
		if (ncp) {
			cache_invalidate(ncp);
			mutex_exit(&ncp->nc_lock);
			cache_disassociate(ncp);
		}
	}
	mutex_exit(namecache_lock);
}
/*
 * Cache flush, a whole filesystem; called when filesys is umounted to
 * remove entries that would now be invalid.
 */
void
cache_purgevfs(struct mount *mp)
{
	struct namecache *ncp, *nxtcp;

	mutex_enter(namecache_lock);
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
			/* Free the resources we had. */
			cache_invalidate(ncp);
			cache_disassociate(ncp);
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_reclaim();
	mutex_exit(namecache_lock);
}
/*
 * Scan global list invalidating entries until we meet a preset target.
 * Prefer to invalidate entries that have not scored a hit within
 * cache_hottime seconds.  We sort the LRU list only for this routine's
 * benefit.
 */
static void
cache_prune(int incache, int target)
{
	struct namecache *ncp, *nxtcp, *sentinel;
	int items, recent, tryharder;

	KASSERT(mutex_owned(namecache_lock));

	items = 0;
	tryharder = 0;
	recent = hardclock_ticks - hz * cache_hottime;
	sentinel = NULL;
	for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
		if (incache <= target)
			break;
		items++;
		nxtcp = TAILQ_NEXT(ncp, nc_lru);
		if (ncp->nc_dvp == NULL)
			continue;
		if (ncp == sentinel) {
			/*
			 * If we looped back on ourself, then ignore
			 * recent entries and purge whatever we find.
			 */
			tryharder = 1;
		}
		if (!tryharder && (ncp->nc_hittime - recent) > 0) {
			if (sentinel == NULL)
				sentinel = ncp;
			TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
			TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
			continue;
		}
		mutex_enter(&ncp->nc_lock);
		if (ncp->nc_dvp != NULL) {
			cache_invalidate(ncp);
			cache_disassociate(ncp);
			incache--;
		}
		mutex_exit(&ncp->nc_lock);
	}
	cache_ev_scan.ev_count += items;
}
/*
 * Collect dead cache entries from all CPUs and garbage collect.
 */
static void
cache_reclaim(void)
{
	struct namecache *ncp, *next;
	int items;

	KASSERT(mutex_owned(namecache_lock));

	/*
	 * If the number of extant entries not awaiting garbage collection
	 * exceeds the high water mark, then reclaim stale entries until we
	 * reach our low water mark.
	 */
	items = numcache - cache_gcpend;
	if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
		cache_prune(items, (int)((uint64_t)desiredvnodes *
		    cache_lowat / 100));
		cache_ev_over.ev_count++;
	} else
		cache_ev_under.ev_count++;

	/*
	 * Stop forward lookup activity on all CPUs and garbage collect dead
	 * entries.
	 */
	cache_lock_cpus();
	ncp = cache_gcqueue;
	cache_gcqueue = NULL;
	items = cache_gcpend;
	cache_gcpend = 0;
	while (ncp != NULL) {
		next = ncp->nc_gcqueue;
		cache_disassociate(ncp);
		KASSERT(ncp->nc_dvp == NULL);
		if (ncp->nc_hash.le_prev != NULL) {
			LIST_REMOVE(ncp, nc_hash);
			ncp->nc_hash.le_prev = NULL;
		}
		pool_cache_put(namecache_cache, ncp);
		ncp = next;
	}
	cache_unlock_cpus();
	numcache -= items;
	cache_ev_gc.ev_count += items;
}
/*
 * Cache maintenance thread, awakening once per second to:
 *
 * => keep number of entries below the high water mark
 * => sort pseudo-LRU list
 * => garbage collect dead entries
 */
static void
cache_thread(void *arg)
{

	mutex_enter(namecache_lock);
	for (;;) {
		cache_reclaim();
		kpause("cachegc", false, hz, namecache_lock);
	}
}
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
	struct vnode *dvp = NULL;
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
			(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
			dvp = ncp->nc_dvp;
		}
	}
	if (dvp == NULL) {
		(*pr)("name not found\n");
		return;
	}
	vp = dvp;
	TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
		if (ncp->nc_vp == vp) {
			(*pr)("parent %.*s\n", ncp->nc_nlen, ncp->nc_name);
		}
	}
}
#endif