/*	$NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 *
 * This code is derived from software contributed to The NetBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 1997 Marshall Kirk McKusick.  All Rights Reserved.
 *
 * This code is derived from work done by Greg Ganger at the
 * University of Michigan.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, or the University of Michigan
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/proc.h>		/* curlwp */
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>		/* kmem_alloc() */
#include <sys/errno.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>
static void vn_syncer_add1(struct vnode *, int);

/*
 * Defines and variables for the syncer process.
 */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;			/* max time to delay syncing data */
time_t filedelay = 30;			/* time to delay syncing files */
time_t dirdelay  = 15;			/* time to delay syncing directories */
time_t metadelay = 10;			/* time to delay syncing metadata */
time_t lockdelay = 1;			/* time to delay if locking fails */

kmutex_t syncer_mutex;			/* used to freeze syncer, long term */
static kmutex_t syncer_data_lock;	/* short term lock on data structures */

static int rushjob;			/* number of slots to run ASAP */
static kcondvar_t syncer_cv;		/* cv for rushjob */
static int stat_rush_requests;		/* number of times I/O speeded up */

static int syncer_delayno = 0;
static long syncer_last;
static struct synclist *syncer_workitem_pending;
struct lwp *updateproc = NULL;

static void sysctl_vfs_syncfs_setup(struct sysctllog **);
void
vn_initialize_syncerd(void)
{
	int i;

	syncer_last = SYNCER_MAXDELAY + 2;

	sysctl_vfs_syncfs_setup(NULL);

	syncer_workitem_pending =
	    kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);

	for (i = 0; i < syncer_last; i++)
		TAILQ_INIT(&syncer_workitem_pending[i]);

	mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&syncer_cv, "syncer");
}
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, directory updates are
 * delayed only about half of the time that file data is delayed, and
 * filesystem metadata, which is more critical still, is delayed only
 * about a third of the time.  There are SYNCER_MAXDELAY queues that
 * are processed round-robin at a rate of one each second (driven off
 * the filesystem syncer process).  The syncer_delayno variable
 * indicates the next queue that is to be processed.  Items that need
 * to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
 */
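
/*
 * Worked example of the slot arithmetic above (the concrete numbers are
 * hypothetical): if syncer_delayno is currently 10 and a vnode is queued
 * with a delay of 15 seconds, it lands in slot (10 + 15) % syncer_last.
 * Since the syncer drains one slot per second, that vnode is not
 * revisited for roughly 15 seconds.
 */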
/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
	struct synclist *slp;

	KASSERT(mutex_owned(&syncer_data_lock));

	if (vp->v_iflag & VI_ONWORKLST) {
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	} else {
		/*
		 * We must not modify v_iflag if the vnode
		 * is already on a synclist: sched_sync()
		 * calls this routine while holding only
		 * syncer_data_lock in order to adjust the
		 * position of the vnode.  syncer_data_lock
		 * does not protect v_iflag.
		 */
		KASSERT(mutex_owned(&vp->v_interlock));
		vp->v_iflag |= VI_ONWORKLST;
	}

	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	vp->v_synclist_slot = (syncer_delayno + delayx) % syncer_last;

	slp = &syncer_workitem_pending[vp->v_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vp, v_synclist);
}
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{

	KASSERT(mutex_owned(&vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	vn_syncer_add1(vp, delayx);
	mutex_exit(&syncer_data_lock);
}
/*
 * Remove an item from the syncer work queue.
 */
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
	struct synclist *slp;

	KASSERT(mutex_owned(&vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	if (vp->v_iflag & VI_ONWORKLST) {
		vp->v_iflag &= ~VI_ONWORKLST;
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	}
	mutex_exit(&syncer_data_lock);
}
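
#if 0
/*
 * Minimal usage sketch for the two routines above; the callers below are
 * hypothetical and not part of this file.  A filesystem that has just
 * dirtied a vnode queues it for a delayed sync, and drops it from the
 * worklist once it becomes clean.  Both calls require v_interlock held.
 */
static void
example_mark_vnode_dirty(struct vnode *vp)
{

	mutex_enter(&vp->v_interlock);
	if ((vp->v_iflag & VI_ONWORKLST) == 0)
		vn_syncer_add_to_worklist(vp, (int)filedelay);
	mutex_exit(&vp->v_interlock);
}

static void
example_mark_vnode_clean(struct vnode *vp)
{

	mutex_enter(&vp->v_interlock);
	vn_syncer_remove_from_worklist(vp);
	mutex_exit(&vp->v_interlock);
}
#endif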
/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void *arg)
{
	struct synclist *slp;
	struct vnode *vp;
	time_t starttime;
	bool synced;

	for (;;) {
		mutex_enter(&syncer_mutex);
		mutex_enter(&syncer_data_lock);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno >= syncer_last)
			syncer_delayno = 0;

		while ((vp = TAILQ_FIRST(slp)) != NULL) {
			/* We are locking in the wrong direction. */
			synced = false;
			if (mutex_tryenter(&vp->v_interlock)) {
				mutex_exit(&syncer_data_lock);
				if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT |
				    LK_INTERLOCK) == 0) {
					synced = true;
					(void) VOP_FSYNC(vp, curlwp->l_cred,
					    FSYNC_LAZY, 0, 0);
					vput(vp);
				}
				mutex_enter(&syncer_data_lock);
			}

			/*
			 * XXX The vnode may have been recycled, in which
			 * case it may have a new identity.
			 */
			if (TAILQ_FIRST(slp) == vp) {
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 *
				 * Try again sooner rather than later if
				 * we were unable to lock the vnode.  Lock
				 * failure should not prevent us from doing
				 * the sync "soon".
				 *
				 * If we did lock it and still arrive here,
				 * it is likely that lazy sync is in progress
				 * and so the vnode still has dirty metadata.
				 * syncdelay is mainly to get this vnode out
				 * of the way so we do not consider it again
				 * "soon" in this loop, so the delay time is
				 * not critical as long as it is not "soon".
				 * While write-back strategy is the file
				 * system's domain, we expect write-back to
				 * occur no later than syncdelay seconds
				 * into the future.
				 */
				vn_syncer_add1(vp,
				    synced ? syncdelay : lockdelay);
			}
		}
		mutex_exit(&syncer_mutex);

		/*
		 * Wait until there are more workitems to process.
		 */

		/*
		 * The variable rushjob allows the kernel to speed
		 * up the processing of the filesystem syncer
		 * process.  A rushjob value of N tells the
		 * filesystem syncer to process the next N seconds
		 * worth of work on its queue ASAP.  Currently
		 * rushjob is used by the soft update code to
		 * speed up the filesystem syncer process when the
		 * incore state is getting so far ahead of the
		 * disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0)
			rushjob -= 1;

		/*
		 * If it has taken us less than a second to
		 * process the current work, then wait.  Otherwise
		 * start right over again.  We can still lose time
		 * if any single round takes more than two
		 * seconds, but it does not really matter as we
		 * are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			cv_timedwait(&syncer_cv, &syncer_data_lock, hz);

		mutex_exit(&syncer_data_lock);
	}
}
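
/*
 * Worked illustration of the pacing above (hypothetical numbers): with
 * the default syncdelay of 30 seconds, speedup_syncer() below caps
 * rushjob at 15, so at most 15 requests to work ahead can be outstanding
 * at once.  Each such request also signals syncer_cv, cutting short the
 * up-to-one-second cv_timedwait() pause, and the outstanding rushjob
 * count is consumed one unit per pass through the loop.
 */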
/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the CPU.
 */
int
speedup_syncer(void)
{

	mutex_enter(&syncer_data_lock);
	if (rushjob >= syncdelay / 2) {
		mutex_exit(&syncer_data_lock);
		return (0);
	}
	rushjob += 1;
	cv_signal(&syncer_cv);
	stat_rush_requests += 1;
	mutex_exit(&syncer_data_lock);

	return (1);
}
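
#if 0
/*
 * Usage sketch for speedup_syncer(); the caller and its threshold are
 * hypothetical, not part of this file.  A subsystem that sees its dirty
 * backlog growing faster than the syncer drains it can ask for the next
 * queue slot to be processed immediately.  A zero return means the
 * syncer is already boosted as far as allowed (syncdelay / 2 slots).
 */
static void
example_note_dirty_backlog(int ndirty, int hiwat)
{

	if (ndirty > hiwat)
		(void)speedup_syncer();
}
#endif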
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(clog, 0, NULL, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "vfs", NULL,
			NULL, 0, NULL, 0,
			CTL_VFS, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "sync",
			SYSCTL_DESCR("syncer options"),
			NULL, 0, NULL, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "delay",
			SYSCTL_DESCR("max time to delay syncing data"),
			NULL, 0, &syncdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "filedelay",
			SYSCTL_DESCR("time to delay syncing files"),
			NULL, 0, &filedelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "dirdelay",
			SYSCTL_DESCR("time to delay syncing directories"),
			NULL, 0, &dirdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "metadelay",
			SYSCTL_DESCR("time to delay syncing metadata"),
			NULL, 0, &metadelay, 0,
			CTL_CREATE, CTL_EOL);
}