Fix up mix of man(7)/mdoc(7).
[netbsd-mini2440.git] / sys / miscfs / syncfs / sync_subr.c
blobd0b64c50ba41f1dbb5407e477ba7f7a42dcb79e4
1 /* $NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $ */
3 /*-
4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
35 * This code is derived from work done by Greg Ganger at the
36 * University of Michigan.
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. None of the names of McKusick, Ganger, or the University of Michigan
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
63 #include <sys/cdefs.h>
64 __KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $");
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/sysctl.h>
69 #include <sys/kernel.h>
70 #include <sys/proc.h>
71 #include <sys/mount.h>
72 #include <sys/time.h>
73 #include <sys/vnode.h>
74 #include <sys/buf.h>
75 #include <sys/errno.h>
76 #include <sys/kmem.h>
78 #include <miscfs/genfs/genfs.h>
79 #include <miscfs/syncfs/syncfs.h>
81 static void vn_syncer_add1(struct vnode *, int);
84 * Defines and variables for the syncer process.
86 int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
87 time_t syncdelay = 30; /* max time to delay syncing data */
88 time_t filedelay = 30; /* time to delay syncing files */
89 time_t dirdelay = 15; /* time to delay syncing directories */
90 time_t metadelay = 10; /* time to delay syncing metadata */
91 time_t lockdelay = 1; /* time to delay if locking fails */
93 kmutex_t syncer_mutex; /* used to freeze syncer, long term */
94 static kmutex_t syncer_data_lock; /* short term lock on data structures */
96 static int rushjob; /* number of slots to run ASAP */
97 static kcondvar_t syncer_cv; /* cv for rushjob */
98 static int stat_rush_requests; /* number of times I/O speeded up */
100 static int syncer_delayno = 0;
101 static long syncer_last;
102 static struct synclist *syncer_workitem_pending;
103 struct lwp *updateproc = NULL;
105 static void sysctl_vfs_syncfs_setup(struct sysctllog **);
107 void
108 vn_initialize_syncerd(void)
110 int i;
112 syncer_last = SYNCER_MAXDELAY + 2;
114 sysctl_vfs_syncfs_setup(NULL);
116 syncer_workitem_pending =
117 kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);
119 for (i = 0; i < syncer_last; i++)
120 TAILQ_INIT(&syncer_workitem_pending[i]);
122 mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE);
123 mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
124 cv_init(&syncer_cv, "syncer");
128 * The workitem queue.
130 * It is useful to delay writes of file data and filesystem metadata
131 * for tens of seconds so that quickly created and deleted files need
132 * not waste disk bandwidth being created and removed. To realize this,
133 * we append vnodes to a "workitem" queue. When running with a soft
134 * updates implementation, most pending metadata dependencies should
135 * not wait for more than a few seconds. Thus, mounted on block devices
136 * are delayed only about a half the time that file data is delayed.
137 * Similarly, directory updates are more critical, so are only delayed
138 * about a third the time that file data is delayed. Thus, there are
139 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
140 * one each second (driven off the filesystem syncer process). The
141 * syncer_delayno variable indicates the next queue that is to be processed.
142 * Items that need to be processed soon are placed in this queue:
144 * syncer_workitem_pending[syncer_delayno]
146 * A delay of fifteen seconds is done by placing the request fifteen
147 * entries later in the queue:
149 * syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
154 * Add an item to the syncer work queue.
156 static void
157 vn_syncer_add1(struct vnode *vp, int delayx)
159 struct synclist *slp;
161 KASSERT(mutex_owned(&syncer_data_lock));
163 if (vp->v_iflag & VI_ONWORKLST) {
164 slp = &syncer_workitem_pending[vp->v_synclist_slot];
165 TAILQ_REMOVE(slp, vp, v_synclist);
166 } else {
168 * We must not modify v_iflag if the vnode
169 * is already on a synclist: sched_sync()
170 * calls this routine while holding only
171 * syncer_data_lock in order to adjust the
172 * position of the vnode. syncer_data_lock
173 * does not protect v_iflag.
175 KASSERT(mutex_owned(&vp->v_interlock));
176 vp->v_iflag |= VI_ONWORKLST;
179 if (delayx > syncer_maxdelay - 2)
180 delayx = syncer_maxdelay - 2;
181 vp->v_synclist_slot = (syncer_delayno + delayx) % syncer_last;
183 slp = &syncer_workitem_pending[vp->v_synclist_slot];
184 TAILQ_INSERT_TAIL(slp, vp, v_synclist);
187 void
188 vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
191 KASSERT(mutex_owned(&vp->v_interlock));
193 mutex_enter(&syncer_data_lock);
194 vn_syncer_add1(vp, delayx);
195 mutex_exit(&syncer_data_lock);
199 * Remove an item from the syncer work queue.
201 void
202 vn_syncer_remove_from_worklist(struct vnode *vp)
204 struct synclist *slp;
206 KASSERT(mutex_owned(&vp->v_interlock));
208 mutex_enter(&syncer_data_lock);
210 if (vp->v_iflag & VI_ONWORKLST) {
211 vp->v_iflag &= ~VI_ONWORKLST;
212 slp = &syncer_workitem_pending[vp->v_synclist_slot];
213 TAILQ_REMOVE(slp, vp, v_synclist);
216 mutex_exit(&syncer_data_lock);
220 * System filesystem synchronizer daemon.
222 void
223 sched_sync(void *v)
225 struct synclist *slp;
226 struct vnode *vp;
227 long starttime;
228 bool synced;
230 updateproc = curlwp;
232 for (;;) {
233 mutex_enter(&syncer_mutex);
234 mutex_enter(&syncer_data_lock);
236 starttime = time_second;
239 * Push files whose dirty time has expired.
241 slp = &syncer_workitem_pending[syncer_delayno];
242 syncer_delayno += 1;
243 if (syncer_delayno >= syncer_last)
244 syncer_delayno = 0;
246 while ((vp = TAILQ_FIRST(slp)) != NULL) {
247 /* We are locking in the wrong direction. */
248 synced = false;
249 if (mutex_tryenter(&vp->v_interlock)) {
250 mutex_exit(&syncer_data_lock);
251 if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT |
252 LK_INTERLOCK) == 0) {
253 synced = true;
254 (void) VOP_FSYNC(vp, curlwp->l_cred,
255 FSYNC_LAZY, 0, 0);
256 vput(vp);
258 mutex_enter(&syncer_data_lock);
262 * XXX The vnode may have been recycled, in which
263 * case it may have a new identity.
265 if (TAILQ_FIRST(slp) == vp) {
267 * Put us back on the worklist. The worklist
268 * routine will remove us from our current
269 * position and then add us back in at a later
270 * position.
272 * Try again sooner rather than later if
273 * we were unable to lock the vnode. Lock
274 * failure should not prevent us from doing
275 * the sync "soon".
277 * If we locked it yet arrive here, it's
278 * likely that lazy sync is in progress and
279 * so the vnode still has dirty metadata.
280 * syncdelay is mainly to get this vnode out
281 * of the way so we do not consider it again
282 * "soon" in this loop, so the delay time is
283 * not critical as long as it is not "soon".
284 * While write-back strategy is the file
285 * system's domain, we expect write-back to
286 * occur no later than syncdelay seconds
287 * into the future.
289 vn_syncer_add1(vp,
290 synced ? syncdelay : lockdelay);
293 mutex_exit(&syncer_mutex);
296 * Wait until there are more workitems to process.
298 if (rushjob > 0) {
300 * The variable rushjob allows the kernel to speed
301 * up the processing of the filesystem syncer
302 * process. A rushjob value of N tells the
303 * filesystem syncer to process the next N seconds
304 * worth of work on its queue ASAP. Currently
305 * rushjob is used by the soft update code to
306 * speed up the filesystem syncer process when the
307 * incore state is getting so far ahead of the
308 * disk that the kernel memory pool is being
309 * threatened with exhaustion.
311 rushjob--;
312 } else {
314 * If it has taken us less than a second to
315 * process the current work, then wait. Otherwise
316 * start right over again. We can still lose time
317 * if any single round takes more than two
318 * seconds, but it does not really matter as we
319 * are just trying to generally pace the
320 * filesystem activity.
322 if (time_second == starttime)
323 cv_timedwait(&syncer_cv, &syncer_data_lock, hz);
325 mutex_exit(&syncer_data_lock);
330 * Request the syncer daemon to speed up its work.
331 * We never push it to speed up more than half of its
332 * normal turn time, otherwise it could take over the CPU.
335 speedup_syncer(void)
338 mutex_enter(&syncer_data_lock);
339 if (rushjob >= syncdelay / 2) {
340 mutex_exit(&syncer_data_lock);
341 return (0);
343 rushjob++;
344 cv_signal(&syncer_cv);
345 stat_rush_requests += 1;
346 mutex_exit(&syncer_data_lock);
348 return (1);
351 static void
352 sysctl_vfs_syncfs_setup(struct sysctllog **clog)
354 const struct sysctlnode *rnode, *cnode;
356 sysctl_createv(clog, 0, NULL, &rnode,
357 CTLFLAG_PERMANENT,
358 CTLTYPE_NODE, "vfs", NULL,
359 NULL, 0, NULL, 0,
360 CTL_VFS, CTL_EOL);
362 sysctl_createv(clog, 0, &rnode, &rnode,
363 CTLFLAG_PERMANENT,
364 CTLTYPE_NODE, "sync",
365 SYSCTL_DESCR("syncer options"),
366 NULL, 0, NULL, 0,
367 CTL_CREATE, CTL_EOL);
369 sysctl_createv(clog, 0, &rnode, &cnode,
370 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
371 CTLTYPE_INT, "delay",
372 SYSCTL_DESCR("max time to delay syncing data"),
373 NULL, 0, &syncdelay, 0,
374 CTL_CREATE, CTL_EOL);
376 sysctl_createv(clog, 0, &rnode, &cnode,
377 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
378 CTLTYPE_INT, "filedelay",
379 SYSCTL_DESCR("time to delay syncing files"),
380 NULL, 0, &filedelay, 0,
381 CTL_CREATE, CTL_EOL);
383 sysctl_createv(clog, 0, &rnode, &cnode,
384 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
385 CTLTYPE_INT, "dirdelay",
386 SYSCTL_DESCR("time to delay syncing directories"),
387 NULL, 0, &dirdelay, 0,
388 CTL_CREATE, CTL_EOL);
390 sysctl_createv(clog, 0, &rnode, &cnode,
391 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
392 CTLTYPE_INT, "metadelay",
393 SYSCTL_DESCR("time to delay syncing metadata"),
394 NULL, 0, &metadelay, 0,
395 CTL_CREATE, CTL_EOL);