/*	$NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 *
 * This code is derived from software contributed to The NetBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 1997 Marshall Kirk McKusick.  All Rights Reserved.
 *
 * This code is derived from work done by Greg Ganger at the
 * University of Michigan.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. None of the names of McKusick, Ganger, or the University of Michigan
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.40 2009/03/15 17:22:38 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/proc.h>		/* curlwp */
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>		/* kmem_alloc() */
#include <sys/errno.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/syncfs/syncfs.h>
static void vn_syncer_add1(struct vnode *, int);

/*
 * Defines and variables for the syncer process.
 */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;			/* max time to delay syncing data */
time_t filedelay = 30;			/* time to delay syncing files */
time_t dirdelay  = 15;			/* time to delay syncing directories */
time_t metadelay = 10;			/* time to delay syncing metadata */
time_t lockdelay = 1;			/* time to delay if locking fails */

kmutex_t syncer_mutex;			/* used to freeze syncer, long term */
static kmutex_t syncer_data_lock;	/* short term lock on data structures */

static int rushjob;			/* number of slots to run ASAP */
static kcondvar_t syncer_cv;		/* cv for rushjob */
static int stat_rush_requests;		/* number of times I/O speeded up */

static int syncer_delayno = 0;
static long syncer_last;
static struct synclist *syncer_workitem_pending;
struct lwp *updateproc = NULL;

static void sysctl_vfs_syncfs_setup(struct sysctllog **);
void
vn_initialize_syncerd(void)
{
	int i;

	syncer_last = SYNCER_MAXDELAY + 2;

	sysctl_vfs_syncfs_setup(NULL);

	syncer_workitem_pending =
	    kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);

	for (i = 0; i < syncer_last; i++)
		TAILQ_INIT(&syncer_workitem_pending[i]);

	mutex_init(&syncer_mutex, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&syncer_cv, "syncer");
}
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, directory updates are
 * delayed only about half of the time that file data is delayed, and
 * filesystem metadata, which is more critical still, is delayed only
 * about a third of the time.  There are SYNCER_MAXDELAY queues that
 * are processed round-robin at a rate of one each second (driven off
 * the filesystem syncer process).  The syncer_delayno variable
 * indicates the next queue that is to be processed.  Items that need
 * to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
 */
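
/*
 * Worked example of the slot arithmetic above (the concrete numbers are
 * hypothetical): if syncer_delayno is currently 10 and a vnode is queued
 * with a delay of 15 seconds, it lands in slot (10 + 15) % syncer_last.
 * Since the syncer drains one slot per second, that vnode is not
 * revisited for roughly 15 seconds.
 */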
/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
	struct synclist *slp;

	KASSERT(mutex_owned(&syncer_data_lock));

	if (vp->v_iflag & VI_ONWORKLST) {
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	} else {
		/*
		 * We must not modify v_iflag if the vnode
		 * is already on a synclist: sched_sync()
		 * calls this routine while holding only
		 * syncer_data_lock in order to adjust the
		 * position of the vnode.  syncer_data_lock
		 * does not protect v_iflag.
		 */
		KASSERT(mutex_owned(&vp->v_interlock));
		vp->v_iflag |= VI_ONWORKLST;
	}

	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	vp->v_synclist_slot = (syncer_delayno + delayx) % syncer_last;

	slp = &syncer_workitem_pending[vp->v_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vp, v_synclist);
}
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{

	KASSERT(mutex_owned(&vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	vn_syncer_add1(vp, delayx);
	mutex_exit(&syncer_data_lock);
}
/*
 * Remove an item from the syncer work queue.
 */
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
	struct synclist *slp;

	KASSERT(mutex_owned(&vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	if (vp->v_iflag & VI_ONWORKLST) {
		vp->v_iflag &= ~VI_ONWORKLST;
		slp = &syncer_workitem_pending[vp->v_synclist_slot];
		TAILQ_REMOVE(slp, vp, v_synclist);
	}
	mutex_exit(&syncer_data_lock);
}
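
#if 0
/*
 * Minimal usage sketch for the two routines above; the callers below are
 * hypothetical and not part of this file.  A filesystem that has just
 * dirtied a vnode queues it for a delayed sync, and drops it from the
 * worklist once it becomes clean.  Both calls require v_interlock held.
 */
static void
example_mark_vnode_dirty(struct vnode *vp)
{

	mutex_enter(&vp->v_interlock);
	if ((vp->v_iflag & VI_ONWORKLST) == 0)
		vn_syncer_add_to_worklist(vp, (int)filedelay);
	mutex_exit(&vp->v_interlock);
}

static void
example_mark_vnode_clean(struct vnode *vp)
{

	mutex_enter(&vp->v_interlock);
	vn_syncer_remove_from_worklist(vp);
	mutex_exit(&vp->v_interlock);
}
#endif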
/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void *arg)
{
	struct synclist *slp;
	struct vnode *vp;
	time_t starttime;
	bool synced;

	for (;;) {
		mutex_enter(&syncer_mutex);
		mutex_enter(&syncer_data_lock);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno >= syncer_last)
			syncer_delayno = 0;

		while ((vp = TAILQ_FIRST(slp)) != NULL) {
			/* We are locking in the wrong direction. */
			synced = false;
			if (mutex_tryenter(&vp->v_interlock)) {
				mutex_exit(&syncer_data_lock);
				if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT |
				    LK_INTERLOCK) == 0) {
					synced = true;
					(void) VOP_FSYNC(vp, curlwp->l_cred,
					    FSYNC_LAZY, 0, 0);
					vput(vp);
				}
				mutex_enter(&syncer_data_lock);
			}

			/*
			 * XXX The vnode may have been recycled, in which
			 * case it may have a new identity.
			 */
			if (TAILQ_FIRST(slp) == vp) {
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 *
				 * Try again sooner rather than later if
				 * we were unable to lock the vnode.  Lock
				 * failure should not prevent us from doing
				 * the sync "soon".
				 *
				 * If we did lock it and still arrive here,
				 * it is likely that lazy sync is in progress
				 * and so the vnode still has dirty metadata.
				 * syncdelay is mainly to get this vnode out
				 * of the way so we do not consider it again
				 * "soon" in this loop, so the delay time is
				 * not critical as long as it is not "soon".
				 * While write-back strategy is the file
				 * system's domain, we expect write-back to
				 * occur no later than syncdelay seconds
				 * into the future.
				 */
				vn_syncer_add1(vp,
				    synced ? syncdelay : lockdelay);
			}
		}
		mutex_exit(&syncer_mutex);

		/*
		 * Wait until there are more workitems to process.
		 */

		/*
		 * The variable rushjob allows the kernel to speed
		 * up the processing of the filesystem syncer
		 * process.  A rushjob value of N tells the
		 * filesystem syncer to process the next N seconds
		 * worth of work on its queue ASAP.  Currently
		 * rushjob is used by the soft update code to
		 * speed up the filesystem syncer process when the
		 * incore state is getting so far ahead of the
		 * disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0)
			rushjob -= 1;

		/*
		 * If it has taken us less than a second to
		 * process the current work, then wait.  Otherwise
		 * start right over again.  We can still lose time
		 * if any single round takes more than two
		 * seconds, but it does not really matter as we
		 * are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			cv_timedwait(&syncer_cv, &syncer_data_lock, hz);

		mutex_exit(&syncer_data_lock);
	}
}
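
/*
 * Worked illustration of the pacing above (hypothetical numbers): with
 * the default syncdelay of 30 seconds, speedup_syncer() below caps
 * rushjob at 15, so at most 15 requests to work ahead can be outstanding
 * at once.  Each such request also signals syncer_cv, cutting short the
 * up-to-one-second cv_timedwait() pause, and the outstanding rushjob
 * count is consumed one unit per pass through the loop.
 */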
/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the CPU.
 */
int
speedup_syncer(void)
{

	mutex_enter(&syncer_data_lock);
	if (rushjob >= syncdelay / 2) {
		mutex_exit(&syncer_data_lock);
		return (0);
	}
	rushjob += 1;
	cv_signal(&syncer_cv);
	stat_rush_requests += 1;
	mutex_exit(&syncer_data_lock);

	return (1);
}
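
#if 0
/*
 * Usage sketch for speedup_syncer(); the caller and its threshold are
 * hypothetical, not part of this file.  A subsystem that sees its dirty
 * backlog growing faster than the syncer drains it can ask for the next
 * queue slot to be processed immediately.  A zero return means the
 * syncer is already boosted as far as allowed (syncdelay / 2 slots).
 */
static void
example_note_dirty_backlog(int ndirty, int hiwat)
{

	if (ndirty > hiwat)
		(void)speedup_syncer();
}
#endif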
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(clog, 0, NULL, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "vfs", NULL,
			NULL, 0, NULL, 0,
			CTL_VFS, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "sync",
			SYSCTL_DESCR("syncer options"),
			NULL, 0, NULL, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "delay",
			SYSCTL_DESCR("max time to delay syncing data"),
			NULL, 0, &syncdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "filedelay",
			SYSCTL_DESCR("time to delay syncing files"),
			NULL, 0, &filedelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "dirdelay",
			SYSCTL_DESCR("time to delay syncing directories"),
			NULL, 0, &dirdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_INT, "metadelay",
			SYSCTL_DESCR("time to delay syncing metadata"),
			NULL, 0, &metadelay, 0,
			CTL_CREATE, CTL_EOL);
}