/*	$NetBSD: lfs_vnops.c,v 1.225 2009/11/17 22:49:24 eeh Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1986, 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_vnops.c	8.13 (Berkeley) 6/10/95
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.225 2009/11/17 22:49:24 eeh Exp $");

#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>

#include <miscfs/fifofs/fifo.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <uvm/uvm_pmap.h>
#include <uvm/uvm_stat.h>
#include <uvm/uvm_pager.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>
extern pid_t lfs_writer_daemon;
int lfs_ignore_lazy_sync = 1;

/* Global vfs data structures for lfs. */
int (**lfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_lookup_desc, ufs_lookup },		/* lookup */
	{ &vop_create_desc, lfs_create },		/* create */
	{ &vop_whiteout_desc, ufs_whiteout },		/* whiteout */
	{ &vop_mknod_desc, lfs_mknod },			/* mknod */
	{ &vop_open_desc, ufs_open },			/* open */
	{ &vop_close_desc, lfs_close },			/* close */
	{ &vop_access_desc, ufs_access },		/* access */
	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
	{ &vop_setattr_desc, lfs_setattr },		/* setattr */
	{ &vop_read_desc, lfs_read },			/* read */
	{ &vop_write_desc, lfs_write },			/* write */
	{ &vop_ioctl_desc, ufs_ioctl },			/* ioctl */
	{ &vop_fcntl_desc, lfs_fcntl },			/* fcntl */
	{ &vop_poll_desc, ufs_poll },			/* poll */
	{ &vop_kqfilter_desc, genfs_kqfilter },		/* kqfilter */
	{ &vop_revoke_desc, ufs_revoke },		/* revoke */
	{ &vop_mmap_desc, lfs_mmap },			/* mmap */
	{ &vop_fsync_desc, lfs_fsync },			/* fsync */
	{ &vop_seek_desc, ufs_seek },			/* seek */
	{ &vop_remove_desc, lfs_remove },		/* remove */
	{ &vop_link_desc, lfs_link },			/* link */
	{ &vop_rename_desc, lfs_rename },		/* rename */
	{ &vop_mkdir_desc, lfs_mkdir },			/* mkdir */
	{ &vop_rmdir_desc, lfs_rmdir },			/* rmdir */
	{ &vop_symlink_desc, lfs_symlink },		/* symlink */
	{ &vop_readdir_desc, ufs_readdir },		/* readdir */
	{ &vop_readlink_desc, ufs_readlink },		/* readlink */
	{ &vop_abortop_desc, ufs_abortop },		/* abortop */
	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
	{ &vop_reclaim_desc, lfs_reclaim },		/* reclaim */
	{ &vop_lock_desc, ufs_lock },			/* lock */
	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
	{ &vop_bmap_desc, ufs_bmap },			/* bmap */
	{ &vop_strategy_desc, lfs_strategy },		/* strategy */
	{ &vop_print_desc, ufs_print },			/* print */
	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
	{ &vop_pathconf_desc, ufs_pathconf },		/* pathconf */
	{ &vop_advlock_desc, ufs_advlock },		/* advlock */
	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
	{ &vop_getpages_desc, lfs_getpages },		/* getpages */
	{ &vop_putpages_desc, lfs_putpages },		/* putpages */
};
const struct vnodeopv_desc lfs_vnodeop_opv_desc =
	{ &lfs_vnodeop_p, lfs_vnodeop_entries };
int (**lfs_specop_p)(void *);
const struct vnodeopv_entry_desc lfs_specop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_lookup_desc, spec_lookup },		/* lookup */
	{ &vop_create_desc, spec_create },		/* create */
	{ &vop_mknod_desc, spec_mknod },		/* mknod */
	{ &vop_open_desc, spec_open },			/* open */
	{ &vop_close_desc, lfsspec_close },		/* close */
	{ &vop_access_desc, ufs_access },		/* access */
	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
	{ &vop_setattr_desc, lfs_setattr },		/* setattr */
	{ &vop_read_desc, ufsspec_read },		/* read */
	{ &vop_write_desc, ufsspec_write },		/* write */
	{ &vop_ioctl_desc, spec_ioctl },		/* ioctl */
	{ &vop_fcntl_desc, ufs_fcntl },			/* fcntl */
	{ &vop_poll_desc, spec_poll },			/* poll */
	{ &vop_kqfilter_desc, spec_kqfilter },		/* kqfilter */
	{ &vop_revoke_desc, spec_revoke },		/* revoke */
	{ &vop_mmap_desc, spec_mmap },			/* mmap */
	{ &vop_fsync_desc, spec_fsync },		/* fsync */
	{ &vop_seek_desc, spec_seek },			/* seek */
	{ &vop_remove_desc, spec_remove },		/* remove */
	{ &vop_link_desc, spec_link },			/* link */
	{ &vop_rename_desc, spec_rename },		/* rename */
	{ &vop_mkdir_desc, spec_mkdir },		/* mkdir */
	{ &vop_rmdir_desc, spec_rmdir },		/* rmdir */
	{ &vop_symlink_desc, spec_symlink },		/* symlink */
	{ &vop_readdir_desc, spec_readdir },		/* readdir */
	{ &vop_readlink_desc, spec_readlink },		/* readlink */
	{ &vop_abortop_desc, spec_abortop },		/* abortop */
	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
	{ &vop_reclaim_desc, lfs_reclaim },		/* reclaim */
	{ &vop_lock_desc, ufs_lock },			/* lock */
	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
	{ &vop_bmap_desc, spec_bmap },			/* bmap */
	{ &vop_strategy_desc, spec_strategy },		/* strategy */
	{ &vop_print_desc, ufs_print },			/* print */
	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
	{ &vop_pathconf_desc, spec_pathconf },		/* pathconf */
	{ &vop_advlock_desc, spec_advlock },		/* advlock */
	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
	{ &vop_getpages_desc, spec_getpages },		/* getpages */
	{ &vop_putpages_desc, spec_putpages },		/* putpages */
};
const struct vnodeopv_desc lfs_specop_opv_desc =
	{ &lfs_specop_p, lfs_specop_entries };
int (**lfs_fifoop_p)(void *);
const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_lookup_desc, fifo_lookup },		/* lookup */
	{ &vop_create_desc, fifo_create },		/* create */
	{ &vop_mknod_desc, fifo_mknod },		/* mknod */
	{ &vop_open_desc, fifo_open },			/* open */
	{ &vop_close_desc, lfsfifo_close },		/* close */
	{ &vop_access_desc, ufs_access },		/* access */
	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
	{ &vop_setattr_desc, lfs_setattr },		/* setattr */
	{ &vop_read_desc, ufsfifo_read },		/* read */
	{ &vop_write_desc, ufsfifo_write },		/* write */
	{ &vop_ioctl_desc, fifo_ioctl },		/* ioctl */
	{ &vop_fcntl_desc, ufs_fcntl },			/* fcntl */
	{ &vop_poll_desc, fifo_poll },			/* poll */
	{ &vop_kqfilter_desc, fifo_kqfilter },		/* kqfilter */
	{ &vop_revoke_desc, fifo_revoke },		/* revoke */
	{ &vop_mmap_desc, fifo_mmap },			/* mmap */
	{ &vop_fsync_desc, fifo_fsync },		/* fsync */
	{ &vop_seek_desc, fifo_seek },			/* seek */
	{ &vop_remove_desc, fifo_remove },		/* remove */
	{ &vop_link_desc, fifo_link },			/* link */
	{ &vop_rename_desc, fifo_rename },		/* rename */
	{ &vop_mkdir_desc, fifo_mkdir },		/* mkdir */
	{ &vop_rmdir_desc, fifo_rmdir },		/* rmdir */
	{ &vop_symlink_desc, fifo_symlink },		/* symlink */
	{ &vop_readdir_desc, fifo_readdir },		/* readdir */
	{ &vop_readlink_desc, fifo_readlink },		/* readlink */
	{ &vop_abortop_desc, fifo_abortop },		/* abortop */
	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
	{ &vop_reclaim_desc, lfs_reclaim },		/* reclaim */
	{ &vop_lock_desc, ufs_lock },			/* lock */
	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
	{ &vop_bmap_desc, fifo_bmap },			/* bmap */
	{ &vop_strategy_desc, fifo_strategy },		/* strategy */
	{ &vop_print_desc, ufs_print },			/* print */
	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
	{ &vop_pathconf_desc, fifo_pathconf },		/* pathconf */
	{ &vop_advlock_desc, fifo_advlock },		/* advlock */
	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
	{ &vop_putpages_desc, fifo_putpages },		/* putpages */
};
const struct vnodeopv_desc lfs_fifoop_opv_desc =
	{ &lfs_fifoop_p, lfs_fifoop_entries };
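/*
 * The three tables above each pair a vnode-operation descriptor with the
 * routine that implements it for LFS regular files, special devices and
 * fifos respectively.  LFS provides its own handler only where it must
 * intercept the operation (directory ops, fsync, close, strategy, bwrite,
 * getpages/putpages); everything else falls through to the generic ufs_*,
 * spec_*, fifo_* or genfs_* routines.  In NetBSD these opv_desc structures
 * are normally collected in the filesystem's vnodeopv_descs[] array
 * (typically in lfs_vfsops.c) so that vfs_attach() can build the operation
 * vectors when the filesystem is registered; that registration code is not
 * part of this file.
 */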
static int check_dirty(struct lfs *, struct vnode *,
		       off_t, off_t, off_t, int, int, struct vm_page **);

#define	LFS_READWRITE
#include <ufs/ufs/ufs_readwrite.c>
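/*
 * Note: ufs_readwrite.c is a shared read/write template; with LFS_READWRITE
 * defined it is expected to expand into the lfs_read/lfs_write routines
 * referenced by the operation tables above, rather than the FFS variants.
 */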
/*
 * Synch an open file.
 */
	struct vop_fsync_args /* {
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	/* If we're mounted read-only, don't try to sync. */

	/*
	 * Trickle sync simply adds this vnode to the pager list, as if
	 * the pagedaemon had requested a pageout.
	 */
	if (ap->a_flags & FSYNC_LAZY) {
		if (lfs_ignore_lazy_sync == 0) {
			mutex_enter(&lfs_lock);
			if (!(ip->i_flags & IN_PAGING)) {
				ip->i_flags |= IN_PAGING;
				TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
			wakeup(&lfs_writer_daemon);
			mutex_exit(&lfs_lock);

	/*
	 * If a vnode is being cleaned, flush it out before we try to
	 * reuse it. This prevents the cleaner from writing files twice
	 * in the same partial segment, causing an accounting underflow.
	 */
	if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {

	wait = (ap->a_flags & FSYNC_WAIT);

		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
				     round_page(ap->a_offhi),
				     PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
		if (error == EAGAIN) {
			mutex_enter(&lfs_lock);
			mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
				hz / 100 + 1, &lfs_lock);
			mutex_exit(&lfs_lock);
	} while (error == EAGAIN);

	if ((ap->a_flags & FSYNC_DATAONLY) == 0)
		error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);

	if (error == 0 && ap->a_flags & FSYNC_CACHE) {
		error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE,
	if (wait && !VPISEMPTY(vp))
		LFS_SET_UINO(ip, IN_MODIFIED);
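/*
 * For illustration only (this example is not taken from any caller in the
 * tree, and the credential shown is a placeholder): a caller requesting
 * the trickle-sync behaviour described above would pass FSYNC_LAZY through
 * the usual VOP interface, e.g.
 *
 *	error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_LAZY, 0, 0);
 *
 * With lfs_ignore_lazy_sync at its default of 1 the lazy request is
 * effectively ignored; setting it to 0 queues the inode on lfs_pchainhd
 * and wakes the writer daemon, exactly as a pagedaemon-initiated pageout
 * would.
 */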
/*
 * Take IN_ADIROP off, then call ufs_inactive.
 */
lfs_inactive(void *v)
	struct vop_inactive_args /* {

	lfs_unmark_vnode(ap->a_vp);

	/*
	 * The Ifile is only ever inactivated on unmount.
	 * Streamline this process by not giving it more dirty blocks.
	 */
	if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
		mutex_enter(&lfs_lock);
		LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
		mutex_exit(&lfs_lock);
		VOP_UNLOCK(ap->a_vp, 0);

	return ufs_inactive(v);
/*
 * These macros are used to bracket UFS directory ops, so that we can
 * identify all the pages touched during directory ops which need to
 * be ordered and flushed atomically, so that they may be recovered.
 *
 * Because we have to mark nodes VU_DIROP in order to prevent
 * the cache from reclaiming them while a dirop is in progress, we must
 * also manage the number of nodes so marked (otherwise we can run out).
 * We do this by setting lfs_dirvcount to the number of marked vnodes; it
 * is decremented during segment write, when VU_DIROP is taken off.
 */
#define	MARK_VNODE(vp)			lfs_mark_vnode(vp)
#define	UNMARK_VNODE(vp)		lfs_unmark_vnode(vp)
#define	SET_DIROP_CREATE(dvp, vpp)	lfs_set_dirop_create((dvp), (vpp))
#define	SET_DIROP_REMOVE(dvp, vp)	lfs_set_dirop((dvp), (vp))
static int lfs_set_dirop_create(struct vnode *, struct vnode **);
static int lfs_set_dirop(struct vnode *, struct vnode *);

lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
	KASSERT(VOP_ISLOCKED(dvp));
	KASSERT(vp == NULL || VOP_ISLOCKED(vp));

	fs = VTOI(dvp)->i_lfs;

	ASSERT_NO_SEGLOCK(fs);
	/*
	 * LFS_NRESERVE calculates direct and indirect blocks as well
	 * as an inode block; an overestimate in most cases.
	 */
	if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)

	mutex_enter(&lfs_lock);
	if (fs->lfs_dirops == 0) {
		mutex_exit(&lfs_lock);
		lfs_check(dvp, LFS_UNUSED_LBN, 0);
		mutex_enter(&lfs_lock);
	while (fs->lfs_writer) {
		error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
		    "lfs_sdirop", 0, &lfs_lock);
		if (error == EINTR) {
			mutex_exit(&lfs_lock);
	if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
		wakeup(&lfs_writer_daemon);
		mutex_exit(&lfs_lock);

	if (lfs_dirvcount > LFS_MAX_DIROP) {
		mutex_exit(&lfs_lock);
		DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
		      "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
		if ((error = mtsleep(&lfs_dirvcount,
		    PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,

	mutex_exit(&lfs_lock);

	/* Hold a reference so SET_ENDOP will be happy */

	lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
/*
 * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
 * in getnewvnode(), if we have a stacked filesystem mounted on top
 *
 * NB: this means we have to clear the new vnodes on error. Fortunately
 * SET_ENDOP is there to do that for us.
 */
lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
	fs = VFSTOUFS(dvp->v_mount)->um_lfs;
	ASSERT_NO_SEGLOCK(fs);

	if (vpp && (error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, vpp))) {
		DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
	if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
#define	SET_ENDOP_BASE(fs, dvp, str)					\
	mutex_enter(&lfs_lock);						\
	--(fs)->lfs_dirops;						\
	if (!(fs)->lfs_dirops) {					\
		if ((fs)->lfs_nadirop) {				\
			panic("SET_ENDOP: %s: no dirops but "		\
			      " nadirop=%d", (str),			\
			      (fs)->lfs_nadirop);			\
		wakeup(&(fs)->lfs_writer);				\
		mutex_exit(&lfs_lock);					\
		lfs_check((dvp), LFS_UNUSED_LBN, 0);			\
		mutex_exit(&lfs_lock);					\

#define	SET_ENDOP_CREATE(fs, dvp, nvpp, str)				\
	UNMARK_VNODE(*nvpp);						\
	/* Check for error return to stem vnode leakage */		\
	if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP))		\
		ungetnewvnode(*(nvpp));					\
	SET_ENDOP_BASE((fs), (dvp), (str));				\
	lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs));		\

#define	SET_ENDOP_CREATE_AP(ap, str)					\
	SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp,	\

#define	SET_ENDOP_REMOVE(fs, dvp, ovp, str)				\
	SET_ENDOP_BASE((fs), (dvp), (str));				\
	lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs));		\
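/*
 * The directory operations further down all follow the same bracketing
 * pattern.  The sketch below is illustrative, not a verbatim copy of any
 * one function; the MARK_VNODE/UNMARK_VNODE calls are implied by the
 * macros above and by the elided portions of those functions:
 *
 *	if ((error = SET_DIROP_CREATE(dvp, vpp)) != 0)
 *		return error;			- reserve space, count dirop -
 *	MARK_VNODE(dvp);			- pin the vnode(s) as VU_DIROP -
 *	error = ufs_create(ap);			- UFS does the real work -
 *	SET_ENDOP_CREATE_AP(ap, "create");	- unmark, drop reservation -
 *	return error;
 */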
lfs_mark_vnode(struct vnode *vp)
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	mutex_enter(&lfs_lock);
	if (!(ip->i_flag & IN_ADIROP)) {
		if (!(vp->v_uflag & VU_DIROP)) {
			mutex_enter(&vp->v_interlock);
			TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
			vp->v_uflag |= VU_DIROP;
		ip->i_flag |= IN_ADIROP;
	KASSERT(vp->v_uflag & VU_DIROP);
	mutex_exit(&lfs_lock);

lfs_unmark_vnode(struct vnode *vp)
	struct inode *ip = VTOI(vp);

	if (ip && (ip->i_flag & IN_ADIROP)) {
		KASSERT(vp->v_uflag & VU_DIROP);
		mutex_enter(&lfs_lock);
		--ip->i_lfs->lfs_nadirop;
		mutex_exit(&lfs_lock);
		ip->i_flag &= ~IN_ADIROP;
	struct vop_symlink_args /* {
		struct vnode **a_vpp;
		struct componentname *a_cnp;

	if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {

	error = ufs_symlink(ap);
	SET_ENDOP_CREATE_AP(ap, "symlink");
	struct vop_mknod_args /* {
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	struct vattr *vap = ap->a_vap;
	struct vnode **vpp = ap->a_vpp;

	if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {

	error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
			      ap->a_dvp, vpp, ap->a_cnp);

	/* Either way we're done with the dirop at this point */
	SET_ENDOP_CREATE_AP(ap, "mknod");

	mp = (*vpp)->v_mount;
	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
	if (vap->va_rdev != VNOVAL) {
		/*
		 * Want to be able to use this to make badblock
		 * inodes, so don't truncate the dev number.
		 */
		ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
		    UFS_MPNEEDSWAP((*vpp)->v_mount));
		ip->i_ffs1_rdev = vap->va_rdev;

	/*
	 * Call fsync to write the vnode so that we don't have to deal with
	 * flushing it when it's marked VU_DIROP|VI_XLOCK.
	 *
	 * XXX KS - If we can't flush we also can't call vgone(), so must
	 * return. But, that leaves this vnode in limbo, also not good.
	 * Can this ever happen (barring hardware failure)?
	 */
	if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) {
		panic("lfs_mknod: couldn't fsync (ino %llu)",
		      (unsigned long long)ino);
		/* return (error); */

	/*
	 * Remove vnode so that it will be reloaded by VFS_VGET and
	 * checked to see if it is an alias of an existing entry in
	 */
	/* Used to be vput, but that causes us to call VOP_INACTIVE twice. */

	(*vpp)->v_type = VNON;

	error = VFS_VGET(mp, ino, vpp);
	struct vop_create_args /* {
		struct vnode **a_vpp;
		struct componentname *a_cnp;

	if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {

	error = ufs_create(ap);
	SET_ENDOP_CREATE_AP(ap, "create");

	struct vop_mkdir_args /* {
		struct vnode **a_vpp;
		struct componentname *a_cnp;

	if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {

	error = ufs_mkdir(ap);
	SET_ENDOP_CREATE_AP(ap, "mkdir");
	struct vop_remove_args /* {
		struct componentname *a_cnp;
	struct vnode *dvp, *vp;

	if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {

	error = ufs_remove(ap);
	if (ip->i_nlink == 0)
		lfs_orphan(ip->i_lfs, ip->i_number);
	SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");

	struct vop_rmdir_args /* {
		struct vnodeop_desc *a_desc;
		struct componentname *a_cnp;

	if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {

	error = ufs_rmdir(ap);
	if (ip->i_nlink == 0)
		lfs_orphan(ip->i_lfs, ip->i_number);
	SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
	struct vop_link_args /* {
		struct componentname *a_cnp;
	struct vnode **vpp = NULL;

	if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {

	error = ufs_link(ap);
	SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
	struct vop_rename_args /* {
		struct vnode *a_fdvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct componentname *a_tcnp;
	struct vnode *tvp, *fvp, *tdvp, *fdvp;
	struct componentname *tcnp, *fcnp;

	fs = VTOI(ap->a_fdvp)->i_lfs;

	/*
	 * Check for cross-device rename.
	 * If it is, we don't want to set dirops, just error out.
	 * (In particular note that MARK_VNODE(tdvp) will DTWT on
	 * a cross-device rename.)
	 *
	 * Copied from ufs_rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {

	/*
	 * Check to make sure we're not renaming a vnode onto itself
	 * (deleting a hard link by renaming one name onto another);
	 * if we are we can't recursively call VOP_REMOVE since that
	 * would leave us with an unaccounted-for number of live dirops.
	 *
	 * Inline the relevant section of ufs_rename here, *before*
	 * calling SET_DIROP_REMOVE.
	 */
	if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
	    (VTOI(tdvp)->i_flags & APPEND))) {

	if (fvp->v_type == VDIR) {

	/* Release destination completely. */
	VOP_ABORTOP(tdvp, tcnp);

	fcnp->cn_flags &= ~(MODMASK | SAVESTART);
	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
	fcnp->cn_nameiop = DELETE;
	vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = relookup(fdvp, &fvp, fcnp))) {

	return (VOP_REMOVE(fdvp, fvp, fcnp));

	if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)

	error = ufs_rename(ap);

	SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");

	VOP_ABORTOP(tdvp, ap->a_tcnp);	/* XXX, why not in NFS? */

	VOP_ABORTOP(fdvp, ap->a_fcnp);	/* XXX, why not in NFS? */
/* XXX hack to avoid calling ITIMES in getattr */
	struct vop_getattr_args /* {
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct vattr *vap = ap->a_vap;
	struct lfs *fs = ip->i_lfs;
	/*
	 * Copy from inode table
	 */
	vap->va_fsid = ip->i_dev;
	vap->va_fileid = ip->i_number;
	vap->va_mode = ip->i_mode & ~IFMT;
	vap->va_nlink = ip->i_nlink;
	vap->va_uid = ip->i_uid;
	vap->va_gid = ip->i_gid;
	vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
	vap->va_size = vp->v_size;
	vap->va_atime.tv_sec = ip->i_ffs1_atime;
	vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
	vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
	vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
	vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
	vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
	vap->va_flags = ip->i_flags;
	vap->va_gen = ip->i_gen;
	/* this doesn't belong here */
	if (vp->v_type == VBLK)
		vap->va_blocksize = BLKDEV_IOSIZE;
	else if (vp->v_type == VCHR)
		vap->va_blocksize = MAXBSIZE;
	else
		vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
	vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
	vap->va_type = vp->v_type;
	vap->va_filerev = ip->i_modrev;
/*
 * Check to make sure the inode blocks won't choke the buffer
 * cache, then call ufs_setattr as usual.
 */
	struct vop_setattr_args /* {
	struct vnode *vp = ap->a_vp;

	lfs_check(vp, LFS_UNUSED_LBN, 0);
	return ufs_setattr(v);
/*
 * Release the block we hold on lfs_newseg wrapping. Called on file close,
 * or explicitly from LFCNWRAPGO. Called with the interlock held.
 */
lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
	if (fs->lfs_stoplwp != curlwp)

	fs->lfs_stoplwp = NULL;
	cv_signal(&fs->lfs_stopcv);

	KASSERT(fs->lfs_nowrap > 0);
	if (fs->lfs_nowrap <= 0) {

	if (--fs->lfs_nowrap == 0) {
		log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
		wakeup(&fs->lfs_wrappass);
		lfs_wakeup_cleaner(fs);

		mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment",
	struct vop_close_args /* {
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
	    fs->lfs_stoplwp == curlwp) {
		mutex_enter(&lfs_lock);
		log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
		lfs_wrapgo(fs, ip, 0);
		mutex_exit(&lfs_lock);

	if (vp == ip->i_lfs->lfs_ivnode &&
	    vp->v_mount->mnt_iflag & IMNT_UNMOUNT)

	if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
		LFS_ITIMES(ip, NULL, NULL, NULL);
/*
 * Close wrapper for special devices.
 *
 * Update the times on the inode then do device close.
 */
lfsspec_close(void *v)
	struct vop_close_args /* {
		kauth_cred_t a_cred;

	if (vp->v_usecount > 1) {
		LFS_ITIMES(ip, NULL, NULL, NULL);

	return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));

/*
 * Close wrapper for fifo's.
 *
 * Update the times on the inode then do device close.
 */
lfsfifo_close(void *v)
	struct vop_close_args /* {

	if (ap->a_vp->v_usecount > 1) {
		LFS_ITIMES(ip, NULL, NULL, NULL);

	return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
/*
 * Reclaim an inode so that it can be used for other purposes.
 */
lfs_reclaim(void *v)
	struct vop_reclaim_args /* {
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	mutex_enter(&lfs_lock);
	LFS_CLR_UINO(ip, IN_ALLMOD);
	mutex_exit(&lfs_lock);
	if ((error = ufs_reclaim(vp)))

	/*
	 * Take us off the paging and/or dirop queues if we were on them.
	 * We shouldn't be on them.
	 */
	mutex_enter(&lfs_lock);
	if (ip->i_flags & IN_PAGING) {
		log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
		ip->i_flags &= ~IN_PAGING;
		TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
	if (vp->v_uflag & VU_DIROP) {
		panic("reclaimed vnode is VU_DIROP");
		vp->v_uflag &= ~VU_DIROP;
		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
	mutex_exit(&lfs_lock);

	pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
	lfs_deregister_all(vp);
	pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
	ip->inode_ext.lfs = NULL;
	genfs_node_destroy(vp);
	pool_put(&lfs_inode_pool, vp->v_data);
/*
 * Read a block from a storage device.
 * In order to avoid reading blocks that are in the process of being
 * written by the cleaner---and hence are not mutexed by the normal
 * buffer cache / page cache mechanisms---check for collisions before
 *
 * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
 * the active cleaner test.
 *
 * XXX This code assumes that lfs_markv makes synchronous checkpoints.
 */
lfs_strategy(void *v)
	struct vop_strategy_args /* {
	int i, sn, error, slept;

	/* lfs uses its strategy routine only for read */
	KASSERT(bp->b_flags & B_READ);

	if (vp->v_type == VBLK || vp->v_type == VCHR)
		panic("lfs_strategy: spec");
	KASSERT(bp->b_bcount != 0);
	if (bp->b_blkno == bp->b_lblkno) {
		error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
			bp->b_error = error;
			bp->b_resid = bp->b_bcount;

	if ((long)bp->b_blkno == -1) /* no valid data */

	if ((long)bp->b_blkno < 0) { /* block is not on disk */
		bp->b_resid = bp->b_bcount;

	mutex_enter(&lfs_lock);
	while (slept && fs->lfs_seglock) {
		mutex_exit(&lfs_lock);
		/*
		 * Look through list of intervals.
		 * There will only be intervals to look through
		 * if the cleaner holds the seglock.
		 * Since the cleaner is synchronous, we can trust
		 * the list of intervals to be current.
		 */
		tbn = dbtofsb(fs, bp->b_blkno);
		sn = dtosn(fs, tbn);
		for (i = 0; i < fs->lfs_cleanind; i++) {
			if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
			    tbn >= fs->lfs_cleanint[i]) {
				    "lfs_strategy: ino %d lbn %" PRId64
				    " ind %d sn %d fsb %" PRIx32
				    " given sn %d fsb %" PRIx64 "\n",
				    ip->i_number, bp->b_lblkno, i,
				    dtosn(fs, fs->lfs_cleanint[i]),
				    fs->lfs_cleanint[i], sn, tbn));
				    "lfs_strategy: sleeping on ino %d lbn %"
				    PRId64 "\n", ip->i_number, bp->b_lblkno));
				mutex_enter(&lfs_lock);
				if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
					/* Cleaner can't wait for itself */
					mtsleep(&fs->lfs_iocount,
						(PRIBIO + 1) | PNORELOCK,
				} else if (fs->lfs_seglock) {
					mtsleep(&fs->lfs_seglock,
						(PRIBIO + 1) | PNORELOCK,
		mutex_exit(&lfs_lock);

		mutex_enter(&lfs_lock);
	mutex_exit(&lfs_lock);

	VOP_STRATEGY(vp, bp);
lfs_flush_dirops(struct lfs *fs)
	struct inode *ip, *nip;
	extern int lfs_dostats;

	ASSERT_MAYBE_SEGLOCK(fs);
	KASSERT(fs->lfs_nadirop == 0);

	mutex_enter(&lfs_lock);
	if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
		mutex_exit(&lfs_lock);
	mutex_exit(&lfs_lock);

		++lfs_stats.flush_invoked;

	/*
	 * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
	 * Technically this is a checkpoint (the on-disk state is valid)
	 * even though we are leaving out all the file data.
	 */
	lfs_seglock(fs, SEGM_CKP);

	/*
	 * lfs_writevnodes, optimized to get dirops out of the way.
	 * Only write dirops, and don't flush files' pages, only
	 * blocks from the directories.
	 *
	 * We don't need to vref these files because they are
	 * dirops and so hold an extra reference until the
	 * segunlock clears them of that status.
	 *
	 * We don't need to check for IN_ADIROP because we know that
	 * no dirops are active.
	 */
	mutex_enter(&lfs_lock);
	for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
		nip = TAILQ_NEXT(ip, i_lfs_dchain);
		mutex_exit(&lfs_lock);

		KASSERT((ip->i_flag & IN_ADIROP) == 0);

		/*
		 * All writes to directories come from dirops; all
		 * writes to files' direct blocks go through the page
		 * cache, which we're not touching. Reads to files
		 * and/or directories will not be affected by writing
		 * directory blocks inodes and file inodes. So we don't
		 * really need to lock. If we don't lock, though,
		 * make sure that we don't clear IN_MODIFIED
		 */
		if (vp->v_iflag & VI_XLOCK) {
			mutex_enter(&lfs_lock);

		waslocked = VOP_ISLOCKED(vp);
		if (vp->v_type != VREG &&
		    ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
			lfs_writefile(fs, sp, vp);
			if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
			    !(ip->i_flag & IN_ALLMOD)) {
				mutex_enter(&lfs_lock);
				LFS_SET_UINO(ip, IN_MODIFIED);
				mutex_exit(&lfs_lock);
		KDASSERT(ip->i_number != LFS_IFILE_INUM);
		(void) lfs_writeinode(fs, sp, ip);
		mutex_enter(&lfs_lock);
		if (waslocked == LK_EXCLOTHER)
			LFS_SET_UINO(ip, IN_MODIFIED);
	mutex_exit(&lfs_lock);
	/* We've written all the dirops there are */
	((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
	lfs_finalize_fs_seguse(fs);
	(void) lfs_writeseg(fs, sp);
/*
 * Flush all vnodes for which the pagedaemon has requested pageouts.
 * Skip over any files that are marked VU_DIROP (since lfs_flush_dirop()
 * has just run, this would be an error). If we have to skip a vnode
 * for any reason, just skip it; if we have to wait for the cleaner,
 * abort. The writer daemon will call us again later.
 */
lfs_flush_pchain(struct lfs *fs)
	struct inode *ip, *nip;
	extern int lfs_dostats;

	ASSERT_NO_SEGLOCK(fs);

	mutex_enter(&lfs_lock);
	if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
		mutex_exit(&lfs_lock);
	mutex_exit(&lfs_lock);

	/* Get dirops out of the way */
	lfs_flush_dirops(fs);

		++lfs_stats.flush_invoked;

	/*
	 * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
	 */

	/*
	 * lfs_writevnodes, optimized to clear pageout requests.
	 * Only write non-dirop files that are in the pageout queue.
	 * We're very conservative about what we write; we want to be
	 */
	mutex_enter(&lfs_lock);
	for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
		nip = TAILQ_NEXT(ip, i_lfs_pchain);

		if (!(ip->i_flags & IN_PAGING))

		mutex_enter(&vp->v_interlock);
		if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) {
			mutex_exit(&vp->v_interlock);
		if (vp->v_type != VREG) {
			mutex_exit(&vp->v_interlock);

		mutex_exit(&lfs_lock);

		if (VOP_ISLOCKED(vp)) {
			mutex_enter(&lfs_lock);

		error = lfs_writefile(fs, sp, vp);
		if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
		    !(ip->i_flag & IN_ALLMOD)) {
			mutex_enter(&lfs_lock);
			LFS_SET_UINO(ip, IN_MODIFIED);
			mutex_exit(&lfs_lock);
		KDASSERT(ip->i_number != LFS_IFILE_INUM);
		(void) lfs_writeinode(fs, sp, ip);

		if (error == EAGAIN) {
			lfs_writeseg(fs, sp);
			mutex_enter(&lfs_lock);

		mutex_enter(&lfs_lock);
	mutex_exit(&lfs_lock);
	(void) lfs_writeseg(fs, sp);
/*
 * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
 */
	struct vop_fcntl_args /* {
		kauth_cred_t a_cred;
	struct timeval *tvp;
	int blkcnt, error, oclean;
	struct lfs_fcntl_markv blkvp;

	/* Only respect LFS fcntls on fs root or Ifile */
	if (VTOI(ap->a_vp)->i_number != ROOTINO &&
	    VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
		return ufs_fcntl(v);

	/* Avoid locking a draining lock */
	if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {

	/* LFS control and monitoring fcntls are available only to root */
	if (((ap->a_command & 0xff00) >> 8) == 'L' &&
	    (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,

	fs = VTOI(ap->a_vp)->i_lfs;
	fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;

	switch ((int)ap->a_command) {
	case LFCNSEGWAITALL_COMPAT_50:
	case LFCNSEGWAITALL_COMPAT:
	case LFCNSEGWAIT_COMPAT_50:
	case LFCNSEGWAIT_COMPAT:
		struct timeval50 *tvp50
			= (struct timeval50 *)ap->a_data;
		timeval50_to_timeval(tvp50, &tv);
		goto segwait_common;
	case LFCNSEGWAITALL:
		tvp = (struct timeval *)ap->a_data;

		mutex_enter(&lfs_lock);
		mutex_exit(&lfs_lock);

		error = lfs_segwait(fsidp, tvp);

		mutex_enter(&lfs_lock);
		if (--fs->lfs_sleepers == 0)
			wakeup(&fs->lfs_sleepers);
		mutex_exit(&lfs_lock);

		blkvp = *(struct lfs_fcntl_markv *)ap->a_data;

		blkcnt = blkvp.blkcnt;
		if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
		if ((error = copyin(blkvp.blkiov, blkiov,
		    blkcnt * sizeof(BLOCK_INFO))) != 0) {
			lfs_free(fs, blkiov, LFS_NB_BLKIOV);

		mutex_enter(&lfs_lock);
		mutex_exit(&lfs_lock);
		if (ap->a_command == LFCNBMAPV)
			error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
		else /* LFCNMARKV */
			error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
			error = copyout(blkiov, blkvp.blkiov,
					blkcnt * sizeof(BLOCK_INFO));
		mutex_enter(&lfs_lock);
		if (--fs->lfs_sleepers == 0)
			wakeup(&fs->lfs_sleepers);
		mutex_exit(&lfs_lock);
		lfs_free(fs, blkiov, LFS_NB_BLKIOV);

		/*
		 * Flush dirops and write Ifile, allowing empty segments
		 * to be immediately reclaimed.
		 */
		lfs_writer_enter(fs, "pndirop");
		off = fs->lfs_offset;
		lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
		lfs_flush_dirops(fs);
		LFS_CLEANERINFO(cip, fs, bp);
		oclean = cip->clean;
		LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
		lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
		fs->lfs_sp->seg_flags |= SEGM_PROT;
		lfs_writer_leave(fs);

		LFS_CLEANERINFO(cip, fs, bp);
		DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
		      " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
		      fs->lfs_offset - off, cip->clean - oclean,
		LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);

	case LFCNIFILEFH_COMPAT:
		/* Return the filehandle of the Ifile */
		if ((error = kauth_authorize_system(l->l_cred,
		     KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0)
		fhp = (struct fhandle *)ap->a_data;
		fhp->fh_fsid = *fsidp;
		fh_size = 16;	/* former VFS_MAXFIDSIZ */
		return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);

	case LFCNIFILEFH_COMPAT2:
		/* Return the filehandle of the Ifile */
		fhp = (struct fhandle *)ap->a_data;
		fhp->fh_fsid = *fsidp;
		fh_size = sizeof(struct lfs_fhandle) -
			offsetof(fhandle_t, fh_fid);
		return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);

		/* Move lfs_offset to the lowest-numbered segment */
		return lfs_rewind(fs, *(int *)ap->a_data);

		/* Mark a segment SEGUSE_INVAL */
		LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
		if (sup->su_nbytes > 0) {
			lfs_unset_inval_all(fs);
		sup->su_flags |= SEGUSE_INVAL;

		/* Resize the filesystem */
		return lfs_resize_fs(fs, *(int *)ap->a_data);

	case LFCNWRAPSTOP_COMPAT:
		/*
		 * Hold lfs_newseg at segment 0; if requested, sleep until
		 * the filesystem wraps around. To support external agents
		 * (dump, fsck-based regression test) that need to look at
		 * a snapshot of the filesystem, without necessarily
		 * requiring that all fs activity stops.
		 */
		if (fs->lfs_stoplwp == curlwp)

		mutex_enter(&lfs_lock);
		while (fs->lfs_stoplwp != NULL)
			cv_wait(&fs->lfs_stopcv, &lfs_lock);
		fs->lfs_stoplwp = curlwp;
		if (fs->lfs_nowrap == 0)
			log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
		if (*(int *)ap->a_data == 1
		    || ap->a_command == LFCNWRAPSTOP_COMPAT) {
			log(LOG_NOTICE, "LFCNSTOPWRAP waiting for log wrap\n");
			error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
				"segwrap", 0, &lfs_lock);
			log(LOG_NOTICE, "LFCNSTOPWRAP done waiting\n");
				lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
		mutex_exit(&lfs_lock);

	case LFCNWRAPGO_COMPAT:
		/*
		 * Having done its work, the agent wakes up the writer.
		 * If the argument is 1, it sleeps until a new segment
		 */
		mutex_enter(&lfs_lock);
		error = lfs_wrapgo(fs, VTOI(ap->a_vp),
				   ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
				   *((int *)ap->a_data));
		mutex_exit(&lfs_lock);

		if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
		mutex_enter(&lfs_lock);
		if (fs->lfs_stoplwp != curlwp) {
			mutex_exit(&lfs_lock);
		if (fs->lfs_nowrap == 0) {
			mutex_exit(&lfs_lock);
		fs->lfs_wrappass = 1;
		wakeup(&fs->lfs_wrappass);
		/* Wait for the log to wrap, if asked */
		if (*(int *)ap->a_data) {
			mutex_enter(&ap->a_vp->v_interlock);
			VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
			log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
			error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
				"segwrap", 0, &lfs_lock);
			log(LOG_NOTICE, "LFCNPASS done waiting\n");
			VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
			lfs_vunref(ap->a_vp);
		mutex_exit(&lfs_lock);

	case LFCNWRAPSTATUS:
		mutex_enter(&lfs_lock);
		*(int *)ap->a_data = fs->lfs_wrapstatus;
		mutex_exit(&lfs_lock);

		return ufs_fcntl(v);
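/*
 * Rough usage sketch, based on how the LFS cleaner daemon is understood to
 * drive these fcntls; the path, command choice and error handling below are
 * illustrative only and not taken from this file.  External agents issue
 * the LFCN* commands through fcntl(2) on a descriptor for the filesystem
 * root or the Ifile:
 *
 *	int fd = open("/lfs", O_RDONLY);		- fs root -
 *	struct timeval tv = { 0, 0 };
 *	if (fcntl(fd, LFCNSEGWAITALL, &tv) == -1)	- wait for a segment write -
 *		err(1, "LFCNSEGWAITALL");
 *
 * Commands in the 'L' group are rejected for non-root users by the
 * kauth_authorize_generic() check near the top of lfs_fcntl().
 */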
lfs_getpages(void *v)
	struct vop_getpages_args /* {
		struct vm_page **a_m;
		vm_prot_t a_access_type;

	if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
	    (ap->a_access_type & VM_PROT_WRITE) != 0) {

	if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
		mutex_enter(&lfs_lock);
		LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
		mutex_exit(&lfs_lock);

	/*
	 * we're relying on the fact that genfs_getpages() always reads in
	 * entire filesystem blocks.
	 */
	return genfs_getpages(v);
/*
 * Wait for a page to become unbusy, possibly printing diagnostic messages
 *
 * Called with vp->v_interlock held; return with it held.
 */
wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
	if ((pg->flags & PG_BUSY) == 0)
		return;		/* Nothing to wait for! */

#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
	static struct vm_page *lastpg;

	if (label != NULL && pg != lastpg) {
		if (pg->owner_tag) {
			printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
			       curproc->p_pid, curlwp->l_lid, label,
			       pg, pg->owner, pg->lowner, pg->owner_tag);
			printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
			       curproc->p_pid, curlwp->l_lid, label, pg);

	pg->flags |= PG_WANTED;
	UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0, "lfsput", 0);
	mutex_enter(&vp->v_interlock);
/*
 * This routine is called by lfs_putpages() when it can't complete the
 * write because a page is busy. This means that either (1) someone,
 * possibly the pagedaemon, is looking at this page, and will give it up
 * presently; or (2) we ourselves are holding the page busy in the
 * process of being written (either gathered or actually on its way to
 * disk). We don't need to give up the segment lock, but we might need
 * to call lfs_writeseg() to expedite the page's journey to disk.
 *
 * Called with vp->v_interlock held; return with it held.
 */
/* #define BUSYWAIT */
write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
	       int seglocked, const char *label)
	struct inode *ip = VTOI(vp);
	struct segment *sp = fs->lfs_sp;

	while (pg->flags & PG_BUSY &&
	       pg->uobject == &vp->v_uobj) {
		mutex_exit(&vp->v_interlock);
		if (sp->cbpp - sp->bpp > 1) {
			/* Write gathered pages */
			lfs_release_finfo(fs);
			(void) lfs_writeseg(fs, sp);

			KASSERT(sp->vp == vp);
			lfs_acquire_finfo(fs, ip->i_number,

		mutex_enter(&vp->v_interlock);
		wait_for_page(vp, pg, label);

	if (label != NULL && count > 1)
		printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
		       label, (count > 0 ? "looping, " : ""), count);
/*
 * Make sure that for all pages in every block in the given range,
 * either all are dirty or all are clean. If any of the pages
 * we've seen so far are dirty, put the vnode on the paging chain,
 * and mark it IN_PAGING.
 *
 * If checkfirst != 0, don't check all the pages but return at the
 */
check_dirty(struct lfs *fs, struct vnode *vp,
	    off_t startoffset, off_t endoffset, off_t blkeof,
	    int flags, int checkfirst, struct vm_page **pgp)
	struct vm_page *curpg = NULL; /* XXX: gcc */
	struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
	off_t soff = 0; /* XXX: gcc */
	int any_dirty;	/* number of dirty pages */
	int dirty;	/* number of dirty pages in a block */
	int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
	int pagedaemon = (curlwp == uvm.pagedaemon_lwp);

	ASSERT_MAYBE_SEGLOCK(fs);

	by_list = (vp->v_uobj.uo_npages <=
		   ((endoffset - startoffset) >> PAGE_SHIFT) *
		   UVM_PAGE_TREE_PENALTY);

		curpg = TAILQ_FIRST(&vp->v_uobj.memq);

	while (by_list || soff < MIN(blkeof, endoffset)) {
		/*
		 * Find the first page in a block. Skip
		 * blocks outside our area of interest or beyond
		 */
		if (pages_per_block > 1) {
			    ((curpg->offset & fs->lfs_bmask) ||
			     curpg->offset >= vp->v_size ||
			     curpg->offset >= endoffset))
				curpg = TAILQ_NEXT(curpg, listq.queue);

			soff = curpg->offset;

		/*
		 * Mark all pages in extended range busy; find out if any
		 * of them are dirty.
		 */
		nonexistent = dirty = 0;
		for (i = 0; i == 0 || i < pages_per_block; i++) {
			if (by_list && pages_per_block <= 1) {
				pgs[i] = pg = curpg;
				off = soff + (i << PAGE_SHIFT);
				pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);

			KASSERT(pg != NULL);

			/*
			 * If we're holding the segment lock, we can deadlock
			 * against a process that has our page and is waiting
			 * for the cleaner, while the cleaner waits for the
			 * segment lock. Just bail in that case.
			 */
			if ((pg->flags & PG_BUSY) &&
			    (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
					uvm_page_unbusy(pgs, i);
				DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));

			while (pg->flags & PG_BUSY) {
				wait_for_page(vp, pg, NULL);
					uvm_page_unbusy(pgs, i);

			pg->flags |= PG_BUSY;
			UVM_PAGE_OWN(pg, "lfs_putpages");

			pmap_page_protect(pg, VM_PROT_NONE);
			tdirty = (pmap_clear_modify(pg) ||
				  (pg->flags & PG_CLEAN) == 0);

		if (pages_per_block > 0 && nonexistent >= pages_per_block) {
				curpg = TAILQ_NEXT(curpg, listq.queue);
				soff += fs->lfs_bsize;

		KASSERT(nonexistent == 0);

		/*
		 * If any are dirty make all dirty; unbusy them,
		 * but if we were asked to clean, wire them so that
		 * the pagedaemon doesn't bother us about them while
		 * they're on their way to disk.
		 */
		for (i = 0; i == 0 || i < pages_per_block; i++) {
			KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
				pg->flags &= ~PG_CLEAN;
				if (flags & PGO_FREE) {
					/*
					 * Wire the page so that
					 * pdaemon doesn't see it again.
					 */
					mutex_enter(&uvm_pageqlock);
					mutex_exit(&uvm_pageqlock);

					/* Suspended write flag */
					pg->flags |= PG_DELWRI;
			if (pg->flags & PG_WANTED)
			pg->flags &= ~(PG_WANTED|PG_BUSY);
			UVM_PAGE_OWN(pg, NULL);

		if (checkfirst && any_dirty)

			curpg = TAILQ_NEXT(curpg, listq.queue);
			soff += MAX(PAGE_SIZE, fs->lfs_bsize);
/*
 * lfs_putpages functions like genfs_putpages except that
 *
 * (1) It needs to bounds-check the incoming requests to ensure that
 *     they are block-aligned; if they are not, expand the range and
 *     do the right thing in case, e.g., the requested range is clean
 *     but the expanded range is dirty.
 *
 * (2) It needs to explicitly send blocks to be written when it is done.
 *     If VOP_PUTPAGES is called without the seglock held, we simply take
 *     the seglock and let lfs_segunlock wait for us.
 *     XXX There might be a bad situation if we have to flush a vnode while
 *     XXX lfs_markv is in operation. As of this writing we panic in this
 *
 * (1) The caller does not hold any pages in this vnode busy. If it does,
 *     there is a danger that when we expand the page range and busy the
 *     pages we will deadlock.
 *
 * (2) We are called with vp->v_interlock held; we must return with it
 *
 * (3) We don't absolutely have to free pages right away, provided that
 *     the request does not have PGO_SYNCIO. When the pagedaemon gives
 *     us a request with PGO_FREE, we take the pages out of the paging
 *     queue and wake up the writer, which will handle freeing them for us.
 *
 *     We ensure that for any filesystem block, all pages for that
 *     block are either resident or not, even if those pages are higher
 *     than EOF; that means that we will be getting requests to free
 *     "unused" pages above EOF all the time, and should ignore them.
 *
 * (4) If we are called with PGO_LOCKED, the finfo array we are to write
 *     into has been set up for us by lfs_writefile. If not, we will
 *     have to handle allocating and/or freeing an finfo entry.
 *
 * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
 */

/* How many times to loop before we should start to worry */

lfs_putpages(void *v)
	struct vop_putpages_args /* {
	off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
	off_t off, max_endoffset;
	bool seglocked, sync, pagedaemon;
	struct vm_page *pg, *busypg;
	UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
	int debug_n_again, debug_n_dirtyclean;

	sync = (ap->a_flags & PGO_SYNCIO) != 0;
	pagedaemon = (curlwp == uvm.pagedaemon_lwp);

	/* Putpages does nothing for metadata. */
	if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
		mutex_exit(&vp->v_interlock);

	/*
	 * If there are no pages, don't do anything.
	 */
	if (vp->v_uobj.uo_npages == 0) {
		if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
		    (vp->v_iflag & VI_ONWORKLST) &&
		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
			vp->v_iflag &= ~VI_WRMAPDIRTY;
			vn_syncer_remove_from_worklist(vp);
		mutex_exit(&vp->v_interlock);

		/* Remove us from paging queue, if we were on it */
		mutex_enter(&lfs_lock);
		if (ip->i_flags & IN_PAGING) {
			ip->i_flags &= ~IN_PAGING;
			TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		mutex_exit(&lfs_lock);

	blkeof = blkroundup(fs, ip->i_size);

	/*
	 * Ignore requests to free pages past EOF but in the same block
	 * as EOF, unless the request is synchronous. (If the request is
	 * sync, it comes from lfs_truncate.)
	 * XXXUBC Make these pages look "active" so the pagedaemon won't
	 * XXXUBC bother us with them again.
	 */
	if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
		origoffset = ap->a_offlo;
		for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
			pg = uvm_pagelookup(&vp->v_uobj, off);
			KASSERT(pg != NULL);
			while (pg->flags & PG_BUSY) {
				pg->flags |= PG_WANTED;
				UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
				mutex_enter(&vp->v_interlock);
			mutex_enter(&uvm_pageqlock);
			uvm_pageactivate(pg);
			mutex_exit(&uvm_pageqlock);

		ap->a_offlo = blkeof;
		if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
			mutex_exit(&vp->v_interlock);

	/*
	 * Extend page range to start and end at block boundaries.
	 * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
	 */
	origoffset = ap->a_offlo;
	origendoffset = ap->a_offhi;
	startoffset = origoffset & ~(fs->lfs_bmask);
	max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)

	if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
		endoffset = max_endoffset;
		origendoffset = endoffset;
		origendoffset = round_page(ap->a_offhi);
		endoffset = round_page(blkroundup(fs, origendoffset));

	KASSERT(startoffset > 0 || endoffset >= startoffset);
	if (startoffset == endoffset) {
		/* Nothing to do, why were we called? */
		mutex_exit(&vp->v_interlock);
		DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
		      PRId64 "\n", startoffset));

	ap->a_offlo = startoffset;
	ap->a_offhi = endoffset;

	/*
	 * If not cleaning, just send the pages through genfs_putpages
	 * to be returned to the pool.
	 */
	if (!(ap->a_flags & PGO_CLEANIT))
		return genfs_putpages(v);

	/* Set PGO_BUSYFAIL to avoid deadlocks */
	ap->a_flags |= PGO_BUSYFAIL;

	/*
	 * Likewise, if we are asked to clean but the pages are not
	 * dirty, we can just free them using genfs_putpages.
	 */
	debug_n_dirtyclean = 0;

		/* Count the number of dirty pages */
		r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
				ap->a_flags, 1, NULL);
			/* Pages are busy with another process */
			mutex_exit(&vp->v_interlock);
		if (r > 0) /* Some pages are dirty */

		/*
		 * Sometimes pages are dirtied between the time that
		 * we check and the time we try to clean them.
		 * Instruct lfs_gop_write to return EDEADLK in this case
		 * so we can write them properly.
		 */
		ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
		r = genfs_do_putpages(vp, startoffset, endoffset,
				      ap->a_flags & ~PGO_SYNCIO, &busypg);
		ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;

		/* One of the pages was busy. Start over. */
		mutex_enter(&vp->v_interlock);
		wait_for_page(vp, busypg, "dirtyclean");
		++debug_n_dirtyclean;

	if (debug_n_dirtyclean > TOOMANY)
		printf("lfs_putpages: dirtyclean: looping, n = %d\n",
		       debug_n_dirtyclean);

	/*
	 * Dirty and asked to clean.
	 *
	 * Pagedaemon can't actually write LFS pages; wake up
	 * the writer to take care of that. The writer will
	 * notice the pager inode queue and act on that.
	 */
		mutex_enter(&lfs_lock);
		if (!(ip->i_flags & IN_PAGING)) {
			ip->i_flags |= IN_PAGING;
			TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		wakeup(&lfs_writer_daemon);
		mutex_exit(&lfs_lock);
		mutex_exit(&vp->v_interlock);

	/*
	 * If this is a file created in a recent dirop, we can't flush its
	 * inode until the dirop is complete. Drain dirops, then flush the
	 * filesystem (taking care of any other pending dirops while we're
	 */
	if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
	    (vp->v_uflag & VU_DIROP)) {
		DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
		locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
		mutex_exit(&vp->v_interlock);
		lfs_writer_enter(fs, "ppdirop");
			VOP_UNLOCK(vp, 0);	/* XXX why? */

		mutex_enter(&lfs_lock);
		lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
		mutex_exit(&lfs_lock);

		mutex_enter(&vp->v_interlock);
			VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
			mutex_enter(&vp->v_interlock);
		lfs_writer_leave(fs);

		/* XXX the flush should have taken care of this one too! */

	/*
	 * This is it. We are going to write some pages. From here on
	 * down it's all just mechanics.
	 *
	 * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
	 */
	ap->a_flags &= ~PGO_SYNCIO;

	/*
	 * If we've already got the seglock, flush the node and return.
	 * The FIP has already been set up for us by lfs_writefile,
	 * and FIP cleanup and lfs_updatemeta will also be done there,
	 * unless genfs_putpages returns EDEADLK; then we must flush
	 * what we have, and correct FIP and segment header accounting.
	 */

	/*
	 * If we are not called with the segment locked, lock it.
	 * Account for a new FIP in the segment header, and set sp->vp.
	 * (This should duplicate the setup at the top of lfs_writefile().)
	 */
	seglocked = (ap->a_flags & PGO_LOCKED) != 0;
		mutex_exit(&vp->v_interlock);
		error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
		mutex_enter(&vp->v_interlock);
		lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);

	KASSERT(sp->vp == NULL);

	/*
	 * Ensure that the partial segment is marked SS_DIROP if this
	 */
	if (!seglocked && vp->v_uflag & VU_DIROP)
		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);

	/*
	 * Loop over genfs_putpages until all pages are gathered.
	 * genfs_putpages() drops the interlock, so reacquire it if necessary.
	 * Whenever we lose the interlock we have to rerun check_dirty, as
	 * well, since more pages might have been dirtied in our absence.
	 */
		if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
				ap->a_flags, 0, &busypg) < 0) {
			mutex_exit(&vp->v_interlock);

			mutex_enter(&vp->v_interlock);
			write_and_wait(fs, vp, busypg, seglocked, NULL);

				mutex_exit(&vp->v_interlock);
				lfs_release_finfo(fs);

			mutex_enter(&vp->v_interlock);

		error = genfs_do_putpages(vp, startoffset, endoffset,
					  ap->a_flags, &busypg);

		if (error == EDEADLK || error == EAGAIN) {
			DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
			      " %d ino %d off %x (seg %d)\n", error,
			      ip->i_number, fs->lfs_offset,
			      dtosn(fs, fs->lfs_offset)));

			mutex_enter(&vp->v_interlock);
			write_and_wait(fs, vp, busypg, seglocked, "again");
	} while (error == EDEADLK);

	if (debug_n_again > TOOMANY)
		printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);

	KASSERT(sp != NULL && sp->vp == vp);

		/* Write indirect blocks as well */
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
		lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);

	KASSERT(sp->vp == NULL);

	/*
	 * Blocks are now gathered into a segment waiting to be written.
	 * All that's left to do is update metadata, and write them.
	 */
	KASSERT(sp->vp == vp);

	/*
	 * If we were called from lfs_writefile, we don't need to clean up
	 * the FIP or unlock the segment lock. We're done.
	 */

	/* Clean up FIP and send it to disk. */
	lfs_release_finfo(fs);
	lfs_writeseg(fs, fs->lfs_sp);

	/*
	 * Remove us from paging queue if we wrote all our pages.
	 */
	if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
		mutex_enter(&lfs_lock);
		if (ip->i_flags & IN_PAGING) {
			ip->i_flags &= ~IN_PAGING;
			TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
		mutex_exit(&lfs_lock);

	/*
	 * XXX - with the malloc/copy writeseg, the pages are freed by now
	 * even if we don't wait (e.g. if we hold a nested lock). This
	 * will not be true if we stop using malloc/copy.
	 */
	KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);

	/*
	 * Wait for v_numoutput to drop to zero. The seglock should
	 * take care of this, but there is a slight possibility that
	 * aiodoned might not have got around to our buffers yet.
	 */
	mutex_enter(&vp->v_interlock);
	while (vp->v_numoutput > 0) {
		DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
		      " num %d\n", ip->i_number, vp->v_numoutput));
		cv_wait(&vp->v_cv, &vp->v_interlock);
	mutex_exit(&vp->v_interlock);
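/*
 * Rough summary of the path above, restating what the code already does:
 * check_dirty() forces whole-block dirtiness and busies the pages,
 * genfs_do_putpages() gathers them into the current partial segment,
 * lfs_gather() picks up the indirect blocks, and lfs_writeseg() (here
 * directly, or via lfs_segunlock() when the caller held the seglock)
 * pushes the accumulated segment to disk.
 */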
/*
 * Return the last logical file offset that should be written for this file
 * if we're doing a write that ends at "size". If writing, we need to know
 * about sizes on disk, i.e. fragments if there are any; if reading, we need
 * to know about entire blocks.
 */
lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
	struct inode *ip = VTOI(vp);
	struct lfs *fs = ip->i_lfs;

	olbn = lblkno(fs, ip->i_size);
	nlbn = lblkno(fs, size);
	if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) {
		*eobp = fragroundup(fs, size);
		*eobp = blkroundup(fs, size);
void lfs_dump_vop(void *);

lfs_dump_vop(void *v)
	struct vop_putpages_args /* {

	vfs_vnode_print(ap->a_vp, 0, printf);
	lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din);
	struct vop_mmap_args /* {
		const struct vnodeop_desc *a_desc;
		kauth_cred_t a_cred;

	if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM)