1 /* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant@hhhh.org>.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 * Copyright (c) 1986, 1989, 1991, 1993, 1995
33 * The Regents of the University of California. All rights reserved.
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
62 #include <sys/cdefs.h>
63 __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
66 #include "opt_compat_netbsd.h"
67 #include "opt_uvm_page_trkown.h"
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/namei.h>
73 #include <sys/resourcevar.h>
74 #include <sys/kernel.h>
79 #include <sys/mount.h>
80 #include <sys/vnode.h>
82 #include <sys/signalvar.h>
83 #include <sys/kauth.h>
84 #include <sys/syslog.h>
85 #include <sys/fstrans.h>
87 #include <miscfs/fifofs/fifo.h>
88 #include <miscfs/genfs/genfs.h>
89 #include <miscfs/specfs/specdev.h>
91 #include <ufs/ufs/inode.h>
92 #include <ufs/ufs/dir.h>
93 #include <ufs/ufs/ufsmount.h>
94 #include <ufs/ufs/ufs_extern.h>
97 #include <uvm/uvm_pmap.h>
98 #include <uvm/uvm_stat.h>
99 #include <uvm/uvm_pager.h>
101 #include <ufs/lfs/lfs.h>
102 #include <ufs/lfs/lfs_extern.h>
/*
 * lfs_writer_daemon is defined elsewhere; in this file its address is used
 * as the wakeup() channel for the writer daemon.
 *
 * lfs_ignore_lazy_sync: the FSYNC_LAZY path in this file only queues the
 * inode for paging when this is 0, so the default of 1 disables that
 * queueing.
 */
104 extern pid_t lfs_writer_daemon
;
105 int lfs_ignore_lazy_sync
= 1;
107 /* Global vfs data structures for lfs. */
108 int (**lfs_vnodeop_p
)(void *);
109 const struct vnodeopv_entry_desc lfs_vnodeop_entries
[] = {
110 { &vop_default_desc
, vn_default_error
},
111 { &vop_lookup_desc
, ufs_lookup
}, /* lookup */
112 { &vop_create_desc
, lfs_create
}, /* create */
113 { &vop_whiteout_desc
, ufs_whiteout
}, /* whiteout */
114 { &vop_mknod_desc
, lfs_mknod
}, /* mknod */
115 { &vop_open_desc
, ufs_open
}, /* open */
116 { &vop_close_desc
, lfs_close
}, /* close */
117 { &vop_access_desc
, ufs_access
}, /* access */
118 { &vop_getattr_desc
, lfs_getattr
}, /* getattr */
119 { &vop_setattr_desc
, lfs_setattr
}, /* setattr */
120 { &vop_read_desc
, lfs_read
}, /* read */
121 { &vop_write_desc
, lfs_write
}, /* write */
122 { &vop_ioctl_desc
, ufs_ioctl
}, /* ioctl */
123 { &vop_fcntl_desc
, lfs_fcntl
}, /* fcntl */
124 { &vop_poll_desc
, ufs_poll
}, /* poll */
125 { &vop_kqfilter_desc
, genfs_kqfilter
}, /* kqfilter */
126 { &vop_revoke_desc
, ufs_revoke
}, /* revoke */
127 { &vop_mmap_desc
, lfs_mmap
}, /* mmap */
128 { &vop_fsync_desc
, lfs_fsync
}, /* fsync */
129 { &vop_seek_desc
, ufs_seek
}, /* seek */
130 { &vop_remove_desc
, lfs_remove
}, /* remove */
131 { &vop_link_desc
, lfs_link
}, /* link */
132 { &vop_rename_desc
, lfs_rename
}, /* rename */
133 { &vop_mkdir_desc
, lfs_mkdir
}, /* mkdir */
134 { &vop_rmdir_desc
, lfs_rmdir
}, /* rmdir */
135 { &vop_symlink_desc
, lfs_symlink
}, /* symlink */
136 { &vop_readdir_desc
, ufs_readdir
}, /* readdir */
137 { &vop_readlink_desc
, ufs_readlink
}, /* readlink */
138 { &vop_abortop_desc
, ufs_abortop
}, /* abortop */
139 { &vop_inactive_desc
, lfs_inactive
}, /* inactive */
140 { &vop_reclaim_desc
, lfs_reclaim
}, /* reclaim */
141 { &vop_lock_desc
, ufs_lock
}, /* lock */
142 { &vop_unlock_desc
, ufs_unlock
}, /* unlock */
143 { &vop_bmap_desc
, ufs_bmap
}, /* bmap */
144 { &vop_strategy_desc
, lfs_strategy
}, /* strategy */
145 { &vop_print_desc
, ufs_print
}, /* print */
146 { &vop_islocked_desc
, ufs_islocked
}, /* islocked */
147 { &vop_pathconf_desc
, ufs_pathconf
}, /* pathconf */
148 { &vop_advlock_desc
, ufs_advlock
}, /* advlock */
149 { &vop_bwrite_desc
, lfs_bwrite
}, /* bwrite */
150 { &vop_getpages_desc
, lfs_getpages
}, /* getpages */
151 { &vop_putpages_desc
, lfs_putpages
}, /* putpages */
154 const struct vnodeopv_desc lfs_vnodeop_opv_desc
=
155 { &lfs_vnodeop_p
, lfs_vnodeop_entries
};
157 int (**lfs_specop_p
)(void *);
158 const struct vnodeopv_entry_desc lfs_specop_entries
[] = {
159 { &vop_default_desc
, vn_default_error
},
160 { &vop_lookup_desc
, spec_lookup
}, /* lookup */
161 { &vop_create_desc
, spec_create
}, /* create */
162 { &vop_mknod_desc
, spec_mknod
}, /* mknod */
163 { &vop_open_desc
, spec_open
}, /* open */
164 { &vop_close_desc
, lfsspec_close
}, /* close */
165 { &vop_access_desc
, ufs_access
}, /* access */
166 { &vop_getattr_desc
, lfs_getattr
}, /* getattr */
167 { &vop_setattr_desc
, lfs_setattr
}, /* setattr */
168 { &vop_read_desc
, ufsspec_read
}, /* read */
169 { &vop_write_desc
, ufsspec_write
}, /* write */
170 { &vop_ioctl_desc
, spec_ioctl
}, /* ioctl */
171 { &vop_fcntl_desc
, ufs_fcntl
}, /* fcntl */
172 { &vop_poll_desc
, spec_poll
}, /* poll */
173 { &vop_kqfilter_desc
, spec_kqfilter
}, /* kqfilter */
174 { &vop_revoke_desc
, spec_revoke
}, /* revoke */
175 { &vop_mmap_desc
, spec_mmap
}, /* mmap */
176 { &vop_fsync_desc
, spec_fsync
}, /* fsync */
177 { &vop_seek_desc
, spec_seek
}, /* seek */
178 { &vop_remove_desc
, spec_remove
}, /* remove */
179 { &vop_link_desc
, spec_link
}, /* link */
180 { &vop_rename_desc
, spec_rename
}, /* rename */
181 { &vop_mkdir_desc
, spec_mkdir
}, /* mkdir */
182 { &vop_rmdir_desc
, spec_rmdir
}, /* rmdir */
183 { &vop_symlink_desc
, spec_symlink
}, /* symlink */
184 { &vop_readdir_desc
, spec_readdir
}, /* readdir */
185 { &vop_readlink_desc
, spec_readlink
}, /* readlink */
186 { &vop_abortop_desc
, spec_abortop
}, /* abortop */
187 { &vop_inactive_desc
, lfs_inactive
}, /* inactive */
188 { &vop_reclaim_desc
, lfs_reclaim
}, /* reclaim */
189 { &vop_lock_desc
, ufs_lock
}, /* lock */
190 { &vop_unlock_desc
, ufs_unlock
}, /* unlock */
191 { &vop_bmap_desc
, spec_bmap
}, /* bmap */
192 { &vop_strategy_desc
, spec_strategy
}, /* strategy */
193 { &vop_print_desc
, ufs_print
}, /* print */
194 { &vop_islocked_desc
, ufs_islocked
}, /* islocked */
195 { &vop_pathconf_desc
, spec_pathconf
}, /* pathconf */
196 { &vop_advlock_desc
, spec_advlock
}, /* advlock */
197 { &vop_bwrite_desc
, vn_bwrite
}, /* bwrite */
198 { &vop_getpages_desc
, spec_getpages
}, /* getpages */
199 { &vop_putpages_desc
, spec_putpages
}, /* putpages */
202 const struct vnodeopv_desc lfs_specop_opv_desc
=
203 { &lfs_specop_p
, lfs_specop_entries
};
205 int (**lfs_fifoop_p
)(void *);
206 const struct vnodeopv_entry_desc lfs_fifoop_entries
[] = {
207 { &vop_default_desc
, vn_default_error
},
208 { &vop_lookup_desc
, vn_fifo_bypass
}, /* lookup */
209 { &vop_create_desc
, vn_fifo_bypass
}, /* create */
210 { &vop_mknod_desc
, vn_fifo_bypass
}, /* mknod */
211 { &vop_open_desc
, vn_fifo_bypass
}, /* open */
212 { &vop_close_desc
, lfsfifo_close
}, /* close */
213 { &vop_access_desc
, ufs_access
}, /* access */
214 { &vop_getattr_desc
, lfs_getattr
}, /* getattr */
215 { &vop_setattr_desc
, lfs_setattr
}, /* setattr */
216 { &vop_read_desc
, ufsfifo_read
}, /* read */
217 { &vop_write_desc
, ufsfifo_write
}, /* write */
218 { &vop_ioctl_desc
, vn_fifo_bypass
}, /* ioctl */
219 { &vop_fcntl_desc
, ufs_fcntl
}, /* fcntl */
220 { &vop_poll_desc
, vn_fifo_bypass
}, /* poll */
221 { &vop_kqfilter_desc
, vn_fifo_bypass
}, /* kqfilter */
222 { &vop_revoke_desc
, vn_fifo_bypass
}, /* revoke */
223 { &vop_mmap_desc
, vn_fifo_bypass
}, /* mmap */
224 { &vop_fsync_desc
, vn_fifo_bypass
}, /* fsync */
225 { &vop_seek_desc
, vn_fifo_bypass
}, /* seek */
226 { &vop_remove_desc
, vn_fifo_bypass
}, /* remove */
227 { &vop_link_desc
, vn_fifo_bypass
}, /* link */
228 { &vop_rename_desc
, vn_fifo_bypass
}, /* rename */
229 { &vop_mkdir_desc
, vn_fifo_bypass
}, /* mkdir */
230 { &vop_rmdir_desc
, vn_fifo_bypass
}, /* rmdir */
231 { &vop_symlink_desc
, vn_fifo_bypass
}, /* symlink */
232 { &vop_readdir_desc
, vn_fifo_bypass
}, /* readdir */
233 { &vop_readlink_desc
, vn_fifo_bypass
}, /* readlink */
234 { &vop_abortop_desc
, vn_fifo_bypass
}, /* abortop */
235 { &vop_inactive_desc
, lfs_inactive
}, /* inactive */
236 { &vop_reclaim_desc
, lfs_reclaim
}, /* reclaim */
237 { &vop_lock_desc
, ufs_lock
}, /* lock */
238 { &vop_unlock_desc
, ufs_unlock
}, /* unlock */
239 { &vop_bmap_desc
, vn_fifo_bypass
}, /* bmap */
240 { &vop_strategy_desc
, vn_fifo_bypass
}, /* strategy */
241 { &vop_print_desc
, ufs_print
}, /* print */
242 { &vop_islocked_desc
, ufs_islocked
}, /* islocked */
243 { &vop_pathconf_desc
, vn_fifo_bypass
}, /* pathconf */
244 { &vop_advlock_desc
, vn_fifo_bypass
}, /* advlock */
245 { &vop_bwrite_desc
, lfs_bwrite
}, /* bwrite */
246 { &vop_putpages_desc
, vn_fifo_bypass
}, /* putpages */
249 const struct vnodeopv_desc lfs_fifoop_opv_desc
=
250 { &lfs_fifoop_p
, lfs_fifoop_entries
};
252 static int check_dirty(struct lfs
*, struct vnode
*, off_t
, off_t
, off_t
, int, int, struct vm_page
**);
254 #define LFS_READWRITE
255 #include <ufs/ufs/ufs_readwrite.c>
259 * Synch an open file.
265 struct vop_fsync_args
/* {
272 struct vnode
*vp
= ap
->a_vp
;
274 struct inode
*ip
= VTOI(vp
);
275 struct lfs
*fs
= ip
->i_lfs
;
277 /* If we're mounted read-only, don't try to sync. */
281 /* If a removed vnode is being cleaned, no need to sync here. */
282 if ((ap
->a_flags
& FSYNC_RECLAIM
) != 0 && ip
->i_mode
== 0)
286 * Trickle sync simply adds this vnode to the pager list, as if
287 * the pagedaemon had requested a pageout.
289 if (ap
->a_flags
& FSYNC_LAZY
) {
290 if (lfs_ignore_lazy_sync
== 0) {
291 mutex_enter(&lfs_lock
);
292 if (!(ip
->i_flags
& IN_PAGING
)) {
293 ip
->i_flags
|= IN_PAGING
;
294 TAILQ_INSERT_TAIL(&fs
->lfs_pchainhd
, ip
,
297 wakeup(&lfs_writer_daemon
);
298 mutex_exit(&lfs_lock
);
304 * If a vnode is being cleaned, flush it out before we try to
305 * reuse it. This prevents the cleaner from writing files twice
306 * in the same partial segment, causing an accounting underflow.
308 if (ap
->a_flags
& FSYNC_RECLAIM
&& ip
->i_flags
& IN_CLEANING
) {
312 wait
= (ap
->a_flags
& FSYNC_WAIT
);
314 mutex_enter(vp
->v_interlock
);
315 error
= VOP_PUTPAGES(vp
, trunc_page(ap
->a_offlo
),
316 round_page(ap
->a_offhi
),
317 PGO_CLEANIT
| (wait
? PGO_SYNCIO
: 0));
318 if (error
== EAGAIN
) {
319 mutex_enter(&lfs_lock
);
320 mtsleep(&fs
->lfs_avail
, PCATCH
| PUSER
, "lfs_fsync",
321 hz
/ 100 + 1, &lfs_lock
);
322 mutex_exit(&lfs_lock
);
324 } while (error
== EAGAIN
);
328 if ((ap
->a_flags
& FSYNC_DATAONLY
) == 0)
329 error
= lfs_update(vp
, NULL
, NULL
, wait
? UPDATE_WAIT
: 0);
331 if (error
== 0 && ap
->a_flags
& FSYNC_CACHE
) {
333 error
= VOP_IOCTL(ip
->i_devvp
, DIOCCACHESYNC
, &l
, FWRITE
,
336 if (wait
&& !VPISEMPTY(vp
))
337 LFS_SET_UINO(ip
, IN_MODIFIED
);
343 * Take IN_ADIROP off, then call ufs_inactive.
/*
 * lfs_inactive: the "inactive" vnode operation for LFS; it is installed in
 * the regular, special-device and fifo operation tables in this file.
 *
 * The vnode's active-dirop mark is dropped first with lfs_unmark_vnode().
 * For the Ifile (i_number == LFS_IFILE_INUM) the IN_ALLMOD bits are cleared
 * under lfs_lock and the vnode is unlocked.  The final statement hands the
 * original argument block to ufs_inactive().
 *
 * NOTE(review): this extract omits some source lines.  The Ifile branch
 * presumably returns right after VOP_UNLOCK() instead of falling through to
 * ufs_inactive() -- confirm against the complete file.
 */
346 lfs_inactive(void *v
)
348 struct vop_inactive_args
/* {
352 lfs_unmark_vnode(ap
->a_vp
);
355 * The Ifile is only ever inactivated on unmount.
356 * Streamline this process by not giving it more dirty blocks.
358 if (VTOI(ap
->a_vp
)->i_number
== LFS_IFILE_INUM
) {
359 mutex_enter(&lfs_lock
);
360 LFS_CLR_UINO(VTOI(ap
->a_vp
), IN_ALLMOD
);
361 mutex_exit(&lfs_lock
);
362 VOP_UNLOCK(ap
->a_vp
);
366 return ufs_inactive(v
);
370 * These macros are used to bracket UFS directory ops, so that we can
371 * identify all the pages touched during directory ops which need to
372 * be ordered and flushed atomically, so that they may be recovered.
374 * Because we have to mark nodes VU_DIROP in order to prevent
375 * the cache from reclaiming them while a dirop is in progress, we must
376 * also manage the number of nodes so marked (otherwise we can run out).
377 * We do this by setting lfs_dirvcount to the number of marked vnodes; it
378 * is decremented during segment write, when VU_DIROP is taken off.
380 #define MARK_VNODE(vp) lfs_mark_vnode(vp)
381 #define UNMARK_VNODE(vp) lfs_unmark_vnode(vp)
382 #define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp))
383 #define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp))
384 static int lfs_set_dirop_create(struct vnode
*, struct vnode
**);
385 static int lfs_set_dirop(struct vnode
*, struct vnode
*);
/*
 * lfs_set_dirop: begin a directory operation on dvp (and optionally vp).
 *
 * dvp must be locked; vp may be NULL, otherwise it must be locked too
 * (both are asserted).  The caller must not hold the segment lock.
 *
 * Visible steps:
 *  - reserve LFS_NRESERVE(fs) blocks with lfs_reserve();
 *  - with lfs_lock held, if no dirops are in progress, drop the lock
 *    around a call to lfs_check();
 *  - while fs->lfs_writer is set, sleep on &fs->lfs_dirops (interruptible);
 *  - if lfs_dirvcount exceeds LFS_MAX_DIROP and no dirops are in progress,
 *    wake the writer daemon;
 *  - if lfs_dirvcount still exceeds LFS_MAX_DIROP, sleep on &lfs_dirvcount;
 *  - the last visible statement gives the reservation back with a negative
 *    lfs_reserve() count.
 *
 * NOTE(review): several source lines are missing from this extract (the
 * local declarations, the dirop counter update, the vnode marking/reference
 * calls, labels and returns).  The closing negative lfs_reserve() is
 * presumably the error-path cleanup -- confirm against the complete file.
 *
 * NOTE(review): in the "lfs_dirvcount > LFS_MAX_DIROP" branch, lfs_lock is
 * released immediately before an mtsleep() that is called with PNORELOCK.
 * The interlock argument of that mtsleep() is not visible here; if it is
 * &lfs_lock, the sleep is entered with the interlock already dropped.
 * Verify.
 */
388 lfs_set_dirop(struct vnode
*dvp
, struct vnode
*vp
)
393 KASSERT(VOP_ISLOCKED(dvp
));
394 KASSERT(vp
== NULL
|| VOP_ISLOCKED(vp
));
396 fs
= VTOI(dvp
)->i_lfs
;
398 ASSERT_NO_SEGLOCK(fs
);
400 * LFS_NRESERVE calculates direct and indirect blocks as well
401 * as an inode block; an overestimate in most cases.
403 if ((error
= lfs_reserve(fs
, dvp
, vp
, LFS_NRESERVE(fs
))) != 0)
407 mutex_enter(&lfs_lock
);
408 if (fs
->lfs_dirops
== 0) {
409 mutex_exit(&lfs_lock
);
410 lfs_check(dvp
, LFS_UNUSED_LBN
, 0);
411 mutex_enter(&lfs_lock
);
413 while (fs
->lfs_writer
) {
414 error
= mtsleep(&fs
->lfs_dirops
, (PRIBIO
+ 1) | PCATCH
,
415 "lfs_sdirop", 0, &lfs_lock
);
416 if (error
== EINTR
) {
417 mutex_exit(&lfs_lock
);
421 if (lfs_dirvcount
> LFS_MAX_DIROP
&& fs
->lfs_dirops
== 0) {
422 wakeup(&lfs_writer_daemon
);
423 mutex_exit(&lfs_lock
);
428 if (lfs_dirvcount
> LFS_MAX_DIROP
) {
429 mutex_exit(&lfs_lock
);
430 DLOG((DLOG_DIROP
, "lfs_set_dirop: sleeping with dirops=%d, "
431 "dirvcount=%d\n", fs
->lfs_dirops
, lfs_dirvcount
));
432 if ((error
= mtsleep(&lfs_dirvcount
,
433 PCATCH
| PUSER
| PNORELOCK
, "lfs_maxdirop", 0,
442 mutex_exit(&lfs_lock
);
444 /* Hold a reference so SET_ENDOP will be happy */
455 lfs_reserve(fs
, dvp
, vp
, -LFS_NRESERVE(fs
));
460 * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
461 * in getnewvnode(), if we have a stacked filesystem mounted on top
464 * NB: this means we have to clear the new vnodes on error. Fortunately
465 * SET_ENDOP is there to do that for us.
/*
 * lfs_set_dirop_create: begin a directory operation that may create a new
 * vnode.
 *
 * dvp is the directory; vpp is where the new vnode is stored.  The caller
 * must not hold the segment lock.  A fresh VT_LFS vnode using lfs_vnodeop_p
 * is obtained with getnewvnode() on dvp's mount, and the dirop is then
 * started with lfs_set_dirop(dvp, NULL).
 *
 * NOTE(review): the condition guarding the first
 * "return lfs_set_dirop(dvp, NULL)" is not present in this extract.  The
 * link path in this file passes vpp == NULL, so it is presumably the
 * NULL-vpp case -- confirm.  The error handling after getnewvnode() and
 * after the second lfs_set_dirop() call is likewise not shown.
 */
468 lfs_set_dirop_create(struct vnode
*dvp
, struct vnode
**vpp
)
473 fs
= VFSTOUFS(dvp
->v_mount
)->um_lfs
;
474 ASSERT_NO_SEGLOCK(fs
);
478 return lfs_set_dirop(dvp
, NULL
);
480 error
= getnewvnode(VT_LFS
, dvp
->v_mount
, lfs_vnodeop_p
, NULL
, vpp
);
482 DLOG((DLOG_ALLOC
, "lfs_set_dirop_create: dvp %p error %d\n",
486 if ((error
= lfs_set_dirop(dvp
, NULL
)) != 0) {
494 #define SET_ENDOP_BASE(fs, dvp, str) \
496 mutex_enter(&lfs_lock); \
497 --(fs)->lfs_dirops; \
498 if (!(fs)->lfs_dirops) { \
499 if ((fs)->lfs_nadirop) { \
500 panic("SET_ENDOP: %s: no dirops but " \
501 " nadirop=%d", (str), \
502 (fs)->lfs_nadirop); \
504 wakeup(&(fs)->lfs_writer); \
505 mutex_exit(&lfs_lock); \
506 lfs_check((dvp), LFS_UNUSED_LBN, 0); \
508 mutex_exit(&lfs_lock); \
510 #define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \
514 UNMARK_VNODE(*nvpp); \
515 /* Check for error return to stem vnode leakage */ \
516 if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP)) \
517 ungetnewvnode(*(nvpp)); \
518 SET_ENDOP_BASE((fs), (dvp), (str)); \
519 lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \
522 #define SET_ENDOP_CREATE_AP(ap, str) \
523 SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \
525 #define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \
530 SET_ENDOP_BASE((fs), (dvp), (str)); \
531 lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \
/*
 * lfs_mark_vnode: mark vp as taking part in an active directory operation.
 *
 * All work is done with lfs_lock held.  If the inode is not yet flagged
 * IN_ADIROP:
 *  - when the vnode is not yet VU_DIROP, the inode is appended to the
 *    per-filesystem dirop chain (fs->lfs_dchainhd) and VU_DIROP is set;
 *  - IN_ADIROP is then set on the inode.
 * The KASSERT checks that VU_DIROP is set.
 *
 * NOTE(review): this extract omits the lines between taking v_interlock
 * and the queue insertion, as well as any counter updates and the branch
 * structure around the KASSERT -- consult the complete file for those.
 */
538 lfs_mark_vnode(struct vnode
*vp
)
540 struct inode
*ip
= VTOI(vp
);
541 struct lfs
*fs
= ip
->i_lfs
;
543 mutex_enter(&lfs_lock
);
544 if (!(ip
->i_flag
& IN_ADIROP
)) {
545 if (!(vp
->v_uflag
& VU_DIROP
)) {
546 mutex_enter(vp
->v_interlock
);
550 TAILQ_INSERT_TAIL(&fs
->lfs_dchainhd
, ip
, i_lfs_dchain
);
551 vp
->v_uflag
|= VU_DIROP
;
554 ip
->i_flag
|= IN_ADIROP
;
556 KASSERT(vp
->v_uflag
& VU_DIROP
);
557 mutex_exit(&lfs_lock
);
/*
 * lfs_unmark_vnode: drop the "active dirop" mark from vp.
 *
 * Does nothing when the vnode has no inode or the inode is not flagged
 * IN_ADIROP.  Otherwise it asserts that VU_DIROP is set, decrements the
 * filesystem's lfs_nadirop count under lfs_lock, and clears IN_ADIROP.
 * Note that IN_ADIROP itself is cleared after lfs_lock has been released.
 * No visible statement here clears VU_DIROP.
 */
561 lfs_unmark_vnode(struct vnode
*vp
)
563 struct inode
*ip
= VTOI(vp
);
565 if (ip
&& (ip
->i_flag
& IN_ADIROP
)) {
566 KASSERT(vp
->v_uflag
& VU_DIROP
);
567 mutex_enter(&lfs_lock
);
568 --ip
->i_lfs
->lfs_nadirop
;
569 mutex_exit(&lfs_lock
);
570 ip
->i_flag
&= ~IN_ADIROP
;
577 struct vop_symlink_args
/* {
579 struct vnode **a_vpp;
580 struct componentname *a_cnp;
586 if ((error
= SET_DIROP_CREATE(ap
->a_dvp
, ap
->a_vpp
)) != 0) {
590 error
= ufs_symlink(ap
);
591 SET_ENDOP_CREATE_AP(ap
, "symlink");
598 struct vop_mknod_args
/* {
600 struct vnode **a_vpp;
601 struct componentname *a_cnp;
604 struct vattr
*vap
= ap
->a_vap
;
605 struct vnode
**vpp
= ap
->a_vpp
;
610 struct ufs_lookup_results
*ulr
;
612 /* XXX should handle this material another way */
613 ulr
= &VTOI(ap
->a_dvp
)->i_crap
;
614 UFS_CHECK_CRAPCOUNTER(VTOI(ap
->a_dvp
));
616 if ((error
= SET_DIROP_CREATE(ap
->a_dvp
, ap
->a_vpp
)) != 0) {
620 error
= ufs_makeinode(MAKEIMODE(vap
->va_type
, vap
->va_mode
),
621 ap
->a_dvp
, ulr
, vpp
, ap
->a_cnp
);
623 /* Either way we're done with the dirop at this point */
624 SET_ENDOP_CREATE_AP(ap
, "mknod");
630 mp
= (*vpp
)->v_mount
;
632 ip
->i_flag
|= IN_ACCESS
| IN_CHANGE
| IN_UPDATE
;
633 if (vap
->va_rdev
!= VNOVAL
) {
635 * Want to be able to use this to make badblock
636 * inodes, so don't truncate the dev number.
639 ip
->i_ffs1_rdev
= ufs_rw32(vap
->va_rdev
,
640 UFS_MPNEEDSWAP((*vpp
)->v_mount
));
642 ip
->i_ffs1_rdev
= vap
->va_rdev
;
647 * Call fsync to write the vnode so that we don't have to deal with
648 * flushing it when it's marked VU_DIROP|VI_XLOCK.
650 * XXX KS - If we can't flush we also can't call vgone(), so must
651 * return. But, that leaves this vnode in limbo, also not good.
652 * Can this ever happen (barring hardware failure)?
654 if ((error
= VOP_FSYNC(*vpp
, NOCRED
, FSYNC_WAIT
, 0, 0)) != 0) {
655 panic("lfs_mknod: couldn't fsync (ino %llu)",
656 (unsigned long long)ino
);
657 /* return (error); */
660 * Remove vnode so that it will be reloaded by VFS_VGET and
661 * checked to see if it is an alias of an existing entry in
664 /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
667 (*vpp
)->v_type
= VNON
;
669 error
= VFS_VGET(mp
, ino
, vpp
);
681 struct vop_create_args
/* {
683 struct vnode **a_vpp;
684 struct componentname *a_cnp;
689 if ((error
= SET_DIROP_CREATE(ap
->a_dvp
, ap
->a_vpp
)) != 0) {
693 error
= ufs_create(ap
);
694 SET_ENDOP_CREATE_AP(ap
, "create");
701 struct vop_mkdir_args
/* {
703 struct vnode **a_vpp;
704 struct componentname *a_cnp;
709 if ((error
= SET_DIROP_CREATE(ap
->a_dvp
, ap
->a_vpp
)) != 0) {
713 error
= ufs_mkdir(ap
);
714 SET_ENDOP_CREATE_AP(ap
, "mkdir");
721 struct vop_remove_args
/* {
724 struct componentname *a_cnp;
726 struct vnode
*dvp
, *vp
;
733 if ((error
= SET_DIROP_REMOVE(dvp
, vp
)) != 0) {
741 error
= ufs_remove(ap
);
742 if (ip
->i_nlink
== 0)
743 lfs_orphan(ip
->i_lfs
, ip
->i_number
);
744 SET_ENDOP_REMOVE(ip
->i_lfs
, dvp
, ap
->a_vp
, "remove");
751 struct vop_rmdir_args
/* {
752 struct vnodeop_desc *a_desc;
755 struct componentname *a_cnp;
763 if ((error
= SET_DIROP_REMOVE(ap
->a_dvp
, ap
->a_vp
)) != 0) {
771 error
= ufs_rmdir(ap
);
772 if (ip
->i_nlink
== 0)
773 lfs_orphan(ip
->i_lfs
, ip
->i_number
);
774 SET_ENDOP_REMOVE(ip
->i_lfs
, ap
->a_dvp
, ap
->a_vp
, "rmdir");
781 struct vop_link_args
/* {
784 struct componentname *a_cnp;
787 struct vnode
**vpp
= NULL
;
789 if ((error
= SET_DIROP_CREATE(ap
->a_dvp
, vpp
)) != 0) {
793 error
= ufs_link(ap
);
794 SET_ENDOP_CREATE(VTOI(ap
->a_dvp
)->i_lfs
, ap
->a_dvp
, vpp
, "link");
801 struct vop_rename_args
/* {
802 struct vnode *a_fdvp;
804 struct componentname *a_fcnp;
805 struct vnode *a_tdvp;
807 struct componentname *a_tcnp;
809 struct vnode
*tvp
, *fvp
, *tdvp
, *fdvp
;
810 struct componentname
*tcnp
, *fcnp
;
814 fs
= VTOI(ap
->a_fdvp
)->i_lfs
;
823 * Check for cross-device rename.
824 * If it is, we don't want to set dirops, just error out.
825 * (In particular note that MARK_VNODE(tdvp) will do the wrong thing on
826 * a cross-device rename.)
828 * Copied from ufs_rename.
830 if ((fvp
->v_mount
!= tdvp
->v_mount
) ||
831 (tvp
&& (fvp
->v_mount
!= tvp
->v_mount
))) {
837 * Check to make sure we're not renaming a vnode onto itself
838 * (deleting a hard link by renaming one name onto another);
839 * if we are we can't recursively call VOP_REMOVE since that
840 * would leave us with an unaccounted-for number of live dirops.
842 * Inline the relevant section of ufs_rename here, *before*
843 * calling SET_DIROP_REMOVE.
845 if (tvp
&& ((VTOI(tvp
)->i_flags
& (IMMUTABLE
| APPEND
)) ||
846 (VTOI(tdvp
)->i_flags
& APPEND
))) {
851 if (fvp
->v_type
== VDIR
) {
856 /* Release destination completely. */
857 VOP_ABORTOP(tdvp
, tcnp
);
863 fcnp
->cn_flags
&= ~(MODMASK
);
864 fcnp
->cn_flags
|= LOCKPARENT
| LOCKLEAF
;
865 fcnp
->cn_nameiop
= DELETE
;
866 vn_lock(fdvp
, LK_EXCLUSIVE
| LK_RETRY
);
867 if ((error
= relookup(fdvp
, &fvp
, fcnp
, 0))) {
871 return (VOP_REMOVE(fdvp
, fvp
, fcnp
));
874 if ((error
= SET_DIROP_REMOVE(tdvp
, tvp
)) != 0)
879 error
= ufs_rename(ap
);
882 SET_ENDOP_REMOVE(fs
, tdvp
, tvp
, "rename");
886 VOP_ABORTOP(tdvp
, ap
->a_tcnp
); /* XXX, why not in NFS? */
893 VOP_ABORTOP(fdvp
, ap
->a_fcnp
); /* XXX, why not in NFS? */
899 /* XXX hack to avoid calling ITIMES in getattr */
903 struct vop_getattr_args
/* {
908 struct vnode
*vp
= ap
->a_vp
;
909 struct inode
*ip
= VTOI(vp
);
910 struct vattr
*vap
= ap
->a_vap
;
911 struct lfs
*fs
= ip
->i_lfs
;
913 * Copy from inode table
915 vap
->va_fsid
= ip
->i_dev
;
916 vap
->va_fileid
= ip
->i_number
;
917 vap
->va_mode
= ip
->i_mode
& ~IFMT
;
918 vap
->va_nlink
= ip
->i_nlink
;
919 vap
->va_uid
= ip
->i_uid
;
920 vap
->va_gid
= ip
->i_gid
;
921 vap
->va_rdev
= (dev_t
)ip
->i_ffs1_rdev
;
922 vap
->va_size
= vp
->v_size
;
923 vap
->va_atime
.tv_sec
= ip
->i_ffs1_atime
;
924 vap
->va_atime
.tv_nsec
= ip
->i_ffs1_atimensec
;
925 vap
->va_mtime
.tv_sec
= ip
->i_ffs1_mtime
;
926 vap
->va_mtime
.tv_nsec
= ip
->i_ffs1_mtimensec
;
927 vap
->va_ctime
.tv_sec
= ip
->i_ffs1_ctime
;
928 vap
->va_ctime
.tv_nsec
= ip
->i_ffs1_ctimensec
;
929 vap
->va_flags
= ip
->i_flags
;
930 vap
->va_gen
= ip
->i_gen
;
931 /* this doesn't belong here */
932 if (vp
->v_type
== VBLK
)
933 vap
->va_blocksize
= BLKDEV_IOSIZE
;
934 else if (vp
->v_type
== VCHR
)
935 vap
->va_blocksize
= MAXBSIZE
;
937 vap
->va_blocksize
= vp
->v_mount
->mnt_stat
.f_iosize
;
938 vap
->va_bytes
= fsbtob(fs
, (u_quad_t
)ip
->i_lfs_effnblks
);
939 vap
->va_type
= vp
->v_type
;
940 vap
->va_filerev
= ip
->i_modrev
;
945 * Check to make sure the inode blocks won't choke the buffer
946 * cache, then call ufs_setattr as usual.
951 struct vop_setattr_args
/* {
956 struct vnode
*vp
= ap
->a_vp
;
958 lfs_check(vp
, LFS_UNUSED_LBN
, 0);
959 return ufs_setattr(v
);
963 * Release the block we hold on lfs_newseg wrapping. Called on file close,
964 * or explicitly from LFCNWRAPGO. Called with the interlock held.
/*
 * lfs_wrapgo: give up this LWP's hold on log wrapping for fs.
 *
 * Only the LWP recorded in fs->lfs_stoplwp may release the hold; the
 * ownership test is the first statement.  The owner field is then cleared
 * and fs->lfs_stopcv is signalled.  fs->lfs_nowrap is decremented, and when
 * it reaches zero a notice is logged, sleepers on &fs->lfs_wrappass are
 * woken and the cleaner is woken with lfs_wakeup_cleaner().  The last
 * visible statement sleeps on &fs->lfs_nextseg.
 *
 * NOTE(review): the return statements, the body of the
 * "lfs_nowrap <= 0" branch, the remaining mtsleep() arguments and any use
 * of `ip` are not present in this extract.  The final sleep is presumably
 * conditional on `waitfor` -- confirm against the complete file.
 */
967 lfs_wrapgo(struct lfs
*fs
, struct inode
*ip
, int waitfor
)
969 if (fs
->lfs_stoplwp
!= curlwp
)
972 fs
->lfs_stoplwp
= NULL
;
973 cv_signal(&fs
->lfs_stopcv
);
975 KASSERT(fs
->lfs_nowrap
> 0);
976 if (fs
->lfs_nowrap
<= 0) {
980 if (--fs
->lfs_nowrap
== 0) {
981 log(LOG_NOTICE
, "%s: re-enabled log wrap\n", fs
->lfs_fsmnt
);
982 wakeup(&fs
->lfs_wrappass
);
983 lfs_wakeup_cleaner(fs
);
986 mtsleep(&fs
->lfs_nextseg
, PCATCH
| PUSER
, "segment",
1000 struct vop_close_args
/* {
1003 kauth_cred_t a_cred;
1005 struct vnode
*vp
= ap
->a_vp
;
1006 struct inode
*ip
= VTOI(vp
);
1007 struct lfs
*fs
= ip
->i_lfs
;
1009 if ((ip
->i_number
== ROOTINO
|| ip
->i_number
== LFS_IFILE_INUM
) &&
1010 fs
->lfs_stoplwp
== curlwp
) {
1011 mutex_enter(&lfs_lock
);
1012 log(LOG_NOTICE
, "lfs_close: releasing log wrap control\n");
1013 lfs_wrapgo(fs
, ip
, 0);
1014 mutex_exit(&lfs_lock
);
1017 if (vp
== ip
->i_lfs
->lfs_ivnode
&&
1018 vp
->v_mount
->mnt_iflag
& IMNT_UNMOUNT
)
1021 if (vp
->v_usecount
> 1 && vp
!= ip
->i_lfs
->lfs_ivnode
) {
1022 LFS_ITIMES(ip
, NULL
, NULL
, NULL
);
1028 * Close wrapper for special devices.
1030 * Update the times on the inode then do device close.
/*
 * lfsspec_close: the "close" operation for LFS special-device vnodes
 * (installed in lfs_specop_entries).
 *
 * When the vnode has more than one use reference, the inode timestamps are
 * brought up to date with LFS_ITIMES().  The call is then forwarded to the
 * close entry of spec_vnodeop_p, and its result is returned.
 *
 * NOTE(review): the declarations and initialisation of `ap`, `vp` and `ip`
 * are not present in this extract.
 */
1033 lfsspec_close(void *v
)
1035 struct vop_close_args
/* {
1038 kauth_cred_t a_cred;
1045 if (vp
->v_usecount
> 1) {
1046 LFS_ITIMES(ip
, NULL
, NULL
, NULL
);
1048 return (VOCALL (spec_vnodeop_p
, VOFFSET(vop_close
), ap
));
1052 * Close wrapper for fifos.
1054 * Update the times on the inode, then do the fifo close.
/*
 * lfsfifo_close: the "close" operation for LFS fifo vnodes (installed in
 * lfs_fifoop_entries).
 *
 * When the vnode has more than one use reference, the inode timestamps are
 * brought up to date with LFS_ITIMES().  The call is then forwarded to the
 * close entry of fifo_vnodeop_p, and its result is returned.
 *
 * NOTE(review): the declarations and initialisation of `ap` and `ip` are
 * not present in this extract.
 */
1057 lfsfifo_close(void *v
)
1059 struct vop_close_args
/* {
1069 if (ap
->a_vp
->v_usecount
> 1) {
1070 LFS_ITIMES(ip
, NULL
, NULL
, NULL
);
1072 return (VOCALL (fifo_vnodeop_p
, VOFFSET(vop_close
), ap
));
1076 * Reclaim an inode so that it can be used for other purposes.
/*
 * lfs_reclaim: the "reclaim" operation for LFS vnodes (installed in all
 * three operation tables in this file).
 *
 * Visible steps, in order:
 *  - if the link count is <= 0 and the mount is not read-only, free the
 *    on-disk inode with lfs_vfree();
 *  - clear the IN_ALLMOD bits under lfs_lock;
 *  - call ufs_reclaim(); its result is tested for failure;
 *  - under lfs_lock, take the inode off the paging chain if it is still
 *    flagged IN_PAGING (logging a warning), and panic if the vnode is still
 *    VU_DIROP;
 *  - return the dinode, the LFS inode extension and the in-core inode to
 *    their pools, deregistering the vnode and destroying its genfs node on
 *    the way.
 *
 * NOTE(review): IN_PAGING is tested and cleared in ip->i_flags here, while
 * the other IN_* bits in this file (IN_ADIROP, IN_ALLMOD) are kept in
 * ip->i_flag, and i_flags is elsewhere tested against IMMUTABLE/APPEND.
 * Confirm that i_flags is really the intended field.
 *
 * NOTE(review): the statements following panic() in the VU_DIROP branch
 * look unreachable, assuming panic() does not return.  The error-return
 * statement after ufs_reclaim(), the rest of the log() call and the final
 * return are not present in this extract.
 */
1080 lfs_reclaim(void *v
)
1082 struct vop_reclaim_args
/* {
1085 struct vnode
*vp
= ap
->a_vp
;
1086 struct inode
*ip
= VTOI(vp
);
1087 struct lfs
*fs
= ip
->i_lfs
;
1091 * The inode must be freed and updated before being removed
1092 * from its hash chain. Other threads trying to gain a hold
1093 * on the inode will be stalled because it is locked (VI_XLOCK).
1095 if (ip
->i_nlink
<= 0 && (vp
->v_mount
->mnt_flag
& MNT_RDONLY
) == 0)
1096 lfs_vfree(vp
, ip
->i_number
, ip
->i_omode
);
1098 mutex_enter(&lfs_lock
);
1099 LFS_CLR_UINO(ip
, IN_ALLMOD
);
1100 mutex_exit(&lfs_lock
);
1101 if ((error
= ufs_reclaim(vp
)))
1105 * Take us off the paging and/or dirop queues if we were on them.
1106 * We shouldn't be on them.
1108 mutex_enter(&lfs_lock
);
1109 if (ip
->i_flags
& IN_PAGING
) {
1110 log(LOG_WARNING
, "%s: reclaimed vnode is IN_PAGING\n",
1112 ip
->i_flags
&= ~IN_PAGING
;
1113 TAILQ_REMOVE(&fs
->lfs_pchainhd
, ip
, i_lfs_pchain
);
1115 if (vp
->v_uflag
& VU_DIROP
) {
1116 panic("reclaimed vnode is VU_DIROP");
1117 vp
->v_uflag
&= ~VU_DIROP
;
1118 TAILQ_REMOVE(&fs
->lfs_dchainhd
, ip
, i_lfs_dchain
);
1120 mutex_exit(&lfs_lock
);
1122 pool_put(&lfs_dinode_pool
, ip
->i_din
.ffs1_din
);
1123 lfs_deregister_all(vp
);
1124 pool_put(&lfs_inoext_pool
, ip
->inode_ext
.lfs
);
1125 ip
->inode_ext
.lfs
= NULL
;
1126 genfs_node_destroy(vp
);
1127 pool_put(&lfs_inode_pool
, vp
->v_data
);
1133 * Read a block from a storage device.
1134 * In order to avoid reading blocks that are in the process of being
1135 * written by the cleaner---and hence are not mutexed by the normal
1136 * buffer cache / page cache mechanisms---check for collisions before
1139 * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
1140 * the active cleaner test.
1142 * XXX This code assumes that lfs_markv makes synchronous checkpoints.
/*
 * lfs_strategy: the "strategy" operation for regular LFS vnodes (installed
 * in lfs_vnodeop_entries).  Only reads are expected: the buffer must have
 * B_READ set and a non-zero b_bcount, and block/character device vnodes
 * cause a panic.
 *
 * Visible steps:
 *  - if b_blkno still equals b_lblkno, translate it with VOP_BMAP(); the
 *    following statements store the error in b_error and set b_resid to
 *    b_bcount;
 *  - a b_blkno of -1 is treated as "no valid data" and any other negative
 *    value as "block is not on disk";
 *  - while `slept` is set and the segment lock is held, the block's
 *    filesystem address (tbn) and segment number (sn) are compared against
 *    the first lfs_cleanind entries of fs->lfs_cleanint[].  A hit is an
 *    entry in the same segment whose address is <= tbn.  On a hit the
 *    routine sleeps on &fs->lfs_iocount when LFS_SEGLOCK_HELD(fs) is true
 *    and I/O is outstanding, otherwise on &fs->lfs_seglock while the
 *    segment lock is held;
 *  - finally the buffer is passed to VOP_STRATEGY().
 *
 * NOTE(review): many source lines are missing from this extract: the
 * declarations of ap/bp/vp/ip/fs/tbn, the initial value and updates of
 * `slept`, the early returns, the DLOG() openers and the remaining
 * mtsleep() arguments.  In particular the line that selects the vnode
 * handed to the final VOP_STRATEGY() is not shown; presumably it is the
 * underlying device vnode -- confirm against the complete file.
 */
1145 lfs_strategy(void *v
)
1147 struct vop_strategy_args
/* {
1156 int i
, sn
, error
, slept
;
1163 /* lfs uses its strategy routine only for read */
1164 KASSERT(bp
->b_flags
& B_READ
);
1166 if (vp
->v_type
== VBLK
|| vp
->v_type
== VCHR
)
1167 panic("lfs_strategy: spec");
1168 KASSERT(bp
->b_bcount
!= 0);
1169 if (bp
->b_blkno
== bp
->b_lblkno
) {
1170 error
= VOP_BMAP(vp
, bp
->b_lblkno
, NULL
, &bp
->b_blkno
,
1173 bp
->b_error
= error
;
1174 bp
->b_resid
= bp
->b_bcount
;
1178 if ((long)bp
->b_blkno
== -1) /* no valid data */
1181 if ((long)bp
->b_blkno
< 0) { /* block is not on disk */
1182 bp
->b_resid
= bp
->b_bcount
;
1188 mutex_enter(&lfs_lock
);
1189 while (slept
&& fs
->lfs_seglock
) {
1190 mutex_exit(&lfs_lock
);
1192 * Look through list of intervals.
1193 * There will only be intervals to look through
1194 * if the cleaner holds the seglock.
1195 * Since the cleaner is synchronous, we can trust
1196 * the list of intervals to be current.
1198 tbn
= dbtofsb(fs
, bp
->b_blkno
);
1199 sn
= dtosn(fs
, tbn
);
1201 for (i
= 0; i
< fs
->lfs_cleanind
; i
++) {
1202 if (sn
== dtosn(fs
, fs
->lfs_cleanint
[i
]) &&
1203 tbn
>= fs
->lfs_cleanint
[i
]) {
1205 "lfs_strategy: ino %d lbn %" PRId64
1206 " ind %d sn %d fsb %" PRIx32
1207 " given sn %d fsb %" PRIx64
"\n",
1208 ip
->i_number
, bp
->b_lblkno
, i
,
1209 dtosn(fs
, fs
->lfs_cleanint
[i
]),
1210 fs
->lfs_cleanint
[i
], sn
, tbn
));
1212 "lfs_strategy: sleeping on ino %d lbn %"
1213 PRId64
"\n", ip
->i_number
, bp
->b_lblkno
));
1214 mutex_enter(&lfs_lock
);
1215 if (LFS_SEGLOCK_HELD(fs
) && fs
->lfs_iocount
) {
1216 /* Cleaner can't wait for itself */
1217 mtsleep(&fs
->lfs_iocount
,
1218 (PRIBIO
+ 1) | PNORELOCK
,
1223 } else if (fs
->lfs_seglock
) {
1224 mtsleep(&fs
->lfs_seglock
,
1225 (PRIBIO
+ 1) | PNORELOCK
,
1231 mutex_exit(&lfs_lock
);
1234 mutex_enter(&lfs_lock
);
1236 mutex_exit(&lfs_lock
);
1239 VOP_STRATEGY(vp
, bp
);
/*
 * lfs_flush_dirops: write out the inodes queued on the filesystem's dirop
 * chain (fs->lfs_dchainhd).
 *
 * The caller must have no directory operation active on fs
 * (fs->lfs_nadirop == 0 is asserted).  The chain is first checked for
 * emptiness under lfs_lock.  The flush counter lfs_stats.flush_invoked is
 * incremented and the segment lock is taken with SEGM_CKP.
 *
 * The chain is then walked with the successor fetched before lfs_lock is
 * dropped for each entry.  For each inode (which must not be IN_ADIROP):
 *  - non-regular files with IN_ALLMOD set or a non-empty VPISEMPTY() state
 *    are written with lfs_writefile(); if data remains afterwards, no write
 *    is in progress and IN_ALLMOD is clear, IN_MODIFIED is set again under
 *    lfs_lock;
 *  - the inode itself is written with lfs_writeinode() (never the Ifile,
 *    per the KDASSERT).
 * After the walk, SS_CONT is cleared in the current segment summary,
 * lfs_finalize_fs_seguse() is called and the segment is written with
 * lfs_writeseg().
 *
 * NOTE(review): this extract omits the declarations and assignments of
 * `vp` and `sp`, the early return for an empty chain, the test that
 * presumably guards the statistics update (lfs_dostats is declared but its
 * use is not shown), the body of the VI_XLOCK branch and the release of
 * the segment lock -- consult the complete file for those.
 */
1244 lfs_flush_dirops(struct lfs
*fs
)
1246 struct inode
*ip
, *nip
;
1248 extern int lfs_dostats
;
1251 ASSERT_MAYBE_SEGLOCK(fs
);
1252 KASSERT(fs
->lfs_nadirop
== 0);
1257 mutex_enter(&lfs_lock
);
1258 if (TAILQ_FIRST(&fs
->lfs_dchainhd
) == NULL
) {
1259 mutex_exit(&lfs_lock
);
1262 mutex_exit(&lfs_lock
);
1265 ++lfs_stats
.flush_invoked
;
1268 * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
1269 * Technically this is a checkpoint (the on-disk state is valid)
1270 * even though we are leaving out all the file data.
1273 lfs_seglock(fs
, SEGM_CKP
);
1277 * lfs_writevnodes, optimized to get dirops out of the way.
1278 * Only write dirops, and don't flush files' pages, only
1279 * blocks from the directories.
1281 * We don't need to vref these files because they are
1282 * dirops and so hold an extra reference until the
1283 * segunlock clears them of that status.
1285 * We don't need to check for IN_ADIROP because we know that
1286 * no dirops are active.
1289 mutex_enter(&lfs_lock
);
1290 for (ip
= TAILQ_FIRST(&fs
->lfs_dchainhd
); ip
!= NULL
; ip
= nip
) {
1291 nip
= TAILQ_NEXT(ip
, i_lfs_dchain
);
1292 mutex_exit(&lfs_lock
);
1295 KASSERT((ip
->i_flag
& IN_ADIROP
) == 0);
1298 * All writes to directories come from dirops; all
1299 * writes to files' direct blocks go through the page
1300 * cache, which we're not touching. Reads to files
1301 * and/or directories will not be affected by writing
1302 * directory blocks inodes and file inodes. So we don't
1303 * really need to lock. If we don't lock, though,
1304 * make sure that we don't clear IN_MODIFIED
1307 if (vp
->v_iflag
& VI_XLOCK
) {
1308 mutex_enter(&lfs_lock
);
1312 * waslocked = VOP_ISLOCKED(vp);
1314 if (vp
->v_type
!= VREG
&&
1315 ((ip
->i_flag
& IN_ALLMOD
) || !VPISEMPTY(vp
))) {
1316 lfs_writefile(fs
, sp
, vp
);
1317 if (!VPISEMPTY(vp
) && !WRITEINPROG(vp
) &&
1318 !(ip
->i_flag
& IN_ALLMOD
)) {
1319 mutex_enter(&lfs_lock
);
1320 LFS_SET_UINO(ip
, IN_MODIFIED
);
1321 mutex_exit(&lfs_lock
);
1324 KDASSERT(ip
->i_number
!= LFS_IFILE_INUM
);
1325 (void) lfs_writeinode(fs
, sp
, ip
);
1326 mutex_enter(&lfs_lock
);
1329 * LK_EXCLOTHER is dead -- what is intended here?
1330 * if (waslocked == LK_EXCLOTHER)
1331 * LFS_SET_UINO(ip, IN_MODIFIED);
1334 mutex_exit(&lfs_lock
);
1335 /* We've written all the dirops there are */
1336 ((SEGSUM
*)(sp
->segsum
))->ss_flags
&= ~(SS_CONT
);
1337 lfs_finalize_fs_seguse(fs
);
1338 (void) lfs_writeseg(fs
, sp
);
1343 * Flush all vnodes for which the pagedaemon has requested pageouts.
1344 * Skip over any files that are marked VU_DIROP (since lfs_flush_dirops()
1345 * has just run, this would be an error). If we have to skip a vnode
1346 * for any reason, just skip it; if we have to wait for the cleaner,
1347 * abort. The writer daemon will call us again later.
1350 lfs_flush_pchain(struct lfs
*fs
)
1352 struct inode
*ip
, *nip
;
1354 extern int lfs_dostats
;
1358 ASSERT_NO_SEGLOCK(fs
);
1363 mutex_enter(&lfs_lock
);
1364 if (TAILQ_FIRST(&fs
->lfs_pchainhd
) == NULL
) {
1365 mutex_exit(&lfs_lock
);
1368 mutex_exit(&lfs_lock
);
1370 /* Get dirops out of the way */
1371 lfs_flush_dirops(fs
);
1374 ++lfs_stats
.flush_invoked
;
1377 * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
1384 * lfs_writevnodes, optimized to clear pageout requests.
1385 * Only write non-dirop files that are in the pageout queue.
1386 * We're very conservative about what we write; we want to be
1389 mutex_enter(&lfs_lock
);
1391 for (ip
= TAILQ_FIRST(&fs
->lfs_pchainhd
); ip
!= NULL
; ip
= nip
) {
1392 nip
= TAILQ_NEXT(ip
, i_lfs_pchain
);
1395 if (!(ip
->i_flags
& IN_PAGING
))
1398 mutex_enter(vp
->v_interlock
);
1399 if ((vp
->v_iflag
& VI_XLOCK
) || (vp
->v_uflag
& VU_DIROP
) != 0) {
1400 mutex_exit(vp
->v_interlock
);
1403 if (vp
->v_type
!= VREG
) {
1404 mutex_exit(vp
->v_interlock
);
1409 mutex_exit(&lfs_lock
);
1411 if (vn_lock(vp
, LK_EXCLUSIVE
| LK_NOWAIT
| LK_RETRY
) != 0) {
1413 mutex_enter(&lfs_lock
);
1417 error
= lfs_writefile(fs
, sp
, vp
);
1418 if (!VPISEMPTY(vp
) && !WRITEINPROG(vp
) &&
1419 !(ip
->i_flag
& IN_ALLMOD
)) {
1420 mutex_enter(&lfs_lock
);
1421 LFS_SET_UINO(ip
, IN_MODIFIED
);
1422 mutex_exit(&lfs_lock
);
1424 KDASSERT(ip
->i_number
!= LFS_IFILE_INUM
);
1425 (void) lfs_writeinode(fs
, sp
, ip
);
1430 if (error
== EAGAIN
) {
1431 lfs_writeseg(fs
, sp
);
1432 mutex_enter(&lfs_lock
);
1435 mutex_enter(&lfs_lock
);
1437 mutex_exit(&lfs_lock
);
1438 (void) lfs_writeseg(fs
, sp
);
1443 * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
1448 struct vop_fcntl_args
/* {
1453 kauth_cred_t a_cred;
1456 struct timeval
*tvp
;
1460 int blkcnt
, error
, oclean
;
1462 struct lfs_fcntl_markv blkvp
;
1470 /* Only respect LFS fcntls on fs root or Ifile */
1471 if (VTOI(ap
->a_vp
)->i_number
!= ROOTINO
&&
1472 VTOI(ap
->a_vp
)->i_number
!= LFS_IFILE_INUM
) {
1473 return ufs_fcntl(v
);
1476 /* Avoid locking a draining lock */
1477 if (ap
->a_vp
->v_mount
->mnt_iflag
& IMNT_UNMOUNT
) {
1481 /* LFS control and monitoring fcntls are available only to root */
1483 if (((ap
->a_command
& 0xff00) >> 8) == 'L' &&
1484 (error
= kauth_authorize_generic(l
->l_cred
, KAUTH_GENERIC_ISSUSER
,
1488 fs
= VTOI(ap
->a_vp
)->i_lfs
;
1489 fsidp
= &ap
->a_vp
->v_mount
->mnt_stat
.f_fsidx
;
1492 switch ((int)ap
->a_command
) {
1493 case LFCNSEGWAITALL_COMPAT_50
:
1494 case LFCNSEGWAITALL_COMPAT
:
1497 case LFCNSEGWAIT_COMPAT_50
:
1498 case LFCNSEGWAIT_COMPAT
:
1500 struct timeval50
*tvp50
1501 = (struct timeval50
*)ap
->a_data
;
1502 timeval50_to_timeval(tvp50
, &tv
);
1505 goto segwait_common
;
1506 case LFCNSEGWAITALL
:
1510 tvp
= (struct timeval
*)ap
->a_data
;
1512 mutex_enter(&lfs_lock
);
1514 mutex_exit(&lfs_lock
);
1516 error
= lfs_segwait(fsidp
, tvp
);
1518 mutex_enter(&lfs_lock
);
1519 if (--fs
->lfs_sleepers
== 0)
1520 wakeup(&fs
->lfs_sleepers
);
1521 mutex_exit(&lfs_lock
);
1526 blkvp
= *(struct lfs_fcntl_markv
*)ap
->a_data
;
1528 blkcnt
= blkvp
.blkcnt
;
1529 if ((u_int
) blkcnt
> LFS_MARKV_MAXBLKCNT
)
1531 blkiov
= lfs_malloc(fs
, blkcnt
* sizeof(BLOCK_INFO
), LFS_NB_BLKIOV
);
1532 if ((error
= copyin(blkvp
.blkiov
, blkiov
,
1533 blkcnt
* sizeof(BLOCK_INFO
))) != 0) {
1534 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
1538 mutex_enter(&lfs_lock
);
1540 mutex_exit(&lfs_lock
);
1541 if (ap
->a_command
== LFCNBMAPV
)
1542 error
= lfs_bmapv(l
->l_proc
, fsidp
, blkiov
, blkcnt
);
1543 else /* LFCNMARKV */
1544 error
= lfs_markv(l
->l_proc
, fsidp
, blkiov
, blkcnt
);
1546 error
= copyout(blkiov
, blkvp
.blkiov
,
1547 blkcnt
* sizeof(BLOCK_INFO
));
1548 mutex_enter(&lfs_lock
);
1549 if (--fs
->lfs_sleepers
== 0)
1550 wakeup(&fs
->lfs_sleepers
);
1551 mutex_exit(&lfs_lock
);
1552 lfs_free(fs
, blkiov
, LFS_NB_BLKIOV
);
1557 * Flush dirops and write Ifile, allowing empty segments
1558 * to be immediately reclaimed.
1560 lfs_writer_enter(fs
, "pndirop");
1561 off
= fs
->lfs_offset
;
1562 lfs_seglock(fs
, SEGM_FORCE_CKP
| SEGM_CKP
);
1563 lfs_flush_dirops(fs
);
1564 LFS_CLEANERINFO(cip
, fs
, bp
);
1565 oclean
= cip
->clean
;
1566 LFS_SYNC_CLEANERINFO(cip
, fs
, bp
, 1);
1567 lfs_segwrite(ap
->a_vp
->v_mount
, SEGM_FORCE_CKP
);
1568 fs
->lfs_sp
->seg_flags
|= SEGM_PROT
;
1570 lfs_writer_leave(fs
);
1573 LFS_CLEANERINFO(cip
, fs
, bp
);
1574 DLOG((DLOG_CLEAN
, "lfs_fcntl: reclaim wrote %" PRId64
1575 " blocks, cleaned %" PRId32
" segments (activesb %d)\n",
1576 fs
->lfs_offset
- off
, cip
->clean
- oclean
,
1578 LFS_SYNC_CLEANERINFO(cip
, fs
, bp
, 0);
1583 case LFCNIFILEFH_COMPAT
:
1584 /* Return the filehandle of the Ifile */
1585 if ((error
= kauth_authorize_system(l
->l_cred
,
1586 KAUTH_SYSTEM_FILEHANDLE
, 0, NULL
, NULL
, NULL
)) != 0)
1588 fhp
= (struct fhandle
*)ap
->a_data
;
1589 fhp
->fh_fsid
= *fsidp
;
1590 fh_size
= 16; /* former VFS_MAXFIDSIZ */
1591 return lfs_vptofh(fs
->lfs_ivnode
, &(fhp
->fh_fid
), &fh_size
);
1593 case LFCNIFILEFH_COMPAT2
:
1595 /* Return the filehandle of the Ifile */
1596 fhp
= (struct fhandle
*)ap
->a_data
;
1597 fhp
->fh_fsid
= *fsidp
;
1598 fh_size
= sizeof(struct lfs_fhandle
) -
1599 offsetof(fhandle_t
, fh_fid
);
1600 return lfs_vptofh(fs
->lfs_ivnode
, &(fhp
->fh_fid
), &fh_size
);
1603 /* Move lfs_offset to the lowest-numbered segment */
1604 return lfs_rewind(fs
, *(int *)ap
->a_data
);
1607 /* Mark a segment SEGUSE_INVAL */
1608 LFS_SEGENTRY(sup
, fs
, *(int *)ap
->a_data
, bp
);
1609 if (sup
->su_nbytes
> 0) {
1611 lfs_unset_inval_all(fs
);
1614 sup
->su_flags
|= SEGUSE_INVAL
;
1615 VOP_BWRITE(bp
->b_vp
, bp
);
1619 /* Resize the filesystem */
1620 return lfs_resize_fs(fs
, *(int *)ap
->a_data
);
1623 case LFCNWRAPSTOP_COMPAT
:
1625 * Hold lfs_newseg at segment 0; if requested, sleep until
1626 * the filesystem wraps around. To support external agents
1627 * (dump, fsck-based regression test) that need to look at
1628 * a snapshot of the filesystem, without necessarily
1629 * requiring that all fs activity stops.
1631 if (fs
->lfs_stoplwp
== curlwp
)
1634 mutex_enter(&lfs_lock
);
1635 while (fs
->lfs_stoplwp
!= NULL
)
1636 cv_wait(&fs
->lfs_stopcv
, &lfs_lock
);
1637 fs
->lfs_stoplwp
= curlwp
;
1638 if (fs
->lfs_nowrap
== 0)
1639 log(LOG_NOTICE
, "%s: disabled log wrap\n", fs
->lfs_fsmnt
);
1641 if (*(int *)ap
->a_data
== 1
1642 || ap
->a_command
== LFCNWRAPSTOP_COMPAT
) {
1643 log(LOG_NOTICE
, "LFCNSTOPWRAP waiting for log wrap\n");
1644 error
= mtsleep(&fs
->lfs_nowrap
, PCATCH
| PUSER
,
1645 "segwrap", 0, &lfs_lock
);
1646 log(LOG_NOTICE
, "LFCNSTOPWRAP done waiting\n");
1648 lfs_wrapgo(fs
, VTOI(ap
->a_vp
), 0);
1651 mutex_exit(&lfs_lock
);
1655 case LFCNWRAPGO_COMPAT
:
1657 * Having done its work, the agent wakes up the writer.
1658 * If the argument is 1, it sleeps until a new segment
1661 mutex_enter(&lfs_lock
);
1662 error
= lfs_wrapgo(fs
, VTOI(ap
->a_vp
),
1663 ap
->a_command
== LFCNWRAPGO_COMPAT
? 1 :
1664 *((int *)ap
->a_data
));
1665 mutex_exit(&lfs_lock
);
1669 if ((VTOI(ap
->a_vp
)->i_lfs_iflags
& LFSI_WRAPWAIT
))
1671 mutex_enter(&lfs_lock
);
1672 if (fs
->lfs_stoplwp
!= curlwp
) {
1673 mutex_exit(&lfs_lock
);
1676 if (fs
->lfs_nowrap
== 0) {
1677 mutex_exit(&lfs_lock
);
1680 fs
->lfs_wrappass
= 1;
1681 wakeup(&fs
->lfs_wrappass
);
1682 /* Wait for the log to wrap, if asked */
1683 if (*(int *)ap
->a_data
) {
1684 mutex_enter(ap
->a_vp
->v_interlock
);
1686 VTOI(ap
->a_vp
)->i_lfs_iflags
|= LFSI_WRAPWAIT
;
1687 log(LOG_NOTICE
, "LFCNPASS waiting for log wrap\n");
1688 error
= mtsleep(&fs
->lfs_nowrap
, PCATCH
| PUSER
,
1689 "segwrap", 0, &lfs_lock
);
1690 log(LOG_NOTICE
, "LFCNPASS done waiting\n");
1691 VTOI(ap
->a_vp
)->i_lfs_iflags
&= ~LFSI_WRAPWAIT
;
1692 lfs_vunref(ap
->a_vp
);
1694 mutex_exit(&lfs_lock
);
1697 case LFCNWRAPSTATUS
:
1698 mutex_enter(&lfs_lock
);
1699 *(int *)ap
->a_data
= fs
->lfs_wrapstatus
;
1700 mutex_exit(&lfs_lock
);
1704 return ufs_fcntl(v
);
1710 lfs_getpages(void *v
)
1712 struct vop_getpages_args
/* {
1715 struct vm_page **a_m;
1718 vm_prot_t a_access_type;
1723 if (VTOI(ap
->a_vp
)->i_number
== LFS_IFILE_INUM
&&
1724 (ap
->a_access_type
& VM_PROT_WRITE
) != 0) {
1727 if ((ap
->a_access_type
& VM_PROT_WRITE
) != 0) {
1728 mutex_enter(&lfs_lock
);
1729 LFS_SET_UINO(VTOI(ap
->a_vp
), IN_MODIFIED
);
1730 mutex_exit(&lfs_lock
);
1734 * we're relying on the fact that genfs_getpages() always reads in
1735 * entire filesystem blocks.
1737 return genfs_getpages(v
);
1741 * Wait for a page to become unbusy, possibly printing diagnostic messages
1744 * Called with vp->v_interlock held; return with it held.
1747 wait_for_page(struct vnode
*vp
, struct vm_page
*pg
, const char *label
)
1749 if ((pg
->flags
& PG_BUSY
) == 0)
1750 return; /* Nothing to wait for! */
1752 #if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
1753 static struct vm_page
*lastpg
;
1755 if (label
!= NULL
&& pg
!= lastpg
) {
1756 if (pg
->owner_tag
) {
1757 printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
1758 curproc
->p_pid
, curlwp
->l_lid
, label
,
1759 pg
, pg
->owner
, pg
->lowner
, pg
->owner_tag
);
1761 printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
1762 curproc
->p_pid
, curlwp
->l_lid
, label
, pg
);
1768 pg
->flags
|= PG_WANTED
;
1769 UVM_UNLOCK_AND_WAIT(pg
, vp
->v_interlock
, 0, "lfsput", 0);
1770 mutex_enter(vp
->v_interlock
);
1774 * This routine is called by lfs_putpages() when it can't complete the
1775 * write because a page is busy. This means that either (1) someone,
1776 * possibly the pagedaemon, is looking at this page, and will give it up
1777 * presently; or (2) we ourselves are holding the page busy in the
1778 * process of being written (either gathered or actually on its way to
1779 * disk). We don't need to give up the segment lock, but we might need
1780 * to call lfs_writeseg() to expedite the page's journey to disk.
1782 * Called with vp->v_interlock held; return with it held.
1784 /* #define BUSYWAIT */
1786 write_and_wait(struct lfs
*fs
, struct vnode
*vp
, struct vm_page
*pg
,
1787 int seglocked
, const char *label
)
1790 struct inode
*ip
= VTOI(vp
);
1791 struct segment
*sp
= fs
->lfs_sp
;
1797 while (pg
->flags
& PG_BUSY
&&
1798 pg
->uobject
== &vp
->v_uobj
) {
1799 mutex_exit(vp
->v_interlock
);
1800 if (sp
->cbpp
- sp
->bpp
> 1) {
1801 /* Write gathered pages */
1803 lfs_release_finfo(fs
);
1804 (void) lfs_writeseg(fs
, sp
);
1809 KASSERT(sp
->vp
== vp
);
1810 lfs_acquire_finfo(fs
, ip
->i_number
,
1814 mutex_enter(vp
->v_interlock
);
1815 wait_for_page(vp
, pg
, label
);
1817 if (label
!= NULL
&& count
> 1)
1818 printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc
->p_pid
,
1819 label
, (count
> 0 ? "looping, " : ""), count
);
1826 * Make sure that for all pages in every block in the given range,
1827 * either all are dirty or all are clean. If any of the pages
1828 * we've seen so far are dirty, put the vnode on the paging chain,
1829 * and mark it IN_PAGING.
1831 * If checkfirst != 0, don't check all the pages but return at the
1835 check_dirty(struct lfs
*fs
, struct vnode
*vp
,
1836 off_t startoffset
, off_t endoffset
, off_t blkeof
,
1837 int flags
, int checkfirst
, struct vm_page
**pgp
)
1840 struct vm_page
*curpg
= NULL
; /* XXX: gcc */
1841 struct vm_page
*pgs
[MAXBSIZE
/ PAGE_SIZE
], *pg
;
1842 off_t soff
= 0; /* XXX: gcc */
1846 int any_dirty
; /* number of dirty pages */
1847 int dirty
; /* number of dirty pages in a block */
1849 int pages_per_block
= fs
->lfs_bsize
>> PAGE_SHIFT
;
1850 int pagedaemon
= (curlwp
== uvm
.pagedaemon_lwp
);
1852 ASSERT_MAYBE_SEGLOCK(fs
);
1854 by_list
= (vp
->v_uobj
.uo_npages
<=
1855 ((endoffset
- startoffset
) >> PAGE_SHIFT
) *
1856 UVM_PAGE_TREE_PENALTY
);
1860 curpg
= TAILQ_FIRST(&vp
->v_uobj
.memq
);
1864 while (by_list
|| soff
< MIN(blkeof
, endoffset
)) {
1867 * Find the first page in a block. Skip
1868 * blocks outside our area of interest or beyond
1871 KASSERT(curpg
== NULL
1872 || (curpg
->flags
& PG_MARKER
) == 0);
1873 if (pages_per_block
> 1) {
1875 ((curpg
->offset
& fs
->lfs_bmask
) ||
1876 curpg
->offset
>= vp
->v_size
||
1877 curpg
->offset
>= endoffset
)) {
1878 curpg
= TAILQ_NEXT(curpg
, listq
.queue
);
1879 KASSERT(curpg
== NULL
||
1880 (curpg
->flags
& PG_MARKER
) == 0);
1885 soff
= curpg
->offset
;
1889 * Mark all pages in extended range busy; find out if any
1890 * of them are dirty.
1892 nonexistent
= dirty
= 0;
1893 for (i
= 0; i
== 0 || i
< pages_per_block
; i
++) {
1894 if (by_list
&& pages_per_block
<= 1) {
1895 pgs
[i
] = pg
= curpg
;
1897 off
= soff
+ (i
<< PAGE_SHIFT
);
1898 pgs
[i
] = pg
= uvm_pagelookup(&vp
->v_uobj
, off
);
1904 KASSERT(pg
!= NULL
);
1907 * If we're holding the segment lock, we can deadlock
1908 * against a process that has our page and is waiting
1909 * for the cleaner, while the cleaner waits for the
1910 * segment lock. Just bail in that case.
1912 if ((pg
->flags
& PG_BUSY
) &&
1913 (pagedaemon
|| LFS_SEGLOCK_HELD(fs
))) {
1915 uvm_page_unbusy(pgs
, i
);
1916 DLOG((DLOG_PAGE
, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
1922 while (pg
->flags
& PG_BUSY
) {
1923 wait_for_page(vp
, pg
, NULL
);
1925 uvm_page_unbusy(pgs
, i
);
1928 pg
->flags
|= PG_BUSY
;
1929 UVM_PAGE_OWN(pg
, "lfs_putpages");
1931 pmap_page_protect(pg
, VM_PROT_NONE
);
1932 tdirty
= (pmap_clear_modify(pg
) ||
1933 (pg
->flags
& PG_CLEAN
) == 0);
1936 if (pages_per_block
> 0 && nonexistent
>= pages_per_block
) {
1938 curpg
= TAILQ_NEXT(curpg
, listq
.queue
);
1940 soff
+= fs
->lfs_bsize
;
1946 KASSERT(nonexistent
== 0);
1949 * If any are dirty make all dirty; unbusy them,
1950 * but if we were asked to clean, wire them so that
1951 * the pagedaemon doesn't bother us about them while
1952 * they're on their way to disk.
1954 for (i
= 0; i
== 0 || i
< pages_per_block
; i
++) {
1956 KASSERT(!((pg
->flags
& PG_CLEAN
) && (pg
->flags
& PG_DELWRI
)));
1958 pg
->flags
&= ~PG_CLEAN
;
1959 if (flags
& PGO_FREE
) {
1961 * Wire the page so that
1962 * pdaemon doesn't see it again.
1964 mutex_enter(&uvm_pageqlock
);
1966 mutex_exit(&uvm_pageqlock
);
1968 /* Suspended write flag */
1969 pg
->flags
|= PG_DELWRI
;
1972 if (pg
->flags
& PG_WANTED
)
1974 pg
->flags
&= ~(PG_WANTED
|PG_BUSY
);
1975 UVM_PAGE_OWN(pg
, NULL
);
1978 if (checkfirst
&& any_dirty
)
1982 curpg
= TAILQ_NEXT(curpg
, listq
.queue
);
1984 soff
+= MAX(PAGE_SIZE
, fs
->lfs_bsize
);
1992 * lfs_putpages functions like genfs_putpages except that
1994 * (1) It needs to bounds-check the incoming requests to ensure that
1995 * they are block-aligned; if they are not, expand the range and
1996 * do the right thing in case, e.g., the requested range is clean
1997 * but the expanded range is dirty.
1999 * (2) It needs to explicitly send blocks to be written when it is done.
2000 * If VOP_PUTPAGES is called without the seglock held, we simply take
2001 * the seglock and let lfs_segunlock wait for us.
2002 * XXX There might be a bad situation if we have to flush a vnode while
2003 * XXX lfs_markv is in operation. As of this writing we panic in this
2008 * (1) The caller does not hold any pages in this vnode busy. If it does,
2009 * there is a danger that when we expand the page range and busy the
2010 * pages we will deadlock.
2012 * (2) We are called with vp->v_interlock held; we must return with it
2015 * (3) We don't absolutely have to free pages right away, provided that
2016 * the request does not have PGO_SYNCIO. When the pagedaemon gives
2017 * us a request with PGO_FREE, we take the pages out of the paging
2018 * queue and wake up the writer, which will handle freeing them for us.
2020 * We ensure that for any filesystem block, all pages for that
2021 * block are either resident or not, even if those pages are higher
2022 * than EOF; that means that we will be getting requests to free
2023 * "unused" pages above EOF all the time, and should ignore them.
2025 * (4) If we are called with PGO_LOCKED, the finfo array we are to write
2026 * into has been set up for us by lfs_writefile. If not, we will
2027 * have to handle allocating and/or freeing an finfo entry.
2029 * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
2032 /* How many times to loop before we should start to worry */
2036 lfs_putpages(void *v
)
2039 struct vop_putpages_args
/* {
2049 off_t origoffset
, startoffset
, endoffset
, origendoffset
, blkeof
;
2050 off_t off
, max_endoffset
;
2051 bool seglocked
, sync
, pagedaemon
;
2052 struct vm_page
*pg
, *busypg
;
2053 UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist
);
2055 int debug_n_again
, debug_n_dirtyclean
;
2061 sync
= (ap
->a_flags
& PGO_SYNCIO
) != 0;
2062 pagedaemon
= (curlwp
== uvm
.pagedaemon_lwp
);
2064 /* Putpages does nothing for metadata. */
2065 if (vp
== fs
->lfs_ivnode
|| vp
->v_type
!= VREG
) {
2066 mutex_exit(vp
->v_interlock
);
2071 * If there are no pages, don't do anything.
2073 if (vp
->v_uobj
.uo_npages
== 0) {
2074 if (TAILQ_EMPTY(&vp
->v_uobj
.memq
) &&
2075 (vp
->v_iflag
& VI_ONWORKLST
) &&
2076 LIST_FIRST(&vp
->v_dirtyblkhd
) == NULL
) {
2077 vp
->v_iflag
&= ~VI_WRMAPDIRTY
;
2078 vn_syncer_remove_from_worklist(vp
);
2080 mutex_exit(vp
->v_interlock
);
2082 /* Remove us from paging queue, if we were on it */
2083 mutex_enter(&lfs_lock
);
2084 if (ip
->i_flags
& IN_PAGING
) {
2085 ip
->i_flags
&= ~IN_PAGING
;
2086 TAILQ_REMOVE(&fs
->lfs_pchainhd
, ip
, i_lfs_pchain
);
2088 mutex_exit(&lfs_lock
);
2092 blkeof
= blkroundup(fs
, ip
->i_size
);
2095 * Ignore requests to free pages past EOF but in the same block
2096 * as EOF, unless the request is synchronous. (If the request is
2097 * sync, it comes from lfs_truncate.)
2098 * XXXUBC Make these pages look "active" so the pagedaemon won't
2099 * XXXUBC bother us with them again.
2101 if (!sync
&& ap
->a_offlo
>= ip
->i_size
&& ap
->a_offlo
< blkeof
) {
2102 origoffset
= ap
->a_offlo
;
2103 for (off
= origoffset
; off
< blkeof
; off
+= fs
->lfs_bsize
) {
2104 pg
= uvm_pagelookup(&vp
->v_uobj
, off
);
2105 KASSERT(pg
!= NULL
);
2106 while (pg
->flags
& PG_BUSY
) {
2107 pg
->flags
|= PG_WANTED
;
2108 UVM_UNLOCK_AND_WAIT(pg
, vp
->v_interlock
, 0,
2110 mutex_enter(vp
->v_interlock
);
2112 mutex_enter(&uvm_pageqlock
);
2113 uvm_pageactivate(pg
);
2114 mutex_exit(&uvm_pageqlock
);
2116 ap
->a_offlo
= blkeof
;
2117 if (ap
->a_offhi
> 0 && ap
->a_offhi
<= ap
->a_offlo
) {
2118 mutex_exit(vp
->v_interlock
);
2124 * Extend page range to start and end at block boundaries.
2125 * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
2127 origoffset
= ap
->a_offlo
;
2128 origendoffset
= ap
->a_offhi
;
2129 startoffset
= origoffset
& ~(fs
->lfs_bmask
);
2130 max_endoffset
= (trunc_page(LLONG_MAX
) >> fs
->lfs_bshift
)
2133 if (origendoffset
== 0 || ap
->a_flags
& PGO_ALLPAGES
) {
2134 endoffset
= max_endoffset
;
2135 origendoffset
= endoffset
;
2137 origendoffset
= round_page(ap
->a_offhi
);
2138 endoffset
= round_page(blkroundup(fs
, origendoffset
));
2141 KASSERT(startoffset
> 0 || endoffset
>= startoffset
);
2142 if (startoffset
== endoffset
) {
2143 /* Nothing to do, why were we called? */
2144 mutex_exit(vp
->v_interlock
);
2145 DLOG((DLOG_PAGE
, "lfs_putpages: startoffset = endoffset = %"
2146 PRId64
"\n", startoffset
));
2150 ap
->a_offlo
= startoffset
;
2151 ap
->a_offhi
= endoffset
;
2154 * If not cleaning, just send the pages through genfs_putpages
2155 * to be returned to the pool.
2157 if (!(ap
->a_flags
& PGO_CLEANIT
))
2158 return genfs_putpages(v
);
2160 /* Set PGO_BUSYFAIL to avoid deadlocks */
2161 ap
->a_flags
|= PGO_BUSYFAIL
;
2164 * Likewise, if we are asked to clean but the pages are not
2165 * dirty, we can just free them using genfs_putpages.
2168 debug_n_dirtyclean
= 0;
2173 /* Count the number of dirty pages */
2174 r
= check_dirty(fs
, vp
, startoffset
, endoffset
, blkeof
,
2175 ap
->a_flags
, 1, NULL
);
2177 /* Pages are busy with another process */
2178 mutex_exit(vp
->v_interlock
);
2181 if (r
> 0) /* Some pages are dirty */
2185 * Sometimes pages are dirtied between the time that
2186 * we check and the time we try to clean them.
2187 * Instruct lfs_gop_write to return EDEADLK in this case
2188 * so we can write them properly.
2190 ip
->i_lfs_iflags
|= LFSI_NO_GOP_WRITE
;
2191 r
= genfs_do_putpages(vp
, startoffset
, endoffset
,
2192 ap
->a_flags
& ~PGO_SYNCIO
, &busypg
);
2193 ip
->i_lfs_iflags
&= ~LFSI_NO_GOP_WRITE
;
2197 /* One of the pages was busy. Start over. */
2198 mutex_enter(vp
->v_interlock
);
2199 wait_for_page(vp
, busypg
, "dirtyclean");
2201 ++debug_n_dirtyclean
;
2206 if (debug_n_dirtyclean
> TOOMANY
)
2207 printf("lfs_putpages: dirtyclean: looping, n = %d\n",
2208 debug_n_dirtyclean
);
2212 * Dirty and asked to clean.
2214 * Pagedaemon can't actually write LFS pages; wake up
2215 * the writer to take care of that. The writer will
2216 * notice the pager inode queue and act on that.
2218 * XXX We must drop the vp->v_interlock before taking the lfs_lock or we
2219 * get a nasty deadlock with lfs_flush_pchain().
2222 mutex_exit(vp
->v_interlock
);
2223 mutex_enter(&lfs_lock
);
2224 if (!(ip
->i_flags
& IN_PAGING
)) {
2225 ip
->i_flags
|= IN_PAGING
;
2226 TAILQ_INSERT_TAIL(&fs
->lfs_pchainhd
, ip
, i_lfs_pchain
);
2228 wakeup(&lfs_writer_daemon
);
2229 mutex_exit(&lfs_lock
);
2235 * If this is a file created in a recent dirop, we can't flush its
2236 * inode until the dirop is complete. Drain dirops, then flush the
2237 * filesystem (taking care of any other pending dirops while we're
2240 if ((ap
->a_flags
& (PGO_CLEANIT
|PGO_LOCKED
)) == PGO_CLEANIT
&&
2241 (vp
->v_uflag
& VU_DIROP
)) {
2244 DLOG((DLOG_PAGE
, "lfs_putpages: flushing VU_DIROP\n"));
2245 /* XXX VOP_ISLOCKED() may not be used for lock decisions. */
2246 locked
= (VOP_ISLOCKED(vp
) == LK_EXCLUSIVE
);
2247 mutex_exit(vp
->v_interlock
);
2248 lfs_writer_enter(fs
, "ppdirop");
2250 VOP_UNLOCK(vp
); /* XXX why? */
2252 mutex_enter(&lfs_lock
);
2253 lfs_flush_fs(fs
, sync
? SEGM_SYNC
: 0);
2254 mutex_exit(&lfs_lock
);
2257 VOP_LOCK(vp
, LK_EXCLUSIVE
);
2258 mutex_enter(vp
->v_interlock
);
2259 lfs_writer_leave(fs
);
2261 /* XXX the flush should have taken care of this one too! */
2265 * This is it. We are going to write some pages. From here on
2266 * down it's all just mechanics.
2268 * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
2270 ap
->a_flags
&= ~PGO_SYNCIO
;
2273 * If we've already got the seglock, flush the node and return.
2274 * The FIP has already been set up for us by lfs_writefile,
2275 * and FIP cleanup and lfs_updatemeta will also be done there,
2276 * unless genfs_putpages returns EDEADLK; then we must flush
2277 * what we have, and correct FIP and segment header accounting.
2281 * If we are not called with the segment locked, lock it.
2282 * Account for a new FIP in the segment header, and set sp->vp.
2283 * (This should duplicate the setup at the top of lfs_writefile().)
2285 seglocked
= (ap
->a_flags
& PGO_LOCKED
) != 0;
2287 mutex_exit(vp
->v_interlock
);
2288 error
= lfs_seglock(fs
, SEGM_PROT
| (sync
? SEGM_SYNC
: 0));
2291 mutex_enter(vp
->v_interlock
);
2292 lfs_acquire_finfo(fs
, ip
->i_number
, ip
->i_gen
);
2295 KASSERT(sp
->vp
== NULL
);
2299 * Ensure that the partial segment is marked SS_DIROP if this
2302 if (!seglocked
&& vp
->v_uflag
& VU_DIROP
)
2303 ((SEGSUM
*)(sp
->segsum
))->ss_flags
|= (SS_DIROP
|SS_CONT
);
2306 * Loop over genfs_putpages until all pages are gathered.
2307 * genfs_putpages() drops the interlock, so reacquire it if necessary.
2308 * Whenever we lose the interlock we have to rerun check_dirty, as
2309 * well, since more pages might have been dirtied in our absence.
2316 if (check_dirty(fs
, vp
, startoffset
, endoffset
, blkeof
,
2317 ap
->a_flags
, 0, &busypg
) < 0) {
2318 mutex_exit(vp
->v_interlock
);
2320 mutex_enter(vp
->v_interlock
);
2321 write_and_wait(fs
, vp
, busypg
, seglocked
, NULL
);
2323 mutex_exit(vp
->v_interlock
);
2324 lfs_release_finfo(fs
);
2326 mutex_enter(vp
->v_interlock
);
2333 error
= genfs_do_putpages(vp
, startoffset
, endoffset
,
2334 ap
->a_flags
, &busypg
);
2336 if (error
== EDEADLK
|| error
== EAGAIN
) {
2337 DLOG((DLOG_PAGE
, "lfs_putpages: genfs_putpages returned"
2338 " %d ino %d off %x (seg %d)\n", error
,
2339 ip
->i_number
, fs
->lfs_offset
,
2340 dtosn(fs
, fs
->lfs_offset
)));
2342 mutex_enter(vp
->v_interlock
);
2343 write_and_wait(fs
, vp
, busypg
, seglocked
, "again");
2348 } while (error
== EDEADLK
);
2350 if (debug_n_again
> TOOMANY
)
2351 printf("lfs_putpages: again: looping, n = %d\n", debug_n_again
);
2354 KASSERT(sp
!= NULL
&& sp
->vp
== vp
);
2358 /* Write indirect blocks as well */
2359 lfs_gather(fs
, fs
->lfs_sp
, vp
, lfs_match_indir
);
2360 lfs_gather(fs
, fs
->lfs_sp
, vp
, lfs_match_dindir
);
2361 lfs_gather(fs
, fs
->lfs_sp
, vp
, lfs_match_tindir
);
2363 KASSERT(sp
->vp
== NULL
);
2368 * Blocks are now gathered into a segment waiting to be written.
2369 * All that's left to do is update metadata, and write them.
2372 KASSERT(sp
->vp
== vp
);
2376 * If we were called from lfs_writefile, we don't need to clean up
2377 * the FIP or unlock the segment lock. We're done.
2382 /* Clean up FIP and send it to disk. */
2383 lfs_release_finfo(fs
);
2384 lfs_writeseg(fs
, fs
->lfs_sp
);
2387 * Remove us from paging queue if we wrote all our pages.
2389 if (origendoffset
== 0 || ap
->a_flags
& PGO_ALLPAGES
) {
2390 mutex_enter(&lfs_lock
);
2391 if (ip
->i_flags
& IN_PAGING
) {
2392 ip
->i_flags
&= ~IN_PAGING
;
2393 TAILQ_REMOVE(&fs
->lfs_pchainhd
, ip
, i_lfs_pchain
);
2395 mutex_exit(&lfs_lock
);
2399 * XXX - with the malloc/copy writeseg, the pages are freed by now
2400 * even if we don't wait (e.g. if we hold a nested lock). This
2401 * will not be true if we stop using malloc/copy.
2403 KASSERT(fs
->lfs_sp
->seg_flags
& SEGM_PROT
);
2407 * Wait for v_numoutput to drop to zero. The seglock should
2408 * take care of this, but there is a slight possibility that
2409 * aiodoned might not have got around to our buffers yet.
2412 mutex_enter(vp
->v_interlock
);
2413 while (vp
->v_numoutput
> 0) {
2414 DLOG((DLOG_PAGE
, "lfs_putpages: ino %d sleeping on"
2415 " num %d\n", ip
->i_number
, vp
->v_numoutput
));
2416 cv_wait(&vp
->v_cv
, vp
->v_interlock
);
2418 mutex_exit(vp
->v_interlock
);
2424 * Return the last logical file offset that should be written for this file
2425 * if we're doing a write that ends at "size". If writing, we need to know
2426 * about sizes on disk, i.e. fragments if there are any; if reading, we need
2427 * to know about entire blocks.
2430 lfs_gop_size(struct vnode
*vp
, off_t size
, off_t
*eobp
, int flags
)
2432 struct inode
*ip
= VTOI(vp
);
2433 struct lfs
*fs
= ip
->i_lfs
;
2436 olbn
= lblkno(fs
, ip
->i_size
);
2437 nlbn
= lblkno(fs
, size
);
2438 if (!(flags
& GOP_SIZE_MEM
) && nlbn
< NDADDR
&& olbn
<= nlbn
) {
2439 *eobp
= fragroundup(fs
, size
);
2441 *eobp
= blkroundup(fs
, size
);
2446 void lfs_dump_vop(void *);
2449 lfs_dump_vop(void *v
)
2451 struct vop_putpages_args
/* {
2459 vfs_vnode_print(ap
->a_vp
, 0, printf
);
2461 lfs_dump_dinode(VTOI(ap
->a_vp
)->i_din
.ffs1_din
);
2468 struct vop_mmap_args
/* {
2469 const struct vnodeop_desc *a_desc;
2472 kauth_cred_t a_cred;
2475 if (VTOI(ap
->a_vp
)->i_number
== LFS_IFILE_INUM
)