1 /* $NetBSD: lfs_vfsops.c,v 1.345 2015/09/01 06:16:59 dholland Exp $ */
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
5 * The NetBSD Foundation, Inc.
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Konrad E. Schroder <perseant@hhhh.org>.
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 1989, 1991, 1993, 1994
34 * The Regents of the University of California. All rights reserved.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95
63 #include <sys/cdefs.h>
64 __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.345 2015/09/01 06:16:59 dholland Exp $");
66 #if defined(_KERNEL_OPT)
68 #include "opt_quota.h"
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/namei.h>
75 #include <sys/kernel.h>
76 #include <sys/vnode.h>
77 #include <sys/mount.h>
78 #include <sys/kthread.h>
80 #include <sys/device.h>
83 #include <sys/disklabel.h>
84 #include <sys/ioctl.h>
85 #include <sys/errno.h>
86 #include <sys/malloc.h>
88 #include <sys/socket.h>
89 #include <sys/syslog.h>
90 #include <uvm/uvm_extern.h>
91 #include <sys/sysctl.h>
93 #include <sys/kauth.h>
94 #include <sys/module.h>
95 #include <sys/syscallvar.h>
96 #include <sys/syscall.h>
97 #include <sys/syscallargs.h>
99 #include <miscfs/specfs/specdev.h>
101 #include <ufs/lfs/ulfs_quotacommon.h>
102 #include <ufs/lfs/ulfs_inode.h>
103 #include <ufs/lfs/ulfsmount.h>
104 #include <ufs/lfs/ulfs_bswap.h>
105 #include <ufs/lfs/ulfs_extern.h>
108 #include <uvm/uvm_stat.h>
109 #include <uvm/uvm_pager.h>
110 #include <uvm/uvm_pdaemon.h>
112 #include <ufs/lfs/lfs.h>
113 #include <ufs/lfs/lfs_accessors.h>
114 #include <ufs/lfs/lfs_kernel.h>
115 #include <ufs/lfs/lfs_extern.h>
117 #include <miscfs/genfs/genfs.h>
118 #include <miscfs/genfs/genfs_node.h>
/*
 * Module registration and file-scope state.
 *
 * MODULE() declares this file as a VFS-class kernel module named "lfs";
 * the third argument is NULL here (NOTE(review): presumably "no required
 * modules" -- confirm against module(9)).
 */
120 MODULE(MODULE_CLASS_VFS
, lfs
, NULL
);
/* Forward declarations of helpers private to this file. */
122 static int lfs_gop_write(struct vnode
*, struct vm_page
**, int, int);
123 static int lfs_mountfs(struct vnode
*, struct mount
*, struct lwp
*);
/* sysctl log handle; passed to lfs_sysctl_setup() and sysctl_teardown(). */
125 static struct sysctllog
*lfs_sysctl_log
;
/* Vnode operation vector descriptors, defined outside this file. */
127 extern const struct vnodeopv_desc lfs_vnodeop_opv_desc
;
128 extern const struct vnodeopv_desc lfs_specop_opv_desc
;
129 extern const struct vnodeopv_desc lfs_fifoop_opv_desc
;
/*
 * Identity of the writer daemon thread (pid and lwp id); both are zero
 * when no daemon is recorded.  lfs_do_flush is one of the conditions
 * lfs_writerd() tests before flushing everything.
 */
131 pid_t lfs_writer_daemon
= 0;
132 lwpid_t lfs_writer_lid
= 0;
133 int lfs_do_flush
= 0;
134 #ifdef LFS_KERNEL_RFW
/*
 * Table of the vnode operation vector descriptors used by LFS (regular,
 * special-device and FIFO vnodes).  It is referenced from lfs_vfsops via
 * .vfs_opv_descs.  The terminating entry and closing brace are not
 * visible in this excerpt.
 */
138 const struct vnodeopv_desc
* const lfs_vnodeopv_descs
[] = {
139 &lfs_vnodeop_opv_desc
,
140 &lfs_specop_opv_desc
,
141 &lfs_fifoop_opv_desc
,
/*
 * VFS operations vector for LFS; this is the object handed to
 * vfs_attach()/vfs_detach() in lfs_modcmd().
 *
 * Entries named lfs_* are LFS-specific, ulfs_* entries come from the
 * shared ulfs layer, and the rename-lock hooks are the genfs defaults.
 * Snapshot, suspendctl and fsync are not supported and are wired to
 * eopnotsupp.
 */
145 struct vfsops lfs_vfsops
= {
146 .vfs_name
= MOUNT_LFS
,
147 .vfs_min_mount_data
= sizeof (struct ulfs_args
),
148 .vfs_mount
= lfs_mount
,
149 .vfs_start
= ulfs_start
,
150 .vfs_unmount
= lfs_unmount
,
151 .vfs_root
= ulfs_root
,
152 .vfs_quotactl
= ulfs_quotactl
,
153 .vfs_statvfs
= lfs_statvfs
,
154 .vfs_sync
= lfs_sync
,
155 .vfs_vget
= lfs_vget
,
156 .vfs_loadvnode
= lfs_loadvnode
,
157 .vfs_newvnode
= lfs_newvnode
,
158 .vfs_fhtovp
= lfs_fhtovp
,
159 .vfs_vptofh
= lfs_vptofh
,
160 .vfs_init
= lfs_init
,
161 .vfs_reinit
= lfs_reinit
,
162 .vfs_done
= lfs_done
,
163 .vfs_mountroot
= lfs_mountroot
,
/* Unsupported operation: always fails via eopnotsupp. */
164 .vfs_snapshot
= (void *)eopnotsupp
,
165 .vfs_extattrctl
= lfs_extattrctl
,
/* Unsupported operation: always fails via eopnotsupp. */
166 .vfs_suspendctl
= (void *)eopnotsupp
,
167 .vfs_renamelock_enter
= genfs_renamelock_enter
,
168 .vfs_renamelock_exit
= genfs_renamelock_exit
,
/* Unsupported operation: always fails via eopnotsupp. */
169 .vfs_fsync
= (void *)eopnotsupp
,
170 .vfs_opv_descs
= lfs_vnodeopv_descs
/*
 * genfs callbacks for LFS vnodes.  Size and write handling are
 * LFS-specific (lfs_gop_size, and lfs_gop_write declared static in this
 * file); allocation and mark-update are taken from the ulfs layer.
 */
173 const struct genfs_ops lfs_genfsops
= {
174 .gop_size
= lfs_gop_size
,
175 .gop_alloc
= ulfs_gop_alloc
,
176 .gop_write
= lfs_gop_write
,
177 .gop_markupdate
= ulfs_gop_markupdate
,
/*
 * sysctl_lfs_dostats: handler installed on the "dostats" sysctl node
 * (see lfs_sysctl_setup()).
 *
 * It forwards the request to sysctl_lookup() and then, if lfs_dostats is
 * 0, zeroes the global lfs_stats structure.
 *
 * The return type, local declarations and return statements are not
 * visible in this excerpt.
 */
186 sysctl_lfs_dostats(SYSCTLFN_ARGS
)
188 extern struct lfs_stats lfs_stats
;
189 extern int lfs_dostats
;
192 error
= sysctl_lookup(SYSCTLFN_CALL(rnode
));
/*
 * Lookup failed, or no new value was supplied (newp == NULL).  The
 * statement guarded by this test is not visible here; presumably an
 * early return -- TODO confirm.
 */
193 if (error
|| newp
== NULL
)
/* Statistics are switched off: discard whatever was accumulated. */
196 if (lfs_dostats
== 0)
197 memset(&lfs_stats
, 0, sizeof(lfs_stats
));
/*
 * lfs_sysctl_setup: create the LFS sysctl subtree under CTL_VFS.
 *
 * clog: sysctl log that records every node created here, so that
 *       sysctl_teardown() (called from lfs_modcmd()) can remove them.
 *
 * Nodes created by the visible calls:
 *   - the LFS root node at { CTL_VFS, 5 },
 *   - read/write integer tunables: flushindir, clean_vnhead, dostats
 *     (handled by sysctl_lfs_dostats), pagetrip, ignore_lazy_sync and,
 *     under LFS_KERNEL_RFW, the lfs_do_rfw switch,
 *   - a "stats" subtree with one read-only node per counter,
 *   - a "debug" subtree with one read/write node per debug-log subsystem.
 *
 * Return values of sysctl_createv() are not checked on any visible line.
 * Several argument lines of the sysctl_createv() calls are missing from
 * this excerpt.
 */
203 lfs_sysctl_setup(struct sysctllog
**clog
)
206 extern int lfs_writeindir
, lfs_dostats
, lfs_clean_vnhead
,
207 lfs_fs_pagetrip
, lfs_ignore_lazy_sync
;
209 extern int lfs_debug_log_subsys
[DLOG_MAX
];
/*
 * Names and descriptions for the "debug" subtree.  Entry i describes
 * lfs_debug_log_subsys[i], so the order must track the DLOG_* constants.
 */
210 struct shortlong dlog_names
[DLOG_MAX
] = { /* Must match lfs.h ! */
211 { "rollforward", "Debug roll-forward code" },
212 { "alloc", "Debug inode allocation and free list" },
213 { "avail", "Debug space-available-now accounting" },
214 { "flush", "Debug flush triggers" },
215 { "lockedlist", "Debug locked list accounting" },
216 { "vnode_verbose", "Verbose per-vnode-written debugging" },
217 { "vnode", "Debug vnode use during segment write" },
218 { "segment", "Debug segment writing" },
219 { "seguse", "Debug segment used-bytes accounting" },
220 { "cleaner", "Debug cleaning routines" },
221 { "mount", "Debug mount/unmount routines" },
222 { "pagecache", "Debug UBC interactions" },
223 { "dirop", "Debug directory-operation accounting" },
224 { "malloc", "Debug private malloc accounting" },
/*
 * Names and descriptions for the "stats" subtree.  Entry i describes the
 * i-th u_int of struct lfs_stats, counting from the segsused member (see
 * the loop below).
 * NOTE(review): the loop bound is sizeof(struct lfs_stats) /
 * sizeof(u_int), not the size of this table; the two must be kept in
 * step or the loop reads past the end of stat_names.
 */
227 struct shortlong stat_names
[] = { /* Must match lfs.h! */
228 { "segsused", "Number of new segments allocated" },
229 { "psegwrites", "Number of partial-segment writes" },
230 { "psyncwrites", "Number of synchronous partial-segment"
232 { "pcleanwrites", "Number of partial-segment writes by the"
234 { "blocktot", "Number of blocks written" },
235 { "cleanblocks", "Number of blocks written by the cleaner" },
236 { "ncheckpoints", "Number of checkpoints made" },
237 { "nwrites", "Number of whole writes" },
238 { "nsync_writes", "Number of synchronous writes" },
239 { "wait_exceeded", "Number of times writer waited for"
241 { "write_exceeded", "Number of times writer invoked flush" },
242 { "flush_invoked", "Number of times flush was invoked" },
243 { "vflush_invoked", "Number of time vflush was called" },
244 { "clean_inlocked", "Number of vnodes skipped for being dead" },
245 { "clean_vnlocked", "Number of vnodes skipped for vget failure" },
246 { "segs_reclaimed", "Number of segments reclaimed" },
/* Root node of the LFS subtree: { CTL_VFS, 5 }. */
249 sysctl_createv(clog
, 0, NULL
, NULL
,
252 SYSCTL_DESCR("Log-structured file system"),
254 CTL_VFS
, 5, CTL_EOL
);
256 * XXX the "5" above could be dynamic, thereby eliminating one
257 * more instance of the "number to vfs" mapping problem, but
258 * "5" is the order as taken from sys/mount.h
/* Integer tunables, each bound directly to a kernel variable. */
261 sysctl_createv(clog
, 0, NULL
, NULL
,
262 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
263 CTLTYPE_INT
, "flushindir", NULL
,
264 NULL
, 0, &lfs_writeindir
, 0,
265 CTL_VFS
, 5, LFS_WRITEINDIR
, CTL_EOL
);
266 sysctl_createv(clog
, 0, NULL
, NULL
,
267 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
268 CTLTYPE_INT
, "clean_vnhead", NULL
,
269 NULL
, 0, &lfs_clean_vnhead
, 0,
270 CTL_VFS
, 5, LFS_CLEAN_VNHEAD
, CTL_EOL
);
/* "dostats" is the only tunable with a custom handler. */
271 sysctl_createv(clog
, 0, NULL
, NULL
,
272 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
273 CTLTYPE_INT
, "dostats",
274 SYSCTL_DESCR("Maintain statistics on LFS operations"),
275 sysctl_lfs_dostats
, 0, &lfs_dostats
, 0,
276 CTL_VFS
, 5, LFS_DOSTATS
, CTL_EOL
);
277 sysctl_createv(clog
, 0, NULL
, NULL
,
278 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
279 CTLTYPE_INT
, "pagetrip",
280 SYSCTL_DESCR("How many dirty pages in fs triggers"
282 NULL
, 0, &lfs_fs_pagetrip
, 0,
283 CTL_VFS
, 5, LFS_FS_PAGETRIP
, CTL_EOL
);
284 sysctl_createv(clog
, 0, NULL
, NULL
,
285 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
286 CTLTYPE_INT
, "ignore_lazy_sync",
287 SYSCTL_DESCR("Lazy Sync is ignored entirely"),
288 NULL
, 0, &lfs_ignore_lazy_sync
, 0,
289 CTL_VFS
, 5, LFS_IGNORE_LAZY_SYNC
, CTL_EOL
);
/* Roll-forward switch exists only when built with LFS_KERNEL_RFW. */
290 #ifdef LFS_KERNEL_RFW
291 sysctl_createv(clog
, 0, NULL
, NULL
,
292 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
294 SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
295 NULL
, 0, &lfs_do_rfw
, 0,
296 CTL_VFS
, 5, LFS_DO_RFW
, CTL_EOL
);
/*
 * "stats" subtree.
 * NOTE(review): the description string below is "Debugging options",
 * identical to the one on the "debug" node further down; it looks like a
 * copy-and-paste leftover -- confirm before changing the runtime string.
 */
299 sysctl_createv(clog
, 0, NULL
, NULL
,
301 CTLTYPE_NODE
, "stats",
302 SYSCTL_DESCR("Debugging options"),
304 CTL_VFS
, 5, LFS_STATS
, CTL_EOL
);
/*
 * One read-only node per counter: lfs_stats is walked as an array of
 * u_int starting at its segsused member.
 */
305 for (i
= 0; i
< sizeof(struct lfs_stats
) / sizeof(u_int
); i
++) {
306 sysctl_createv(clog
, 0, NULL
, NULL
,
307 CTLFLAG_PERMANENT
|CTLFLAG_READONLY
,
308 CTLTYPE_INT
, stat_names
[i
].sname
,
309 SYSCTL_DESCR(stat_names
[i
].lname
),
310 NULL
, 0, &(((u_int
*)&lfs_stats
.segsused
)[i
]),
311 0, CTL_VFS
, 5, LFS_STATS
, i
, CTL_EOL
);
/* "debug" subtree: one read/write switch per debug-log subsystem. */
315 sysctl_createv(clog
, 0, NULL
, NULL
,
317 CTLTYPE_NODE
, "debug",
318 SYSCTL_DESCR("Debugging options"),
320 CTL_VFS
, 5, LFS_DEBUGLOG
, CTL_EOL
);
321 for (i
= 0; i
< DLOG_MAX
; i
++) {
322 sysctl_createv(clog
, 0, NULL
, NULL
,
323 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
324 CTLTYPE_INT
, dlog_names
[i
].sname
,
325 SYSCTL_DESCR(dlog_names
[i
].lname
),
326 NULL
, 0, &(lfs_debug_log_subsys
[i
]), 0,
327 CTL_VFS
, 5, LFS_DEBUGLOG
, i
, CTL_EOL
);
/*
 * System calls provided by this module.  The table is passed to
 * syscall_establish()/syscall_disestablish() in lfs_modcmd().  Each entry
 * pairs a syscall number with its handler; the middle field is 0 in
 * every entry.  The table terminator is not visible in this excerpt.
 */
332 /* old cleaner syscall interface. see VOP_FCNTL() */
333 static const struct syscall_package lfs_syscalls
[] = {
334 { SYS_lfs_bmapv
, 0, (sy_call_t
*)sys_lfs_bmapv
},
335 { SYS_lfs_markv
, 0, (sy_call_t
*)sys_lfs_markv
},
336 { SYS___lfs_segwait50
, 0, (sy_call_t
*)sys___lfs_segwait50
},
337 { SYS_lfs_segclean
, 0, (sy_call_t
*)sys_lfs_segclean
},
/*
 * lfs_modcmd: module command handler for the lfs module.
 *
 * cmd: module command; MODULE_CMD_INIT and MODULE_CMD_FINI are the cases
 *      visible here.
 * arg: not used on any visible line.
 *
 * INIT: establish the LFS syscalls, attach lfs_vfsops, then create the
 *       sysctl tree.  The syscall_disestablish() call that follows
 *       vfs_attach() is presumably the roll-back for a failed attach;
 *       the guarding conditions are not visible -- TODO confirm.
 * FINI: detach lfs_vfsops, then remove the syscalls and the sysctl tree.
 *
 * The switch statement, error checks, default case and return statement
 * are not visible in this excerpt.
 */
342 lfs_modcmd(modcmd_t cmd
, void *arg
)
347 case MODULE_CMD_INIT
:
348 error
= syscall_establish(NULL
, lfs_syscalls
);
351 error
= vfs_attach(&lfs_vfsops
);
353 syscall_disestablish(NULL
, lfs_syscalls
);
356 lfs_sysctl_setup(&lfs_sysctl_log
);
358 case MODULE_CMD_FINI
:
359 error
= vfs_detach(&lfs_vfsops
);
362 syscall_disestablish(NULL
, lfs_syscalls
);
363 sysctl_teardown(&lfs_sysctl_log
);
374 * XXX Same structure as FFS inodes? Should we share a common pool?
/*
 * Memory pools for LFS objects.  Per the pool_init() calls later in this
 * file they hold, respectively: struct inode, union lfs_dinode,
 * struct lfs_inode_ext and struct lbnentry.
 */
376 struct pool lfs_inode_pool
;
377 struct pool lfs_dinode_pool
;
378 struct pool lfs_inoext_pool
;
379 struct pool lfs_lbnentry_pool
;
382 * The writer daemon. UVM keeps track of how many dirty pages we are holding
383 * in lfs_subsys_pages; the daemon flushes the filesystem when this value
384 * crosses the (user-defined) threshold LFS_MAX_PAGES.
/*
 * lfs_writerd: body of the "lfs_writer" kernel thread, created from
 * lfs_mountfs() via kthread_create().
 *
 * arg: thread argument; not used on any visible line (the creator
 *      passes NULL).
 *
 * Outline, as far as the visible lines show:
 *   1. Record this thread's pid and lwp id in lfs_writer_daemon and
 *      lfs_writer_lid under lfs_lock.
 *   2. Take a reference on the LFS vfsops via vfs_getopsbyname().
 *   3. Repeatedly:
 *      - if the previous pass wrote nothing, sleep on &lfs_writer_daemon
 *        with a timeout of hz/10 + 1 ticks;
 *      - if lfs_do_flush is set or a global buffer/byte/page limit is
 *        exceeded, call lfs_flush() for everything;
 *      - walk the mount list and, for each LFS mount, checkpoint, flush
 *        or drain the page chain as needed.
 *   4. When lfsc + skipc == 0, clear lfs_writer_daemon and drop the
 *      locks; the vfs_list_lock section at the end is commented as
 *      giving up the extra vfsops reference.
 *
 * The loop construct, several declarations and the thread-exit call are
 * not visible in this excerpt.
 */
387 lfs_writerd(void *arg
)
389 struct mount
*mp
, *nmp
;
391 struct vfsops
*vfs
= NULL
;
395 int wrote_something
= 0;
/* Publish our identity; lfs_mountfs() tests these before starting a daemon. */
397 mutex_enter(&lfs_lock
);
398 lfs_writer_daemon
= curproc
->p_pid
;
399 lfs_writer_lid
= curlwp
->l_lid
;
400 mutex_exit(&lfs_lock
);
402 /* Take an extra reference to the LFS vfsops. */
403 vfs
= vfs_getopsbyname(MOUNT_LFS
);
405 mutex_enter(&lfs_lock
);
407 KASSERT(mutex_owned(&lfs_lock
));
/* Idle only when the previous pass did no work. */
408 if (wrote_something
== 0)
409 mtsleep(&lfs_writer_daemon
, PVM
, "lfswriter", hz
/10 + 1,
412 KASSERT(mutex_owned(&lfs_lock
));
416 * If global state wants a flush, flush everything.
418 if (lfs_do_flush
|| locked_queue_count
> LFS_MAX_BUFS
||
419 locked_queue_bytes
> LFS_MAX_BYTES
||
420 lfs_subsys_pages
> LFS_MAX_PAGES
) {
/* Log which trigger(s) caused this global flush. */
423 DLOG((DLOG_FLUSH
, "lfs_writerd: lfs_do_flush\n"));
425 if (locked_queue_count
> LFS_MAX_BUFS
) {
426 DLOG((DLOG_FLUSH
, "lfs_writerd: lqc = %d, max %d\n",
427 locked_queue_count
, LFS_MAX_BUFS
));
429 if (locked_queue_bytes
> LFS_MAX_BYTES
) {
430 DLOG((DLOG_FLUSH
, "lfs_writerd: lqb = %ld, max %ld\n",
431 locked_queue_bytes
, LFS_MAX_BYTES
));
433 if (lfs_subsys_pages
> LFS_MAX_PAGES
) {
434 DLOG((DLOG_FLUSH
, "lfs_writerd: lssp = %d, max %d\n",
435 lfs_subsys_pages
, LFS_MAX_PAGES
));
438 lfs_flush(NULL
, SEGM_WRITERD
, 0);
440 KASSERT(mutex_owned(&lfs_lock
));
443 KASSERT(mutex_owned(&lfs_lock
));
444 mutex_exit(&lfs_lock
);
447 * Look through the list of LFSs to see if any of them
448 * have requested pageouts.
450 mutex_enter(&mountlist_lock
);
/* vfs_busy() also yields the next mount in nmp for the loop step. */
453 for (mp
= TAILQ_FIRST(&mountlist
); mp
!= NULL
; mp
= nmp
) {
454 if (vfs_busy(mp
, &nmp
)) {
458 KASSERT(!mutex_owned(&lfs_lock
));
/* Only LFS mounts are of interest; match on the fs type name. */
459 if (strncmp(mp
->mnt_stat
.f_fstypename
, MOUNT_LFS
,
460 sizeof(mp
->mnt_stat
.f_fstypename
)) == 0) {
462 fs
= VFSTOULFS(mp
)->um_lfs
;
464 fsflags
= SEGM_SINGLE
;
/* Remember the log offset so progress can be detected afterwards. */
466 mutex_enter(&lfs_lock
);
467 ooffset
= lfs_sb_getoffset(fs
);
469 if (lfs_sb_getnextseg(fs
) < lfs_sb_getcurseg(fs
) && fs
->lfs_nowrap
) {
470 /* Don't try to write if we're suspended */
471 mutex_exit(&lfs_lock
);
472 vfs_unbusy(mp
, false, &nmp
);
/* Out of clean segments: wake the cleaner instead of writing. */
475 if (LFS_STARVED_FOR_SEGS(fs
)) {
476 mutex_exit(&lfs_lock
);
478 DLOG((DLOG_FLUSH
, "lfs_writerd: need cleaning before writing possible\n"));
479 lfs_wakeup_cleaner(fs
);
480 vfs_unbusy(mp
, false, &nmp
);
/*
 * Too many pending directory-operation vnodes (per-fs or global) and no
 * dirop in progress: drop SEGM_SINGLE and flush the file system.
 */
484 if ((fs
->lfs_dirvcount
> LFS_MAX_FSDIROP(fs
) ||
485 lfs_dirvcount
> LFS_MAX_DIROP
) &&
486 fs
->lfs_dirops
== 0) {
487 fsflags
&= ~SEGM_SINGLE
;
489 DLOG((DLOG_FLUSH
, "lfs_writerd: checkpoint\n"));
490 lfs_flush_fs(fs
, fsflags
);
491 } else if (fs
->lfs_pdflush
) {
492 DLOG((DLOG_FLUSH
, "lfs_writerd: pdflush set\n"));
493 lfs_flush_fs(fs
, fsflags
);
494 } else if (!TAILQ_EMPTY(&fs
->lfs_pchainhd
)) {
495 DLOG((DLOG_FLUSH
, "lfs_writerd: pchain non-empty\n"));
/* lfs_lock is released around the page-chain flush and retaken after. */
496 mutex_exit(&lfs_lock
);
497 lfs_writer_enter(fs
, "wrdirop");
498 lfs_flush_pchain(fs
);
499 lfs_writer_leave(fs
);
500 mutex_enter(&lfs_lock
);
/*
 * The log offset moved, so something was written.  The guarded statement
 * is not visible here (presumably it bumps wrote_something -- TODO
 * confirm).
 */
502 if (lfs_sb_getoffset(fs
) != ooffset
)
504 mutex_exit(&lfs_lock
);
506 KASSERT(!mutex_owned(&lfs_lock
));
507 vfs_unbusy(mp
, false, &nmp
);
/*
 * lfsc and skipc are maintained on lines not visible here.  When both
 * are zero the daemon clears lfs_writer_daemon and releases its locks,
 * presumably before leaving the main loop -- TODO confirm.
 */
509 if (lfsc
+ skipc
== 0) {
510 mutex_enter(&lfs_lock
);
511 lfs_writer_daemon
= 0;
513 mutex_exit(&lfs_lock
);
514 mutex_exit(&mountlist_lock
);
517 mutex_exit(&mountlist_lock
);
519 mutex_enter(&lfs_lock
);
521 KASSERT(!mutex_owned(&lfs_lock
));
522 KASSERT(!mutex_owned(&mountlist_lock
));
524 /* Give up our extra reference so the module can be unloaded. */
525 mutex_enter(&vfs_list_lock
);
528 mutex_exit(&vfs_list_lock
);
535 * Initialize the filesystem, most work done by ulfs_init.
542 * XXX: should we use separate pools for 32-bit and 64-bit
/*
 * Global LFS initialisation statements.  The enclosing function header
 * is not visible in this excerpt; presumably this is the body of
 * lfs_init(), the .vfs_init entry of lfs_vfsops -- TODO confirm.
 *
 * Visible steps: attach the M_SEGMENT malloc type, create the four
 * object pools, clear lfs_log, and initialise lfs_lock and the two
 * condition variables.  Everything set up here is torn down by the
 * matching destroy calls further down the file.
 */
545 malloc_type_attach(M_SEGMENT
);
546 pool_init(&lfs_inode_pool
, sizeof(struct inode
), 0, 0, 0,
547 "lfsinopl", &pool_allocator_nointr
, IPL_NONE
);
548 pool_init(&lfs_dinode_pool
, sizeof(union lfs_dinode
), 0, 0, 0,
549 "lfsdinopl", &pool_allocator_nointr
, IPL_NONE
);
/* The only pool created with an explicit alignment argument (8). */
550 pool_init(&lfs_inoext_pool
, sizeof(struct lfs_inode_ext
), 8, 0, 0,
551 "lfsinoextpl", &pool_allocator_nointr
, IPL_NONE
);
552 pool_init(&lfs_lbnentry_pool
, sizeof(struct lbnentry
), 0, 0, 0,
553 "lfslbnpool", &pool_allocator_nointr
, IPL_NONE
);
557 memset(lfs_log
, 0, sizeof(lfs_log
));
559 mutex_init(&lfs_lock
, MUTEX_DEFAULT
, IPL_NONE
);
560 cv_init(&locked_queue_cv
, "lfsbuf");
561 cv_init(&lfs_writing_cv
, "lfsflush");
/*
 * Global LFS teardown statements.  The enclosing function header is not
 * visible in this excerpt; presumably this is the body of lfs_done(),
 * the .vfs_done entry of lfs_vfsops -- TODO confirm.
 *
 * Destroys lfs_lock, both condition variables and the four pools, then
 * detaches the M_SEGMENT malloc type, i.e. everything the
 * initialisation statements above created.
 */
574 mutex_destroy(&lfs_lock
);
575 cv_destroy(&locked_queue_cv
);
576 cv_destroy(&lfs_writing_cv
);
577 pool_destroy(&lfs_inode_pool
);
578 pool_destroy(&lfs_dinode_pool
);
579 pool_destroy(&lfs_inoext_pool
);
580 pool_destroy(&lfs_lbnentry_pool
);
581 malloc_type_detach(M_SEGMENT
);
585 * Called by main() when ulfs is going to be mounted as root.
/*
 * Root-mount statements.  The enclosing function header is not visible
 * in this excerpt; presumably this is the body of lfs_mountroot(), the
 * .vfs_mountroot entry of lfs_vfsops -- TODO confirm.
 *
 * Visible steps: require a disk-class root device and a valid rootdev,
 * allocate the root mount with vfs_rootmountalloc(), mount it through
 * lfs_mountfs() on rootvp, append it to the mount list, record the mount
 * point name in the superblock, refresh the statvfs data, and seed the
 * root file system time from the superblock timestamp.
 *
 * The statements executed on each failed check are not visible here.
 */
590 extern struct vnode
*rootvp
;
591 struct lfs
*fs
= NULL
; /* LFS */
593 struct lwp
*l
= curlwp
;
594 struct ulfsmount
*ump
;
597 if (device_class(root_device
) != DV_DISK
)
600 if (rootdev
== NODEV
)
602 if ((error
= vfs_rootmountalloc(MOUNT_LFS
, "root_device", &mp
))) {
/* On mount failure the busy reference on mp is released. */
606 if ((error
= lfs_mountfs(rootvp
, mp
, l
))) {
607 vfs_unbusy(mp
, false, NULL
);
611 mountlist_append(mp
);
614 lfs_sb_setfsmnt(fs
, mp
->mnt_stat
.f_mntonname
);
615 (void)lfs_statvfs(mp
, &mp
->mnt_stat
);
616 vfs_unbusy(mp
, false, NULL
);
617 setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp
)->um_lfs
));
/*
 * lfs_mount: the .vfs_mount entry of lfs_vfsops.
 *
 * mp:       mount structure being mounted or updated.
 * path:     mount point path; passed to set_statvfs_info() with
 *           UIO_USERSPACE.
 * data:     mount arguments, interpreted as struct ulfs_args.
 * data_len: in/out size of *data; must be at least sizeof *args, and is
 *           set to sizeof *args on the MNT_GETARGS path.
 *
 * Outline, as far as the visible lines show:
 *   - MNT_GETARGS: report the current arguments.
 *   - If args->fspec is given, look the device up, check that it is a
 *     block device known to bdevsw, and on update check that it is the
 *     device of the existing mount.
 *   - Authorize access to the device through kauth.
 *   - New mount: open the device and call lfs_mountfs(); close the
 *     device again if that fails.
 *   - Update: handle read/write <-> read-only transitions, including
 *     quota shutdown/startup and clearing LFS_PF_CLEAN in both
 *     superblocks when going read/write.
 *   - Record the mount point name via set_statvfs_info() and
 *     lfs_sb_setfsmnt().
 *
 * Many branch bodies, error returns and the final return are not visible
 * in this excerpt.
 */
627 lfs_mount(struct mount
*mp
, const char *path
, void *data
, size_t *data_len
)
629 struct lwp
*l
= curlwp
;
631 struct ulfs_args
*args
= data
;
632 struct ulfsmount
*ump
= NULL
;
633 struct lfs
*fs
= NULL
; /* LFS */
634 int error
= 0, update
;
/* Reject argument buffers too small to hold struct ulfs_args. */
639 if (*data_len
< sizeof *args
)
642 if (mp
->mnt_flag
& MNT_GETARGS
) {
647 *data_len
= sizeof *args
;
/* Non-zero when this call updates an existing mount. */
651 update
= mp
->mnt_flag
& MNT_UPDATE
;
653 /* Check arguments */
654 if (args
->fspec
!= NULL
) {
656 * Look up the name and verify that it's sane.
658 error
= namei_simple_user(args
->fspec
,
659 NSM_FOLLOW_NOEMULROOT
, &devvp
);
665 * Be sure this is a valid block device
667 if (devvp
->v_type
!= VBLK
)
669 else if (bdevsw_lookup(devvp
->v_rdev
) == NULL
)
673 * Be sure we're still naming the same device
674 * used for our initial mount
677 if (devvp
!= ump
->um_devvp
) {
678 if (devvp
->v_rdev
!= ump
->um_devvp
->v_rdev
)
682 devvp
= ump
->um_devvp
;
689 /* New mounts must have a filename for the device */
692 /* Use the extant mount */
694 devvp
= ump
->um_devvp
;
701 * If mount by non-root, then verify that user has necessary
702 * permissions on the device.
/*
 * Write access is requested when the mount will be writable: on update
 * that is IMNT_WANTRDWR, otherwise the absence of MNT_RDONLY.
 */
707 (mp
->mnt_iflag
& IMNT_WANTRDWR
) != 0 :
708 (mp
->mnt_flag
& MNT_RDONLY
) == 0)
709 accessmode
|= VWRITE
;
710 vn_lock(devvp
, LK_EXCLUSIVE
| LK_RETRY
);
711 error
= kauth_authorize_system(l
->l_cred
, KAUTH_SYSTEM_MOUNT
,
712 KAUTH_REQ_SYSTEM_MOUNT_DEVICE
, mp
, devvp
,
713 KAUTH_ARG(accessmode
));
/* Choose the open flags for the device; the read-only branch body is not visible. */
725 if (mp
->mnt_flag
& MNT_RDONLY
)
728 flags
= FREAD
|FWRITE
;
729 vn_lock(devvp
, LK_EXCLUSIVE
| LK_RETRY
);
730 error
= VOP_OPEN(devvp
, flags
, FSCRED
);
734 error
= lfs_mountfs(devvp
, mp
, l
); /* LFS */
/* Close the device again (presumably only when lfs_mountfs() failed -- guard not visible). */
736 vn_lock(devvp
, LK_EXCLUSIVE
| LK_RETRY
);
737 (void)VOP_CLOSE(devvp
, flags
, NOCRED
);
750 * The initial mount got a reference on this
751 * device, so drop the one obtained via
759 if (fs
->lfs_ronly
== 0 && (mp
->mnt_flag
& MNT_RDONLY
)) {
761 * Changing from read/write to read-only.
762 * XXX: shouldn't we sync here? or does vfs do that?
765 /* XXX: quotas should remain on when readonly */
766 if (fs
->lfs_use_quota2
) {
767 error
= lfsquota2_umount(mp
, 0);
775 if (fs
->lfs_ronly
&& (mp
->mnt_iflag
& IMNT_WANTRDWR
)) {
777 * Changing from read-only to read/write.
778 * Note in the superblocks that we're writing.
781 /* XXX: quotas should have been on even if readonly */
782 if (fs
->lfs_use_quota2
) {
784 error
= lfs_quota2_mount(mp
);
786 uprintf("%s: no kernel support for this "
787 "filesystem's quotas\n",
788 mp
->mnt_stat
.f_mntonname
);
789 if (mp
->mnt_flag
& MNT_FORCE
) {
790 uprintf("%s: mounting anyway; "
792 mp
->mnt_stat
.f_mntonname
);
/* Clear the "clean" flag and write both superblock copies. */
803 if (lfs_sb_getpflags(fs
) & LFS_PF_CLEAN
) {
804 lfs_sb_setpflags(fs
, lfs_sb_getpflags(fs
) & ~LFS_PF_CLEAN
);
805 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 0));
806 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 1));
809 if (args
->fspec
== NULL
)
/* Record mount-point and device names in mp->mnt_stat and the superblock. */
813 error
= set_statvfs_info(path
, UIO_USERSPACE
, args
->fspec
,
814 UIO_USERSPACE
, mp
->mnt_op
->vfs_name
, mp
, l
);
816 lfs_sb_setfsmnt(fs
, mp
->mnt_stat
.f_mntonname
);
826 * Common code for mount and mountroot
/*
 * lfs_mountfs: mount work shared by lfs_mount() and the root-mount code,
 * both of which call it in this file.
 *
 * devvp: vnode of the device holding the file system.
 * mp:    mount structure to fill in.
 * l:     calling lwp; its credentials are used if it is non-NULL,
 *        otherwise NOCRED.
 *
 * Outline, as far as the visible lines show:
 *   1. Invalidate stale buffers on devvp.
 *   2. Read the primary superblock at LFS_LABELPAD / DEV_BSIZE and check
 *      magic, block size, version and inode format; if dlfs_sboffs[0]
 *      points elsewhere, retry from that address.
 *   3. Read the second superblock, compare the two by timestamp
 *      (version 1) or serial number, and sanity-check the chosen one
 *      (tdfs).
 *   4. Allocate struct lfs and struct ulfsmount, copy the superblock in,
 *      and fill in the in-core, ulfs-level and mount fields.
 *   5. Get the ifile vnode, order the free list and build the
 *      segment-usage flag arrays, normalising each segment's flags.
 *   6. Mount quotas if enabled, set up extended attributes, and roll
 *      forward under LFS_KERNEL_RFW.
 *   7. If writable, clear LFS_PF_CLEAN and write both superblocks.
 *   8. Update the cleaner info, mark the current segment dirty/active,
 *      and start the lfs_writer thread if none is recorded.
 *
 * The last visible lines free the lfs and ulfsmount structures and clear
 * mp->mnt_data, presumably as the error exit.  Labels, gotos and return
 * statements are not visible in this excerpt.
 */
830 lfs_mountfs(struct vnode
*devvp
, struct mount
*mp
, struct lwp
*l
)
832 struct dlfs
*tdfs
, *dfs
, *adfs
;
834 struct ulfsmount
*ump
;
836 struct buf
*bp
, *abp
;
838 int error
, i
, ronly
, fsbsize
;
/* l may be NULL here; see the review note at the lfs_rfpid assignment below. */
844 cred
= l
? l
->l_cred
: NOCRED
;
846 /* The superblock is supposed to be 512 bytes. */
847 __CTASSERT(sizeof(struct dlfs
) == DEV_BSIZE
);
850 * Flush out any old buffers remaining from a previous use.
852 vn_lock(devvp
, LK_EXCLUSIVE
| LK_RETRY
);
853 error
= vinvalbuf(devvp
, V_SAVE
, cred
, l
, 0, 0);
858 ronly
= (mp
->mnt_flag
& MNT_RDONLY
) != 0;
860 /* Don't free random space on error. */
/* First guess for the superblock location, in DEV_BSIZE units. */
865 sb_addr
= LFS_LABELPAD
/ DEV_BSIZE
;
867 /* Read in the superblock. */
868 error
= bread(devvp
, sb_addr
, LFS_SBPAD
, 0, &bp
);
871 dfs
= (struct dlfs
*)bp
->b_data
;
873 /* Check the basics. */
874 if (dfs
->dlfs_magic
!= LFS_MAGIC
|| dfs
->dlfs_bsize
> MAXBSIZE
||
875 dfs
->dlfs_version
> LFS_VERSION
||
876 dfs
->dlfs_bsize
< sizeof(struct dlfs
)) {
877 DLOG((DLOG_MOUNT
, "lfs_mountfs: primary superblock sanity failed\n"));
878 error
= EINVAL
; /* XXX needs translation */
881 if (dfs
->dlfs_inodefmt
> LFS_MAXINODEFMT
) {
882 DLOG((DLOG_MOUNT
, "lfs_mountfs: unknown inode format %d\n",
883 dfs
->dlfs_inodefmt
));
/* fsbsize is set differently for version 1; that assignment is not visible here. */
888 if (dfs
->dlfs_version
== 1)
891 fsbsize
= 1 << dfs
->dlfs_ffshift
;
893 * Could be, if the frag size is large enough, that we
894 * don't have the "real" primary superblock. If that's
895 * the case, get the real one, and try again.
897 if (sb_addr
!= (dfs
->dlfs_sboffs
[0] << (dfs
->dlfs_ffshift
- DEV_BSHIFT
))) {
898 DLOG((DLOG_MOUNT
, "lfs_mountfs: sb daddr"
899 " 0x%llx is not right, trying 0x%llx\n",
901 (long long)(dfs
->dlfs_sboffs
[0] << (dfs
->dlfs_ffshift
- DEV_BSHIFT
))));
902 sb_addr
= dfs
->dlfs_sboffs
[0] << (dfs
->dlfs_ffshift
- DEV_BSHIFT
);
911 * Check the second superblock to see which is newer; then mount
912 * using the older of the two. This is necessary to ensure that
913 * the filesystem is valid if it was not unmounted cleanly.
916 if (dfs
->dlfs_sboffs
[1] &&
917 dfs
->dlfs_sboffs
[1] - LFS_LABELPAD
/ fsbsize
> LFS_SBPAD
/ fsbsize
)
919 error
= bread(devvp
, dfs
->dlfs_sboffs
[1] * (fsbsize
/ DEV_BSIZE
),
923 adfs
= (struct dlfs
*)abp
->b_data
;
/* The assignments to tdfs that follow each comparison are not visible here. */
925 if (dfs
->dlfs_version
== 1) {
926 /* 1s resolution comparison */
927 if (adfs
->dlfs_tstamp
< dfs
->dlfs_tstamp
)
932 /* monotonic infinite-resolution comparison */
933 if (adfs
->dlfs_serial
< dfs
->dlfs_serial
)
939 /* Check the basics. */
940 if (tdfs
->dlfs_magic
!= LFS_MAGIC
||
941 tdfs
->dlfs_bsize
> MAXBSIZE
||
942 tdfs
->dlfs_version
> LFS_VERSION
||
943 tdfs
->dlfs_bsize
< sizeof(struct dlfs
)) {
944 DLOG((DLOG_MOUNT
, "lfs_mountfs: alt superblock"
945 " sanity failed\n"));
946 error
= EINVAL
; /* XXX needs translation */
950 DLOG((DLOG_MOUNT
, "lfs_mountfs: invalid alt superblock"
951 " daddr=0x%x\n", dfs
->dlfs_sboffs
[1]));
956 /* Allocate the mount structure, copy the superblock into it. */
957 fs
= kmem_zalloc(sizeof(struct lfs
), KM_SLEEP
);
958 memcpy(&fs
->lfs_dlfs_u
.u_32
, tdfs
, sizeof(struct dlfs
));
959 fs
->lfs_is64
= false; /* XXX notyet */
960 fs
->lfs_dobyteswap
= false; /* XXX notyet */
961 fs
->lfs_hasolddirfmt
= false; /* set for real below */
/* Version 1 superblocks: fill in the fields with fixed or derived values. */
964 if (lfs_sb_getversion(fs
) < 2) {
965 lfs_sb_setsumsize(fs
, LFS_V1_SUMMARY_SIZE
);
966 lfs_sb_setibsize(fs
, lfs_sb_getbsize(fs
));
967 lfs_sb_sets0addr(fs
, lfs_sb_getsboff(fs
, 0));
968 lfs_sb_settstamp(fs
, lfs_sb_getotstamp(fs
));
969 lfs_sb_setfsbtodb(fs
, 0);
/*
 * No reserved-segment count on disk: derive one from minfreeseg, at
 * least MIN_RESV_SEGS and at most minfreeseg - 1.
 */
971 if (lfs_sb_getresvseg(fs
) == 0)
972 lfs_sb_setresvseg(fs
, MIN(lfs_sb_getminfreeseg(fs
) - 1, \
973 MAX(MIN_RESV_SEGS
, lfs_sb_getminfreeseg(fs
) / 2 + 1)));
976 * If we aren't going to be able to write meaningfully to this
977 * filesystem, and were not mounted readonly, bomb out now.
979 if (lfs_fsbtob(fs
, LFS_NRESERVE(fs
)) > LFS_MAX_BYTES
&& !ronly
) {
980 DLOG((DLOG_MOUNT
, "lfs_mount: to mount this filesystem read/write,"
981 " we need BUFPAGES >= %lld\n",
982 (long long)((bufmem_hiwater
/ bufmem_lowater
) *
983 LFS_INVERSE_MAX_BYTES(
984 lfs_fsbtob(fs
, LFS_NRESERVE(fs
))) >> PAGE_SHIFT
)));
985 kmem_free(fs
, sizeof(struct lfs
));
986 error
= EFBIG
; /* XXX needs translation */
990 /* Before rolling forward, lock so vget will sleep for other procs */
992 fs
->lfs_flags
= LFS_NOTYET
;
/*
 * NOTE(review): l is dereferenced unconditionally here, although the
 * cred assignment at the top of this function allows l to be NULL.
 * Both callers visible in this file pass curlwp -- confirm that no
 * caller passes NULL.
 */
993 fs
->lfs_rfpid
= l
->l_proc
->p_pid
;
996 ump
= kmem_zalloc(sizeof(*ump
), KM_SLEEP
);
998 ump
->um_fstype
= ULFS1
;
999 /* ump->um_cleaner_thread = NULL; */
/* Release both superblock buffers, invalidating them. */
1000 if (sizeof(struct lfs
) < LFS_SBPAD
) { /* XXX why? */
1001 brelse(bp
, BC_INVAL
);
1002 brelse(abp
, BC_INVAL
);
1011 /* Set up the I/O information */
1012 fs
->lfs_devbsize
= DEV_BSIZE
;
1013 fs
->lfs_iocount
= 0;
1014 fs
->lfs_diropwait
= 0;
1015 fs
->lfs_activesb
= 0;
1016 lfs_sb_setuinodes(fs
, 0);
1019 fs
->lfs_sbactive
= 0;
1021 /* Set up the ifile and lock aflags */
1022 fs
->lfs_doifile
= 0;
1025 fs
->lfs_nadirop
= 0;
1026 fs
->lfs_seglock
= 0;
1027 fs
->lfs_pdflush
= 0;
1028 fs
->lfs_sleepers
= 0;
1030 rw_init(&fs
->lfs_fraglock
);
1031 rw_init(&fs
->lfs_iflock
);
1032 cv_init(&fs
->lfs_stopcv
, "lfsstop");
1034 /* Set the file system readonly/modify bits. */
1035 fs
->lfs_ronly
= ronly
;
1039 /* ulfs-level information */
1041 fs
->um_bptrtodb
= lfs_sb_getffshift(fs
) - DEV_BSHIFT
;
1042 fs
->um_seqinc
= lfs_sb_getfrag(fs
);
1043 fs
->um_nindir
= lfs_sb_getnindir(fs
);
/* ffs(x) - 1 is log2 of nindir when nindir is a power of two. */
1044 fs
->um_lognindir
= ffs(lfs_sb_getnindir(fs
)) - 1;
1045 fs
->um_maxsymlinklen
= lfs_sb_getmaxsymlinklen(fs
);
1046 fs
->um_dirblksiz
= LFS_DIRBLKSIZ
;
1047 fs
->um_maxfilesize
= lfs_sb_getmaxfilesize(fs
);
1050 /* XXX: these need to come from the on-disk superblock to be used */
1051 fs
->lfs_use_quota2
= 0;
1052 fs
->lfs_quota_magic
= 0;
1053 fs
->lfs_quota_flags
= 0;
1054 fs
->lfs_quotaino
[0] = 0;
1055 fs
->lfs_quotaino
[1] = 0;
1057 /* Initialize the mount structure. */
1058 dev
= devvp
->v_rdev
;
/* File system id: device number plus a value derived from the fs type name. */
1060 mp
->mnt_stat
.f_fsidx
.__fsid_val
[0] = (long)dev
;
1061 mp
->mnt_stat
.f_fsidx
.__fsid_val
[1] = makefstype(MOUNT_LFS
);
1062 mp
->mnt_stat
.f_fsid
= mp
->mnt_stat
.f_fsidx
.__fsid_val
[0];
1063 mp
->mnt_stat
.f_namemax
= LFS_MAXNAMLEN
;
1064 mp
->mnt_stat
.f_iosize
= lfs_sb_getbsize(fs
);
1065 mp
->mnt_flag
|= MNT_LOCAL
;
1066 mp
->mnt_fs_bshift
= lfs_sb_getbshift(fs
);
/* maxsymlinklen > 0 sets IMNT_DTYPE; the old-dir-format assignment below is presumably the else branch (not visible). */
1067 if (fs
->um_maxsymlinklen
> 0)
1068 mp
->mnt_iflag
|= IMNT_DTYPE
;
1070 fs
->lfs_hasolddirfmt
= true;
1072 ump
->um_mountp
= mp
;
1074 ump
->um_devvp
= devvp
;
1075 for (i
= 0; i
< ULFS_MAXQUOTAS
; i
++)
1076 ump
->um_quotas
[i
] = NULLVP
;
1077 spec_node_setmountedfs(devvp
, mp
);
1079 /* Set up reserved memory for pageout */
1080 lfs_setup_resblks(fs
);
1081 /* Set up vdirop tailq */
1082 TAILQ_INIT(&fs
->lfs_dchainhd
);
1083 /* and paging tailq */
1084 TAILQ_INIT(&fs
->lfs_pchainhd
);
1085 /* and delayed segment accounting for truncation list */
1086 LIST_INIT(&fs
->lfs_segdhd
);
1089 * We use the ifile vnode for almost every operation. Instead of
1090 * retrieving it from the hash table each time we retrieve it here,
1091 * artificially increment the reference count and keep a pointer
1092 * to it in the incore copy of the superblock.
1094 if ((error
= VFS_VGET(mp
, LFS_IFILE_INUM
, &vp
)) != 0) {
1095 DLOG((DLOG_MOUNT
, "lfs_mountfs: ifile vget failed, error=%d\n", error
));
1098 fs
->lfs_ivnode
= vp
;
1101 /* Set up inode bitmap and order free list */
1102 lfs_order_freelist(fs
);
1104 /* Set up segment usage flags for the autocleaner. */
1105 fs
->lfs_nactive
= 0;
/* Two per-segment flag arrays; only array [1] is zero-filled here, array [0] is filled in the loop below. */
1106 fs
->lfs_suflags
= malloc(2 * sizeof(u_int32_t
*),
1107 M_SEGMENT
, M_WAITOK
);
1108 fs
->lfs_suflags
[0] = malloc(lfs_sb_getnseg(fs
) * sizeof(u_int32_t
),
1109 M_SEGMENT
, M_WAITOK
);
1110 fs
->lfs_suflags
[1] = malloc(lfs_sb_getnseg(fs
) * sizeof(u_int32_t
),
1111 M_SEGMENT
, M_WAITOK
);
1112 memset(fs
->lfs_suflags
[1], 0, lfs_sb_getnseg(fs
) * sizeof(u_int32_t
));
/*
 * For every segment: make SEGUSE_EMPTY agree with su_nbytes, clear
 * SEGUSE_ACTIVE and SEGUSE_INVAL, and copy the flags into
 * lfs_suflags[0].
 */
1113 for (i
= 0; i
< lfs_sb_getnseg(fs
); i
++) {
1116 LFS_SEGENTRY(sup
, fs
, i
, bp
);
1119 if (sup
->su_nbytes
== 0 &&
1120 !(sup
->su_flags
& SEGUSE_EMPTY
)) {
1121 sup
->su_flags
|= SEGUSE_EMPTY
;
1123 } else if (!(sup
->su_nbytes
== 0) &&
1124 (sup
->su_flags
& SEGUSE_EMPTY
)) {
1125 sup
->su_flags
&= ~SEGUSE_EMPTY
;
1128 if (sup
->su_flags
& (SEGUSE_ACTIVE
|SEGUSE_INVAL
)) {
1129 sup
->su_flags
&= ~(SEGUSE_ACTIVE
|SEGUSE_INVAL
);
1133 fs
->lfs_suflags
[0][i
] = sup
->su_flags
;
1135 LFS_WRITESEGENTRY(sup
, fs
, i
, bp
);
1141 * XXX: if the fs has quotas, quotas should be on even if
1142 * readonly. Otherwise you can't query the quota info!
1143 * However, that's not how the quota2 code got written and I
1144 * don't know if it'll behave itself if enabled while
1145 * readonly, so for now use the same enable logic as ffs.
1147 * XXX: also, if you use the -f behavior allowed here (and
1148 * equivalently above for remount) it will corrupt the fs. It
1149 * ought not to allow that. It should allow mounting readonly
1150 * if there are quotas and the kernel doesn't have the quota
1151 * code, but only readonly.
1153 * XXX: and if you use the -f behavior allowed here it will
1154 * likely crash at unmount time (or remount time) because we
1155 * think quotas are active.
1157 * Although none of this applies until there's a way to set
1158 * lfs_use_quota2 and have quotas in the fs at all.
1160 if (!ronly
&& fs
->lfs_use_quota2
) {
1162 error
= lfs_quota2_mount(mp
);
1164 uprintf("%s: no kernel support for this filesystem's quotas\n",
1165 mp
->mnt_stat
.f_mntonname
);
1166 if (mp
->mnt_flag
& MNT_FORCE
) {
1167 uprintf("%s: mounting anyway; fsck afterwards\n",
1168 mp
->mnt_stat
.f_mntonname
);
1174 /* XXX XXX must clean up the stuff immediately above */
1175 printf("lfs_mountfs: sorry, leaking some memory\n");
1182 * Initialize file-backed extended attributes for ULFS1 file
1185 * XXX: why is this limited to ULFS1?
1187 if (ump
->um_fstype
== ULFS1
) {
1188 ulfs_extattr_uepm_init(&ump
->um_extattr
);
1192 #ifdef LFS_KERNEL_RFW
1193 lfs_roll_forward(fs
, mp
, l
);
1196 /* If writing, sb is not clean; record in case of immediate crash */
1197 if (!fs
->lfs_ronly
) {
1198 lfs_sb_setpflags(fs
, lfs_sb_getpflags(fs
) & ~LFS_PF_CLEAN
);
1199 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 0));
1200 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 1));
1203 /* Allow vget now that roll-forward is complete */
1204 fs
->lfs_flags
&= ~(LFS_NOTYET
);
1205 wakeup(&fs
->lfs_flags
);
1208 * Initialize the ifile cleaner info with information from
1211 LFS_CLEANERINFO(cip
, fs
, bp
);
1212 lfs_ci_setclean(fs
, cip
, lfs_sb_getnclean(fs
));
/* Dirty segments = total segments minus clean segments. */
1213 lfs_ci_setdirty(fs
, cip
, lfs_sb_getnseg(fs
) - lfs_sb_getnclean(fs
));
1214 lfs_ci_setavail(fs
, cip
, lfs_sb_getavail(fs
));
1215 lfs_ci_setbfree(fs
, cip
, lfs_sb_getbfree(fs
));
1216 (void) LFS_BWRITE_LOG(bp
); /* Ifile */
1219 * Mark the current segment as ACTIVE, since we're going to
1222 LFS_SEGENTRY(sup
, fs
, lfs_dtosn(fs
, lfs_sb_getoffset(fs
)), bp
);
1223 sup
->su_flags
|= SEGUSE_DIRTY
| SEGUSE_ACTIVE
;
1225 LFS_WRITESEGENTRY(sup
, fs
, lfs_dtosn(fs
, lfs_sb_getoffset(fs
)), bp
); /* Ifile */
1227 /* Now that roll-forward is done, unlock the Ifile */
1230 /* Start the pagedaemon-anticipating daemon */
/*
 * A writer thread is created only when neither lfs_writer_daemon nor
 * lfs_writer_lid is set; failure to create it panics.
 */
1231 mutex_enter(&lfs_lock
);
1232 if (lfs_writer_daemon
== 0 && lfs_writer_lid
== 0 &&
1233 kthread_create(PRI_BIO
, 0, NULL
,
1234 lfs_writerd
, NULL
, NULL
, "lfs_writer") != 0)
1235 panic("fork lfs_writer");
1236 mutex_exit(&lfs_lock
);
1238 printf("WARNING: the log-structured file system is experimental\n"
1239 "WARNING: it may cause system crashes and/or corrupt data\n");
/* Cleanup: free the lfs and ulfsmount structures and detach them from mp (presumably the error exit -- label not visible). */
1249 kmem_free(ump
->um_lfs
, sizeof(struct lfs
));
1250 kmem_free(ump
, sizeof(*ump
));
1251 mp
->mnt_data
= NULL
;
1258 * unmount system call
1261 lfs_unmount(struct mount
*mp
, int mntflags
)
1263 struct lwp
*l
= curlwp
;
1264 struct ulfsmount
*ump
;
1266 int error
, flags
, ronly
;
1270 if (mntflags
& MNT_FORCE
)
1271 flags
|= FORCECLOSE
;
1273 ump
= VFSTOULFS(mp
);
1276 /* Two checkpoints */
1277 lfs_segwrite(mp
, SEGM_CKP
| SEGM_SYNC
);
1278 lfs_segwrite(mp
, SEGM_CKP
| SEGM_SYNC
);
1280 /* wake up the cleaner so it can die */
1281 /* XXX: shouldn't this be *after* the error cases below? */
1282 lfs_wakeup_cleaner(fs
);
1283 mutex_enter(&lfs_lock
);
1284 while (fs
->lfs_sleepers
)
1285 mtsleep(&fs
->lfs_sleepers
, PRIBIO
+ 1, "lfs_sleepers", 0,
1287 mutex_exit(&lfs_lock
);
1290 if (ump
->um_fstype
== ULFS1
) {
1291 if (ump
->um_extattr
.uepm_flags
& ULFS_EXTATTR_UEPM_STARTED
) {
1292 ulfs_extattr_stop(mp
, curlwp
);
1294 if (ump
->um_extattr
.uepm_flags
& ULFS_EXTATTR_UEPM_INITIALIZED
) {
1295 ulfs_extattr_uepm_destroy(&ump
->um_extattr
);
1300 if ((error
= lfsquota1_umount(mp
, flags
)) != 0)
1304 if ((error
= lfsquota2_umount(mp
, flags
)) != 0)
1307 if ((error
= vflush(mp
, fs
->lfs_ivnode
, flags
)) != 0)
1309 if ((error
= VFS_SYNC(mp
, 1, l
->l_cred
)) != 0)
1311 vp
= fs
->lfs_ivnode
;
1312 mutex_enter(vp
->v_interlock
);
1313 if (LIST_FIRST(&vp
->v_dirtyblkhd
))
1314 panic("lfs_unmount: still dirty blocks on ifile vnode");
1315 mutex_exit(vp
->v_interlock
);
1317 /* Explicitly write the superblock, to update serial and pflags */
1318 lfs_sb_setpflags(fs
, lfs_sb_getpflags(fs
) | LFS_PF_CLEAN
);
1319 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 0));
1320 lfs_writesuper(fs
, lfs_sb_getsboff(fs
, 1));
1321 mutex_enter(&lfs_lock
);
1322 while (fs
->lfs_iocount
)
1323 mtsleep(&fs
->lfs_iocount
, PRIBIO
+ 1, "lfs_umount", 0,
1325 mutex_exit(&lfs_lock
);
1327 /* Finish with the Ifile, now that we're done with it */
1328 vgone(fs
->lfs_ivnode
);
1330 ronly
= !fs
->lfs_ronly
;
1331 if (ump
->um_devvp
->v_type
!= VBAD
)
1332 spec_node_setmountedfs(ump
->um_devvp
, NULL
);
1333 vn_lock(ump
->um_devvp
, LK_EXCLUSIVE
| LK_RETRY
);
1334 error
= VOP_CLOSE(ump
->um_devvp
,
1335 ronly
? FREAD
: FREAD
|FWRITE
, NOCRED
);
1336 vput(ump
->um_devvp
);
1338 /* Complain about page leakage */
1339 if (fs
->lfs_pages
> 0)
1340 printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
1341 fs
->lfs_pages
, lfs_subsys_pages
);
1343 /* Free per-mount data structures */
1344 free(fs
->lfs_ino_bitmap
, M_SEGMENT
);
1345 free(fs
->lfs_suflags
[0], M_SEGMENT
);
1346 free(fs
->lfs_suflags
[1], M_SEGMENT
);
1347 free(fs
->lfs_suflags
, M_SEGMENT
);
1348 lfs_free_resblks(fs
);
1349 cv_destroy(&fs
->lfs_stopcv
);
1350 rw_destroy(&fs
->lfs_fraglock
);
1351 rw_destroy(&fs
->lfs_iflock
);
1353 kmem_free(fs
, sizeof(struct lfs
));
1354 kmem_free(ump
, sizeof(*ump
));
1356 mp
->mnt_data
= NULL
;
1357 mp
->mnt_flag
&= ~MNT_LOCAL
;
1362 * Get file system statistics.
1364 * NB: We don't lock to access the superblock here, because it's not
1365 * really that important if we get it wrong.
1368 lfs_statvfs(struct mount
*mp
, struct statvfs
*sbp
)
1371 struct ulfsmount
*ump
;
1373 ump
= VFSTOULFS(mp
);
1376 sbp
->f_bsize
= lfs_sb_getbsize(fs
);
1377 sbp
->f_frsize
= lfs_sb_getfsize(fs
);
1378 sbp
->f_iosize
= lfs_sb_getbsize(fs
);
1379 sbp
->f_blocks
= LFS_EST_NONMETA(fs
) - VTOI(fs
->lfs_ivnode
)->i_lfs_effnblks
;
1381 sbp
->f_bfree
= LFS_EST_BFREE(fs
);
1383 * XXX this should be lfs_sb_getsize (measured in frags)
1384 * rather than dsize (measured in diskblocks). However,
1385 * getsize needs a format version check (for version 1 it
1386 * needs to be blockstofrags'd) so for the moment I'm going to
1387 * leave this... it won't fire wrongly as frags are at least
1388 * as big as diskblocks.
1390 KASSERT(sbp
->f_bfree
<= lfs_sb_getdsize(fs
));
1392 if (sbp
->f_bfree
< 0)
1396 sbp
->f_bresvd
= LFS_EST_RSVD(fs
);
1397 if (sbp
->f_bfree
> sbp
->f_bresvd
)
1398 sbp
->f_bavail
= sbp
->f_bfree
- sbp
->f_bresvd
;
1402 /* XXX: huh? - dholland 20150728 */
1403 sbp
->f_files
= lfs_sb_getbfree(fs
) / lfs_btofsb(fs
, lfs_sb_getibsize(fs
))
1405 sbp
->f_ffree
= sbp
->f_files
- lfs_sb_getnfiles(fs
);
1406 sbp
->f_favail
= sbp
->f_ffree
;
1408 copy_statvfs_info(sbp
, mp
);
1413 * Go through the disk queues to initiate sandbagged IO;
1414 * go through the inodes to write those that have been modified;
1415 * initiate the writing of the super block if it has been modified.
1417 * Note: we are always called with the filesystem marked `MPBUSY'.
1420 lfs_sync(struct mount
*mp
, int waitfor
, kauth_cred_t cred
)
1425 fs
= VFSTOULFS(mp
)->um_lfs
;
1429 /* Snapshots should not hose the syncer */
1431 * XXX Sync can block here anyway, since we don't have a very
1432 * XXX good idea of how much data is pending. If it's more
1433 * XXX than a segment and lfs_nextseg is close to the end of
1434 * XXX the log, we'll likely block.
1436 mutex_enter(&lfs_lock
);
1437 if (fs
->lfs_nowrap
&& lfs_sb_getnextseg(fs
) < lfs_sb_getcurseg(fs
)) {
1438 mutex_exit(&lfs_lock
);
1441 mutex_exit(&lfs_lock
);
1443 lfs_writer_enter(fs
, "lfs_dirops");
1445 /* All syncs must be checkpoints until roll-forward is implemented. */
1446 DLOG((DLOG_FLUSH
, "lfs_sync at 0x%jx\n",
1447 (uintmax_t)lfs_sb_getoffset(fs
)));
1448 error
= lfs_segwrite(mp
, SEGM_CKP
| (waitfor
? SEGM_SYNC
: 0));
1449 lfs_writer_leave(fs
);
1457 * Look up an LFS dinode number to find its incore vnode. If not already
1458 * in core, read it in from the specified device. Return the inode locked.
1459 * Detection and handling of mount points must be done by the calling routine.
1462 lfs_vget(struct mount
*mp
, ino_t ino
, struct vnode
**vpp
)
1466 error
= vcache_get(mp
, &ino
, sizeof(ino
), vpp
);
1469 error
= vn_lock(*vpp
, LK_EXCLUSIVE
);
1480 * Create a new vnode/inode pair and initialize what fields we can.
1483 lfs_init_vnode(struct ulfsmount
*ump
, ino_t ino
, struct vnode
*vp
)
1485 struct lfs
*fs
= ump
->um_lfs
;
1487 union lfs_dinode
*dp
;
1489 ASSERT_NO_SEGLOCK(ump
->um_lfs
);
1491 /* Initialize the inode. */
1492 ip
= pool_get(&lfs_inode_pool
, PR_WAITOK
);
1493 memset(ip
, 0, sizeof(*ip
));
1494 dp
= pool_get(&lfs_dinode_pool
, PR_WAITOK
);
1495 memset(dp
, 0, sizeof(*dp
));
1496 ip
->inode_ext
.lfs
= pool_get(&lfs_inoext_pool
, PR_WAITOK
);
1497 memset(ip
->inode_ext
.lfs
, 0, sizeof(*ip
->inode_ext
.lfs
));
1501 ip
->i_dev
= ump
->um_dev
;
1502 lfs_dino_setinumber(fs
, dp
, ino
);
1504 ip
->i_lfs
= ump
->um_lfs
;
1505 ip
->i_lfs_effnblks
= 0;
1506 SPLAY_INIT(&ip
->i_lfs_lbtree
);
1507 ip
->i_lfs_nbtree
= 0;
1508 LIST_INIT(&ip
->i_lfs_segdhd
);
1511 vp
->v_op
= lfs_vnodeop_p
;
1516 * Undo lfs_init_vnode().
1519 lfs_deinit_vnode(struct ulfsmount
*ump
, struct vnode
*vp
)
1521 struct inode
*ip
= VTOI(vp
);
1523 pool_put(&lfs_inoext_pool
, ip
->inode_ext
.lfs
);
1524 pool_put(&lfs_dinode_pool
, ip
->i_din
);
1525 pool_put(&lfs_inode_pool
, ip
);
1530 * Read an inode from disk and initialize this vnode / inode pair.
1531 * Caller assures no other thread will try to load this inode.
1534 lfs_loadvnode(struct mount
*mp
, struct vnode
*vp
,
1535 const void *key
, size_t key_len
, const void **new_key
)
1538 union lfs_dinode
*dip
;
1542 struct ulfsmount
*ump
;
1548 KASSERT(key_len
== sizeof(ino
));
1549 memcpy(&ino
, key
, key_len
);
1551 memset(&ts
, 0, sizeof ts
); /* XXX gcc */
1553 ump
= VFSTOULFS(mp
);
1557 * If the filesystem is not completely mounted yet, suspend
1558 * any access requests (wait for roll-forward to complete).
1560 mutex_enter(&lfs_lock
);
1561 while ((fs
->lfs_flags
& LFS_NOTYET
) && curproc
->p_pid
!= fs
->lfs_rfpid
)
1562 mtsleep(&fs
->lfs_flags
, PRIBIO
+1, "lfs_notyet", 0,
1564 mutex_exit(&lfs_lock
);
1566 /* Translate the inode number to a disk address. */
1567 if (ino
== LFS_IFILE_INUM
)
1568 daddr
= lfs_sb_getidaddr(fs
);
1570 /* XXX bounds-check this too */
1571 LFS_IENTRY(ifp
, fs
, ino
, bp
);
1572 daddr
= lfs_if_getdaddr(fs
, ifp
);
1573 if (lfs_sb_getversion(fs
) > 1) {
1574 ts
.tv_sec
= lfs_if_getatime_sec(fs
, ifp
);
1575 ts
.tv_nsec
= lfs_if_getatime_nsec(fs
, ifp
);
1579 if (daddr
== LFS_UNUSED_DADDR
)
1583 /* Allocate/init new vnode/inode. */
1584 lfs_init_vnode(ump
, ino
, vp
);
1587 /* If the cleaner supplied the inode, use it. */
1588 if (curlwp
== ump
->um_cleaner_thread
&& ump
->um_cleaner_hint
!= NULL
&&
1589 ump
->um_cleaner_hint
->bi_lbn
== LFS_UNUSED_LBN
) {
1590 dip
= ump
->um_cleaner_hint
->bi_bp
;
1592 error
= copyin(dip
, &ip
->i_din
->u_64
,
1593 sizeof(struct lfs64_dinode
));
1595 error
= copyin(dip
, &ip
->i_din
->u_32
,
1596 sizeof(struct lfs32_dinode
));
1599 lfs_deinit_vnode(ump
, vp
);
1602 KASSERT(ip
->i_number
== ino
);
1606 /* Read in the disk contents for the inode, copy into the inode. */
1609 error
= bread(ump
->um_devvp
, LFS_FSBTODB(fs
, daddr
),
1610 (lfs_sb_getversion(fs
) == 1 ? lfs_sb_getbsize(fs
) : lfs_sb_getibsize(fs
)),
1613 lfs_deinit_vnode(ump
, vp
);
1617 dip
= lfs_ifind(fs
, ino
, bp
);
1619 /* Assume write has not completed yet; try again */
1620 brelse(bp
, BC_INVAL
);
1622 if (retries
<= LFS_IFIND_RETRIES
) {
1623 mutex_enter(&lfs_lock
);
1624 if (fs
->lfs_iocount
) {
1626 "%s: dinode %d not found, retrying...\n",
1628 (void)mtsleep(&fs
->lfs_iocount
, PRIBIO
+ 1,
1629 "lfs ifind", 1, &lfs_lock
);
1631 retries
= LFS_IFIND_RETRIES
;
1632 mutex_exit(&lfs_lock
);
1636 /* If the seglock is held look at the bpp to see
1637 what is there anyway */
1638 mutex_enter(&lfs_lock
);
1639 if (fs
->lfs_seglock
> 0) {
1641 union lfs_dinode
*dp
;
1644 for (bpp
= fs
->lfs_sp
->bpp
;
1645 bpp
!= fs
->lfs_sp
->cbpp
; ++bpp
) {
1646 if ((*bpp
)->b_vp
== fs
->lfs_ivnode
&&
1647 bpp
!= fs
->lfs_sp
->bpp
) {
1649 printf("%s: block 0x%" PRIx64
": ",
1650 __func__
, (*bpp
)->b_blkno
);
1651 for (i
= 0; i
< LFS_INOPB(fs
); i
++) {
1652 dp
= DINO_IN_BLOCK(fs
,
1654 if (lfs_dino_getinumber(fs
, dp
))
1656 (uintmax_t)lfs_dino_getinumber(fs
, dp
));
1662 mutex_exit(&lfs_lock
);
1664 panic("lfs_loadvnode: dinode not found");
1666 lfs_copy_dinode(fs
, ip
->i_din
, dip
);
1670 if (lfs_sb_getversion(fs
) > 1) {
1671 lfs_dino_setatime(fs
, ip
->i_din
, ts
.tv_sec
);
1672 lfs_dino_setatimensec(fs
, ip
->i_din
, ts
.tv_nsec
);
1677 *new_key
= &ip
->i_number
;
1682 * Create a new inode and initialize this vnode / inode pair.
1685 lfs_newvnode(struct mount
*mp
, struct vnode
*dvp
, struct vnode
*vp
,
1686 struct vattr
*vap
, kauth_cred_t cred
,
1687 size_t *key_len
, const void **new_key
)
1691 struct ulfsmount
*ump
;
1693 int error
, mode
, gen
;
1695 KASSERT(dvp
!= NULL
|| vap
->va_fileid
> 0);
1696 KASSERT(dvp
!= NULL
&& dvp
->v_mount
== mp
);
1697 KASSERT(vap
->va_type
!= VNON
);
1699 *key_len
= sizeof(ino
);
1700 ump
= VFSTOULFS(mp
);
1702 mode
= MAKEIMODE(vap
->va_type
, vap
->va_mode
);
1705 * Allocate fresh inode. With "dvp == NULL" take the inode number
1706 * and version from "vap".
1709 ino
= vap
->va_fileid
;
1711 error
= lfs_valloc_fixed(fs
, ino
, gen
);
1713 error
= lfs_valloc(dvp
, mode
, cred
, &ino
, &gen
);
1718 /* Attach inode to vnode. */
1719 lfs_init_vnode(ump
, ino
, vp
);
1722 mutex_enter(&lfs_lock
);
1723 LFS_SET_UINO(ip
, IN_CHANGE
);
1724 mutex_exit(&lfs_lock
);
1726 /* Note no blocks yet */
1727 ip
->i_lfs_hiblk
= -1;
1729 /* Set a new generation number for this inode. */
1731 lfs_dino_setgen(fs
, ip
->i_din
, gen
);
1733 memset(ip
->i_lfs_fragsize
, 0,
1734 ULFS_NDADDR
* sizeof(*ip
->i_lfs_fragsize
));
1736 /* Set uid / gid. */
1737 if (cred
== NOCRED
|| cred
== FSCRED
) {
1741 ip
->i_gid
= VTOI(dvp
)->i_gid
;
1742 ip
->i_uid
= kauth_cred_geteuid(cred
);
1744 DIP_ASSIGN(ip
, gid
, ip
->i_gid
);
1745 DIP_ASSIGN(ip
, uid
, ip
->i_uid
);
1747 #if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
1748 error
= lfs_chkiq(ip
, 1, cred
, 0);
1750 lfs_vfree(dvp
, ino
, mode
);
1751 lfs_deinit_vnode(ump
, vp
);
1757 /* Set type and finalize. */
1759 DIP_ASSIGN(ip
, flags
, 0);
1761 DIP_ASSIGN(ip
, mode
, mode
);
1762 if (vap
->va_rdev
!= VNOVAL
) {
1764 * Want to be able to use this to make badblock
1765 * inodes, so don't truncate the dev number.
1767 // XXX clean this up
1768 if (ump
->um_fstype
== ULFS1
)
1769 ip
->i_din
->u_32
.di_rdev
= ulfs_rw32(vap
->va_rdev
,
1770 ULFS_MPNEEDSWAP(fs
));
1772 ip
->i_din
->u_64
.di_rdev
= ulfs_rw64(vap
->va_rdev
,
1773 ULFS_MPNEEDSWAP(fs
));
1777 *new_key
= &ip
->i_number
;
1782 * File handle to vnode
1785 lfs_fhtovp(struct mount
*mp
, struct fid
*fhp
, struct vnode
**vpp
)
1790 if (fhp
->fid_len
!= sizeof(struct lfid
))
1793 memcpy(&lfh
, fhp
, sizeof(lfh
));
1794 if (lfh
.lfid_ino
< LFS_IFILE_INUM
)
1797 fs
= VFSTOULFS(mp
)->um_lfs
;
1798 if (lfh
.lfid_ident
!= lfs_sb_getident(fs
))
1802 ((lfs_dino_getsize(fs
, VTOI(fs
->lfs_ivnode
)->i_din
) >> lfs_sb_getbshift(fs
)) -
1803 lfs_sb_getcleansz(fs
) - lfs_sb_getsegtabsz(fs
)) * lfs_sb_getifpb(fs
))
1806 return (ulfs_fhtovp(mp
, &lfh
.lfid_ufid
, vpp
));
1810 * Vnode pointer to File handle
1814 lfs_vptofh(struct vnode
*vp
, struct fid
*fhp
, size_t *fh_size
)
1819 if (*fh_size
< sizeof(struct lfid
)) {
1820 *fh_size
= sizeof(struct lfid
);
1823 *fh_size
= sizeof(struct lfid
);
1825 memset(&lfh
, 0, sizeof(lfh
));
1826 lfh
.lfid_len
= sizeof(struct lfid
);
1827 lfh
.lfid_ino
= ip
->i_number
;
1828 lfh
.lfid_gen
= ip
->i_gen
;
1829 lfh
.lfid_ident
= lfs_sb_getident(ip
->i_lfs
);
1830 memcpy(fhp
, &lfh
, sizeof(lfh
));
1835 * ulfs_bmaparray callback function for writing.
1837 * Since blocks will be written to the new segment anyway,
1838 * we don't care about current daddr of them.
1841 lfs_issequential_hole(const struct lfs
*fs
,
1842 daddr_t daddr0
, daddr_t daddr1
)
1844 (void)fs
; /* not used */
1846 daddr0
= (daddr_t
)((int32_t)daddr0
); /* XXX ondisk32 */
1847 daddr1
= (daddr_t
)((int32_t)daddr1
); /* XXX ondisk32 */
1849 KASSERT(daddr0
== UNWRITTEN
||
1850 (0 <= daddr0
&& daddr0
<= LFS_MAX_DADDR(fs
)));
1851 KASSERT(daddr1
== UNWRITTEN
||
1852 (0 <= daddr1
&& daddr1
<= LFS_MAX_DADDR(fs
)));
1854 /* NOTE: all we want to know here is 'hole or not'. */
1855 /* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */
1858 * treat UNWRITTENs and all resident blocks as 'contiguous'
1860 if (daddr0
!= 0 && daddr1
!= 0)
1866 if (daddr0
== 0 && daddr1
== 0)
1867 return true; /* all holes are 'contiguous' for us. */
1873 * lfs_gop_write functions exactly like genfs_gop_write, except that
1874 * (1) it requires the seglock to be held by its caller, and sp->fip
1875 * to be properly initialized (it will return without re-initializing
1876 * sp->fip, and without calling lfs_writeseg).
1877 * (2) it uses the remaining space in the segment, rather than VOP_BMAP,
1878 * to determine how large a block it can write at once (though it does
1879 * still use VOP_BMAP to find holes in the file);
1880 * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
1881 * (leaving lfs_writeseg to deal with the cluster blocks, so we might
1882 * now have clusters of clusters, ick.)
1885 lfs_gop_write(struct vnode
*vp
, struct vm_page
**pgs
, int npages
,
1888 int i
, error
, run
, haveeof
= 0;
1891 off_t eof
, offset
, startoffset
= 0;
1892 size_t bytes
, iobytes
, skipbytes
;
1893 bool async
= (flags
& PGO_SYNCIO
) == 0;
1896 struct buf
*mbp
, *bp
;
1897 struct vnode
*devvp
= VTOI(vp
)->i_devvp
;
1898 struct inode
*ip
= VTOI(vp
);
1899 struct lfs
*fs
= ip
->i_lfs
;
1900 struct segment
*sp
= fs
->lfs_sp
;
1902 UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist
);
1903 const char * failreason
= NULL
;
1907 /* The Ifile lives in the buffer cache */
1908 KASSERT(vp
!= fs
->lfs_ivnode
);
1911 * We don't want to fill the disk before the cleaner has a chance
1912 * to make room for us. If we're in danger of doing that, fail
1913 * with EAGAIN. The caller will have to notice this, unlock
1914 * so the cleaner can run, relock and try again.
1916 * We must write everything, however, if our vnode is being
1919 mutex_enter(vp
->v_interlock
);
1920 if (LFS_STARVED_FOR_SEGS(fs
) && vdead_check(vp
, VDEAD_NOWAIT
) == 0) {
1921 mutex_exit(vp
->v_interlock
);
1922 failreason
= "Starved for segs and not flushing vp";
1925 mutex_exit(vp
->v_interlock
);
1928 * Sometimes things slip past the filters in lfs_putpages,
1929 * and the pagedaemon tries to write pages---problem is
1930 * that the pagedaemon never acquires the segment lock.
1932 * Alternatively, pages that were clean when we called
1933 * genfs_putpages may have become dirty in the meantime. In this
1934 * case the segment header is not properly set up for blocks
1935 * to be added to it.
1937 * Unbusy and unclean the pages, and put them on the ACTIVE
1938 * queue under the hypothesis that they couldn't have got here
1939 * unless they were modified *quite* recently.
1941 * XXXUBC that last statement is an oversimplification of course.
1943 if (!LFS_SEGLOCK_HELD(fs
)) {
1944 failreason
= "Seglock not held";
1947 if (ip
->i_lfs_iflags
& LFSI_NO_GOP_WRITE
) {
1948 failreason
= "Inode with no_gop_write";
1951 if ((pgs
[0]->offset
& lfs_sb_getbmask(fs
)) != 0) {
1952 failreason
= "Bad page offset";
1956 UVMHIST_LOG(ubchist
, "vp %p pgs %p npages %d flags 0x%x",
1957 vp
, pgs
, npages
, flags
);
1959 GOP_SIZE(vp
, vp
->v_size
, &eof
, 0);
1962 if (vp
->v_type
== VREG
)
1963 fs_bshift
= vp
->v_mount
->mnt_fs_bshift
;
1965 fs_bshift
= DEV_BSHIFT
;
1968 startoffset
= pg
->offset
;
1971 if (startoffset
>= eof
) {
1972 failreason
= "Offset beyond EOF";
1975 bytes
= MIN(npages
<< PAGE_SHIFT
, eof
- startoffset
);
1978 KASSERT(bytes
!= 0);
1980 /* Swap PG_DELWRI for PG_PAGEOUT */
1981 for (i
= 0; i
< npages
; i
++) {
1982 if (pgs
[i
]->flags
& PG_DELWRI
) {
1983 KASSERT(!(pgs
[i
]->flags
& PG_PAGEOUT
));
1984 pgs
[i
]->flags
&= ~PG_DELWRI
;
1985 pgs
[i
]->flags
|= PG_PAGEOUT
;
1986 uvm_pageout_start(1);
1987 mutex_enter(vp
->v_interlock
);
1988 mutex_enter(&uvm_pageqlock
);
1989 uvm_pageunwire(pgs
[i
]);
1990 mutex_exit(&uvm_pageqlock
);
1991 mutex_exit(vp
->v_interlock
);
1996 * Check to make sure we're starting on a block boundary.
1997 * We'll check later to make sure we always write entire
1998 * blocks (or fragments).
2000 if (startoffset
& lfs_sb_getbmask(fs
))
2001 printf("%" PRId64
" & %" PRIu64
" = %" PRId64
"\n",
2002 startoffset
, lfs_sb_getbmask(fs
),
2003 startoffset
& lfs_sb_getbmask(fs
));
2004 KASSERT((startoffset
& lfs_sb_getbmask(fs
)) == 0);
2005 if (bytes
& lfs_sb_getffmask(fs
)) {
2006 printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes
);
2007 panic("lfs_gop_write: non-integer blocks");
2011 * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
2012 * If we would, write what we have and try again. If we don't
2013 * have anything to write, we'll have to sleep.
2015 ssp
= (SEGSUM
*)sp
->segsum
;
2016 if ((kva
= uvm_pagermapin(pgs
, npages
, UVMPAGER_MAPIN_WRITE
|
2017 (lfs_ss_getnfinfo(fs
, ssp
) < 1 ?
2018 UVMPAGER_MAPIN_WAITOK
: 0))) == 0x0) {
2019 DLOG((DLOG_PAGE
, "lfs_gop_write: forcing write\n"));
2021 " with nfinfo=%d at offset 0x%jx\n",
2022 (int)lfs_ss_getnfinfo(fs
, ssp
),
2023 (uintmax_t)lfs_sb_getoffset(fs
)));
2026 lfs_release_finfo(fs
);
2027 (void) lfs_writeseg(fs
, sp
);
2029 lfs_acquire_finfo(fs
, ip
->i_number
, ip
->i_gen
);
2032 * Having given up all of the pager_map we were holding,
2033 * we can now wait for aiodoned to reclaim it for us
2034 * without fear of deadlock.
2036 kva
= uvm_pagermapin(pgs
, npages
, UVMPAGER_MAPIN_WRITE
|
2037 UVMPAGER_MAPIN_WAITOK
);
2040 mbp
= getiobuf(NULL
, true);
2041 UVMHIST_LOG(ubchist
, "vp %p mbp %p num now %d bytes 0x%x",
2042 vp
, mbp
, vp
->v_numoutput
, bytes
);
2043 mbp
->b_bufsize
= npages
<< PAGE_SHIFT
;
2044 mbp
->b_data
= (void *)kva
;
2045 mbp
->b_resid
= mbp
->b_bcount
= bytes
;
2046 mbp
->b_cflags
= BC_BUSY
|BC_AGE
;
2047 mbp
->b_iodone
= uvm_aio_biodone
;
2050 for (offset
= startoffset
;
2052 offset
+= iobytes
, bytes
-= iobytes
) {
2053 lbn
= offset
>> fs_bshift
;
2054 error
= ulfs_bmaparray(vp
, lbn
, &blkno
, NULL
, NULL
, &run
,
2055 lfs_issequential_hole
);
2057 UVMHIST_LOG(ubchist
, "ulfs_bmaparray() -> %d",
2064 iobytes
= MIN((((off_t
)lbn
+ 1 + run
) << fs_bshift
) - offset
,
2066 if (blkno
== (daddr_t
)-1) {
2067 skipbytes
+= iobytes
;
2072 * Discover how much we can really pack into this buffer.
2074 /* If no room in the current segment, finish it up */
2075 if (sp
->sum_bytes_left
< sizeof(int32_t) ||
2076 sp
->seg_bytes_left
< (1 << lfs_sb_getbshift(fs
))) {
2080 vers
= lfs_fi_getversion(fs
, sp
->fip
);
2081 lfs_release_finfo(fs
);
2082 (void) lfs_writeseg(fs
, sp
);
2084 lfs_acquire_finfo(fs
, ip
->i_number
, vers
);
2086 /* Check both for space in segment and space in segsum */
2087 iobytes
= MIN(iobytes
, (sp
->seg_bytes_left
>> fs_bshift
)
2089 iobytes
= MIN(iobytes
, (sp
->sum_bytes_left
/ sizeof(int32_t))
2091 KASSERT(iobytes
> 0);
2093 /* if it's really one i/o, don't make a second buf */
2094 if (offset
== startoffset
&& iobytes
== bytes
) {
2097 * All the LFS output is done by the segwriter. It
2098 * will increment numoutput by one for all the bufs it
2099 * recieves. However this buffer needs one extra to
2100 * account for aiodone.
2102 mutex_enter(vp
->v_interlock
);
2104 mutex_exit(vp
->v_interlock
);
2106 bp
= getiobuf(NULL
, true);
2107 UVMHIST_LOG(ubchist
, "vp %p bp %p num now %d",
2108 vp
, bp
, vp
->v_numoutput
, 0);
2109 nestiobuf_setup(mbp
, bp
, offset
- pg
->offset
, iobytes
);
2111 * LFS doesn't like async I/O here, dies with
2112 * an assert in lfs_bwrite(). Is that assert
2113 * valid? I retained non-async behaviour when
2114 * converted this to use nestiobuf --pooka
2116 bp
->b_flags
&= ~B_ASYNC
;
2119 /* XXX This is silly ... is this necessary? */
2120 mutex_enter(&bufcache_lock
);
2121 mutex_enter(vp
->v_interlock
);
2123 mutex_exit(vp
->v_interlock
);
2124 mutex_exit(&bufcache_lock
);
2126 bp
->b_lblkno
= lfs_lblkno(fs
, offset
);
2127 bp
->b_private
= mbp
;
2128 if (devvp
->v_type
== VBLK
) {
2129 bp
->b_dev
= devvp
->v_rdev
;
2131 VOP_BWRITE(bp
->b_vp
, bp
);
2132 while (lfs_gatherblock(sp
, bp
, NULL
))
2136 nestiobuf_done(mbp
, skipbytes
, error
);
2138 UVMHIST_LOG(ubchist
, "skipbytes %d", skipbytes
, 0,0,0);
2140 UVMHIST_LOG(ubchist
, "returning 0", 0,0,0,0);
2143 /* Start a segment write. */
2144 UVMHIST_LOG(ubchist
, "flushing", 0,0,0,0);
2145 mutex_enter(&lfs_lock
);
2146 lfs_flush(fs
, 0, 1);
2147 mutex_exit(&lfs_lock
);
2150 if ((sp
->seg_flags
& SEGM_SINGLE
) && lfs_sb_getcurseg(fs
) != fs
->lfs_startseg
)
2157 * We can't write the pages, for whatever reason.
2158 * Clean up after ourselves, and make the caller try again.
2160 mutex_enter(vp
->v_interlock
);
2162 /* Tell why we're here, if we know */
2163 if (failreason
!= NULL
) {
2164 DLOG((DLOG_PAGE
, "lfs_gop_write: %s\n", failreason
));
2166 if (haveeof
&& startoffset
>= eof
) {
2167 DLOG((DLOG_PAGE
, "lfs_gop_write: ino %d start 0x%" PRIx64
2168 " eof 0x%" PRIx64
" npages=%d\n", VTOI(vp
)->i_number
,
2169 pgs
[0]->offset
, eof
, npages
));
2172 mutex_enter(&uvm_pageqlock
);
2173 for (i
= 0; i
< npages
; i
++) {
2176 if (pg
->flags
& PG_PAGEOUT
)
2177 uvm_pageout_done(1);
2178 if (pg
->flags
& PG_DELWRI
) {
2181 uvm_pageactivate(pg
);
2182 pg
->flags
&= ~(PG_CLEAN
|PG_DELWRI
|PG_PAGEOUT
|PG_RELEASED
);
2183 DLOG((DLOG_PAGE
, "pg[%d] = %p (vp %p off %" PRIx64
")\n", i
, pg
,
2185 DLOG((DLOG_PAGE
, "pg[%d]->flags = %x\n", i
, pg
->flags
));
2186 DLOG((DLOG_PAGE
, "pg[%d]->pqflags = %x\n", i
, pg
->pqflags
));
2187 DLOG((DLOG_PAGE
, "pg[%d]->uanon = %p\n", i
, pg
->uanon
));
2188 DLOG((DLOG_PAGE
, "pg[%d]->uobject = %p\n", i
, pg
->uobject
));
2189 DLOG((DLOG_PAGE
, "pg[%d]->wire_count = %d\n", i
,
2191 DLOG((DLOG_PAGE
, "pg[%d]->loan_count = %d\n", i
,
2194 /* uvm_pageunbusy takes care of PG_BUSY, PG_WANTED */
2195 uvm_page_unbusy(pgs
, npages
);
2196 mutex_exit(&uvm_pageqlock
);
2197 mutex_exit(vp
->v_interlock
);
2202 * finish vnode/inode initialization.
2206 lfs_vinit(struct mount
*mp
, struct vnode
**vpp
)
2208 struct vnode
*vp
= *vpp
;
2209 struct inode
*ip
= VTOI(vp
);
2210 struct ulfsmount
*ump
= VFSTOULFS(mp
);
2211 struct lfs
*fs
= ump
->um_lfs
;
2214 ip
->i_mode
= lfs_dino_getmode(fs
, ip
->i_din
);
2215 ip
->i_nlink
= lfs_dino_getnlink(fs
, ip
->i_din
);
2216 ip
->i_lfs_osize
= ip
->i_size
= lfs_dino_getsize(fs
, ip
->i_din
);
2217 ip
->i_flags
= lfs_dino_getflags(fs
, ip
->i_din
);
2218 ip
->i_gen
= lfs_dino_getgen(fs
, ip
->i_din
);
2219 ip
->i_uid
= lfs_dino_getuid(fs
, ip
->i_din
);
2220 ip
->i_gid
= lfs_dino_getgid(fs
, ip
->i_din
);
2222 ip
->i_lfs_effnblks
= lfs_dino_getblocks(fs
, ip
->i_din
);
2223 ip
->i_lfs_odnlink
= lfs_dino_getnlink(fs
, ip
->i_din
);
2226 * Initialize the vnode from the inode, check for aliases. In all
2227 * cases re-init ip, the underlying vnode/inode may have changed.
2229 ulfs_vinit(mp
, lfs_specop_p
, lfs_fifoop_p
, &vp
);
2232 memset(ip
->i_lfs_fragsize
, 0, ULFS_NDADDR
* sizeof(*ip
->i_lfs_fragsize
));
2233 if (vp
->v_type
!= VLNK
|| ip
->i_size
>= ip
->i_lfs
->um_maxsymlinklen
) {
2235 for (i
= (ip
->i_size
+ lfs_sb_getbsize(fs
) - 1) >> lfs_sb_getbshift(fs
);
2236 i
< ULFS_NDADDR
; i
++) {
2237 if ((vp
->v_type
== VBLK
|| vp
->v_type
== VCHR
) &&
2240 if (lfs_dino_getdb(fs
, ip
->i_din
, i
) != 0) {
2241 lfs_dump_dinode(fs
, ip
->i_din
);
2242 panic("inconsistent inode (direct)");
2245 for ( ; i
< ULFS_NDADDR
+ ULFS_NIADDR
; i
++) {
2246 if (lfs_dino_getib(fs
, ip
->i_din
, i
- ULFS_NDADDR
) != 0) {
2247 lfs_dump_dinode(fs
, ip
->i_din
);
2248 panic("inconsistent inode (indirect)");
2252 for (i
= 0; i
< ULFS_NDADDR
; i
++)
2253 if (lfs_dino_getdb(fs
, ip
->i_din
, i
) != 0)
2254 ip
->i_lfs_fragsize
[i
] = lfs_blksize(fs
, ip
, i
);
2258 if (vp
->v_type
== VNON
) {
2260 lfs_dump_dinode(fs
, ip
->i_din
);
2262 panic("lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
2263 (unsigned long long)ip
->i_number
,
2264 (ip
->i_mode
& LFS_IFMT
) >> 12);
2266 #endif /* DIAGNOSTIC */
2269 * Finish inode initialization now that aliasing has been resolved.
2272 ip
->i_devvp
= ump
->um_devvp
;
2274 #if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
2277 genfs_node_init(vp
, &lfs_genfsops
);
2278 uvm_vnp_setsize(vp
, ip
->i_size
);
2280 /* Initialize hiblk from file size */
2281 ip
->i_lfs_hiblk
= lfs_lblkno(ip
->i_lfs
, ip
->i_size
+ lfs_sb_getbsize(ip
->i_lfs
) - 1) - 1;
2287 * Resize the filesystem to contain the specified number of segments.
2290 lfs_resize_fs(struct lfs
*fs
, int newnsegs
)
2294 struct buf
*bp
, *obp
;
2295 daddr_t olast
, nlast
, ilast
, noff
, start
, end
;
2298 int error
, badnews
, inc
, oldnsegs
;
2299 int sbbytes
, csbbytes
, gain
, cgain
;
2302 /* Only support v2 and up */
2303 if (lfs_sb_getversion(fs
) < 2)
2306 /* If we're doing nothing, do it fast */
2307 oldnsegs
= lfs_sb_getnseg(fs
);
2308 if (newnsegs
== oldnsegs
)
2311 /* We always have to have two superblocks */
2312 if (newnsegs
<= lfs_dtosn(fs
, lfs_sb_getsboff(fs
, 1)))
2313 /* XXX this error code is rather nonsense */
2316 ivp
= fs
->lfs_ivnode
;
2320 /* Take the segment lock so no one else calls lfs_newseg() */
2321 lfs_seglock(fs
, SEGM_PROT
);
2324 * Make sure the segments we're going to be losing, if any,
2325 * are in fact empty. We hold the seglock, so their status
2326 * cannot change underneath us. Count the superblocks we lose,
2327 * while we're at it.
2329 sbbytes
= csbbytes
= 0;
2331 for (i
= newnsegs
; i
< oldnsegs
; i
++) {
2332 LFS_SEGENTRY(sup
, fs
, i
, bp
);
2333 badnews
= sup
->su_nbytes
|| !(sup
->su_flags
& SEGUSE_INVAL
);
2334 if (sup
->su_flags
& SEGUSE_SUPERBLOCK
)
2335 sbbytes
+= LFS_SBPAD
;
2336 if (!(sup
->su_flags
& SEGUSE_DIRTY
)) {
2338 if (sup
->su_flags
& SEGUSE_SUPERBLOCK
)
2339 csbbytes
+= LFS_SBPAD
;
2348 /* Note old and new segment table endpoints, and old ifile size */
2349 olast
= lfs_sb_getcleansz(fs
) + lfs_sb_getsegtabsz(fs
);
2350 nlast
= howmany(newnsegs
, lfs_sb_getsepb(fs
)) + lfs_sb_getcleansz(fs
);
2351 ilast
= ivp
->v_size
>> lfs_sb_getbshift(fs
);
2352 noff
= nlast
- olast
;
2355 * Make sure no one can use the Ifile while we change it around.
2356 * Even after taking the iflock we need to make sure no one still
2357 * is holding Ifile buffers, so we get each one, to drain them.
2358 * (XXX this could be done better.)
2360 rw_enter(&fs
->lfs_iflock
, RW_WRITER
);
2361 for (i
= 0; i
< ilast
; i
++) {
2362 /* XXX what to do if bread fails? */
2363 bread(ivp
, i
, lfs_sb_getbsize(fs
), 0, &bp
);
2367 /* Allocate new Ifile blocks */
2368 for (i
= ilast
; i
< ilast
+ noff
; i
++) {
2369 if (lfs_balloc(ivp
, i
* lfs_sb_getbsize(fs
), lfs_sb_getbsize(fs
), NOCRED
, 0,
2371 panic("balloc extending ifile");
2372 memset(bp
->b_data
, 0, lfs_sb_getbsize(fs
));
2373 VOP_BWRITE(bp
->b_vp
, bp
);
2376 /* Register new ifile size */
2377 ip
->i_size
+= noff
* lfs_sb_getbsize(fs
);
2378 lfs_dino_setsize(fs
, ip
->i_din
, ip
->i_size
);
2379 uvm_vnp_setsize(ivp
, ip
->i_size
);
2381 /* Copy the inode table to its new position */
2388 start
= ilast
+ noff
- 1;
2392 for (i
= start
; i
!= end
; i
+= inc
) {
2393 if (bread(ivp
, i
, lfs_sb_getbsize(fs
),
2394 B_MODIFY
, &bp
) != 0)
2395 panic("resize: bread dst blk failed");
2396 if (bread(ivp
, i
- noff
, lfs_sb_getbsize(fs
),
2398 panic("resize: bread src blk failed");
2399 memcpy(bp
->b_data
, obp
->b_data
, lfs_sb_getbsize(fs
));
2400 VOP_BWRITE(bp
->b_vp
, bp
);
2405 /* If we are expanding, write the new empty SEGUSE entries */
2406 if (newnsegs
> oldnsegs
) {
2407 for (i
= oldnsegs
; i
< newnsegs
; i
++) {
2408 if ((error
= bread(ivp
, i
/ lfs_sb_getsepb(fs
) +
2409 lfs_sb_getcleansz(fs
), lfs_sb_getbsize(fs
),
2410 B_MODIFY
, &bp
)) != 0)
2411 panic("lfs: ifile read: %d", error
);
2412 while ((i
+ 1) % lfs_sb_getsepb(fs
) && i
< newnsegs
) {
2413 sup
= &((SEGUSE
*)bp
->b_data
)[i
% lfs_sb_getsepb(fs
)];
2414 memset(sup
, 0, sizeof(*sup
));
2417 VOP_BWRITE(bp
->b_vp
, bp
);
2421 /* Zero out unused superblock offsets */
2422 for (i
= 2; i
< LFS_MAXNUMSB
; i
++)
2423 if (lfs_dtosn(fs
, lfs_sb_getsboff(fs
, i
)) >= newnsegs
)
2424 lfs_sb_setsboff(fs
, i
, 0x0);
2427 * Correct superblock entries that depend on fs size.
2428 * The computations of these are as follows:
2430 * size = lfs_segtod(fs, nseg)
2431 * dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD)
2432 * bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used
2433 * avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD)
2434 * + (lfs_segtod(fs, 1) - (offset - curseg))
2435 * - lfs_segtod(fs, minfreeseg - (minfreeseg / 2))
2437 * XXX - we should probably adjust minfreeseg as well.
2439 gain
= (newnsegs
- oldnsegs
);
2440 lfs_sb_setnseg(fs
, newnsegs
);
2441 lfs_sb_setsegtabsz(fs
, nlast
- lfs_sb_getcleansz(fs
));
2442 lfs_sb_addsize(fs
, gain
* lfs_btofsb(fs
, lfs_sb_getssize(fs
)));
2443 lfs_sb_adddsize(fs
, gain
* lfs_btofsb(fs
, lfs_sb_getssize(fs
)) - lfs_btofsb(fs
, sbbytes
));
2444 lfs_sb_addbfree(fs
, gain
* lfs_btofsb(fs
, lfs_sb_getssize(fs
)) - lfs_btofsb(fs
, sbbytes
)
2445 - gain
* lfs_btofsb(fs
, lfs_sb_getbsize(fs
) / 2));
2447 lfs_sb_addnclean(fs
, gain
);
2448 lfs_sb_addavail(fs
, gain
* lfs_btofsb(fs
, lfs_sb_getssize(fs
)));
2450 lfs_sb_subnclean(fs
, cgain
);
2451 lfs_sb_subavail(fs
, cgain
* lfs_btofsb(fs
, lfs_sb_getssize(fs
)) -
2452 lfs_btofsb(fs
, csbbytes
));
2455 /* Resize segment flag cache */
2456 fs
->lfs_suflags
[0] = realloc(fs
->lfs_suflags
[0],
2457 lfs_sb_getnseg(fs
) * sizeof(u_int32_t
), M_SEGMENT
, M_WAITOK
);
2458 fs
->lfs_suflags
[1] = realloc(fs
->lfs_suflags
[1],
2459 lfs_sb_getnseg(fs
) * sizeof(u_int32_t
), M_SEGMENT
, M_WAITOK
);
2460 for (i
= oldnsegs
; i
< newnsegs
; i
++)
2461 fs
->lfs_suflags
[0][i
] = fs
->lfs_suflags
[1][i
] = 0x0;
2463 /* Truncate Ifile if necessary */
2465 lfs_truncate(ivp
, ivp
->v_size
+ (noff
<< lfs_sb_getbshift(fs
)), 0,
2468 /* Update cleaner info so the cleaner can die */
2469 /* XXX what to do if bread fails? */
2470 bread(ivp
, 0, lfs_sb_getbsize(fs
), B_MODIFY
, &bp
);
2472 lfs_ci_setclean(fs
, cip
, lfs_sb_getnclean(fs
));
2473 lfs_ci_setdirty(fs
, cip
, lfs_sb_getnseg(fs
) - lfs_sb_getnclean(fs
));
2474 VOP_BWRITE(bp
->b_vp
, bp
);
2476 /* Let Ifile accesses proceed */
2477 rw_exit(&fs
->lfs_iflock
);
2485 * Extended attribute dispatch
2488 lfs_extattrctl(struct mount
*mp
, int cmd
, struct vnode
*vp
,
2489 int attrnamespace
, const char *attrname
)
2492 struct ulfsmount
*ump
;
2494 ump
= VFSTOULFS(mp
);
2495 if (ump
->um_fstype
== ULFS1
) {
2496 return ulfs_extattrctl(mp
, cmd
, vp
, attrnamespace
, attrname
);
2499 return vfs_stdextattrctl(mp
, cmd
, vp
, attrnamespace
, attrname
);