Patrick Welche <prlw1@cam.ac.uk>
[netbsd-mini2440.git] / external / cddl / osnet / dist / uts / common / fs / zfs / zfs_vfsops.c
blobb6eb880221736034b77a617fcdf80cb795ba58b2
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <sys/pathname.h>
32 #include <sys/vnode.h>
33 #include <sys/vfs.h>
34 #include <sys/vfs_opreg.h>
35 #include <sys/mntent.h>
36 #include <sys/mount.h>
37 #include <sys/cmn_err.h>
38 #include <sys/zfs_znode.h>
39 #include <sys/zfs_dir.h>
40 #include <sys/zil.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/dmu.h>
43 #include <sys/dsl_prop.h>
44 #include <sys/dsl_dataset.h>
45 #include <sys/dsl_deleg.h>
46 #include <sys/spa.h>
47 #include <sys/zap.h>
48 #include <sys/varargs.h>
49 #include <sys/policy.h>
50 #include <sys/atomic.h>
51 #include <sys/mkdev.h>
52 #include <sys/modctl.h>
53 #include <sys/zfs_ioctl.h>
54 #include <sys/zfs_ctldir.h>
55 #include <sys/zfs_fuid.h>
56 #include <sys/sunddi.h>
57 #include <sys/dnlc.h>
58 #include <sys/dmu_objset.h>
59 #include <sys/spa_boot.h>
61 #ifdef __NetBSD__
62 /* Pull in the ddi_name_to_major() declaration; is there a better place for it? */
63 #include <sys/ddi.h>
64 #include <sys/systm.h>
65 #endif
67 int zfsfstype;
68 vfsops_t *zfs_vfsops = NULL;
69 static major_t zfs_major;
70 static minor_t zfs_minor;
71 static kmutex_t zfs_dev_mtx;
73 int zfs_debug_level;
74 kmutex_t zfs_debug_mtx;
76 /* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/
77 static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
78 static int zfs_umount(vfs_t *vfsp, int fflag);
79 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
80 static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
81 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
82 static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp);
83 static int zfs_start(vfs_t *vfsp, int flags);
84 static void zfs_freevfs(vfs_t *vfsp);
86 void zfs_init(void);
87 void zfs_fini(void);
90 extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;
92 static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
93 &zfs_vnodeop_opv_desc,
94 NULL,
97 static struct vfsops zfs_vfsops_template = {
98 .vfs_name = MOUNT_ZFS,
99 .vfs_min_mount_data = sizeof(struct zfs_args),
100 .vfs_opv_descs = zfs_vnodeop_descs,
101 .vfs_mount = zfs_mount,
102 .vfs_unmount = zfs_umount,
103 .vfs_root = zfs_root,
104 .vfs_statvfs = zfs_statvfs,
105 .vfs_sync = zfs_sync,
106 .vfs_vget = zfs_vget,
107 .vfs_fhtovp = zfs_fhtovp,
108 .vfs_init = zfs_init,
109 .vfs_done = zfs_fini,
110 .vfs_start = zfs_start,
111 .vfs_renamelock_enter = (void*)nullop,
112 .vfs_renamelock_exit = (void*)nullop,
113 .vfs_reinit = (void *)nullop,
114 .vfs_vptofh = (void *)eopnotsupp,
115 .vfs_fhtovp = (void *)eopnotsupp,
116 .vfs_quotactl = (void *)eopnotsupp,
117 .vfs_extattrctl = (void *)eopnotsupp,
118 .vfs_snapshot = (void *)eopnotsupp,
119 .vfs_fsync = (void *)eopnotsupp,
123 * We need to keep a count of active fs's.
124 * This is necessary to prevent our module
125 * from being unloaded after a umount -f
127 static uint32_t zfs_active_fs_count = 0;
129 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
130 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
131 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
132 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
135 * MO_DEFAULT is not used since the default value is determined
136 * by the equivalent property.
138 static mntopt_t mntopts[] = {
139 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
140 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
141 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
142 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
145 static mntopts_t zfs_mntopts = {
146 sizeof (mntopts) / sizeof (mntopt_t),
147 mntopts
150 /*ARGSUSED*/
152 zfs_sync(vfs_t *vfsp, int flag, cred_t *cr)
154 zfsvfs_t *zfsvfs = vfsp->vfs_data;
155 znode_t *zp;
156 vnode_t *vp, *nvp, *mvp;
157 dmu_tx_t *tx;
158 int error;
161 error = 0;
164 * Data integrity is job one. We don't want a compromised kernel
165 * writing to the storage pool, so we never sync during panic.
167 if (panicstr)
168 return (0);
170 /* Allocate a marker vnode. */
171 if ((mvp = vnalloc(vfsp)) == NULL)
172 return (ENOMEM);
176 * On NetBSD, we need to push out atime updates. Solaris does
177 * this during VOP_INACTIVE, but that does not work well with the
178 * BSD VFS, so we do it in batch here.
180 mutex_enter(&mntvnode_lock);
181 loop:
182 for (vp = TAILQ_FIRST(&vfsp->mnt_vnodelist); vp; vp = nvp) {
183 nvp = TAILQ_NEXT(vp, v_mntvnodes);
185 * If the vnode that we are about to sync is no
186 * longer associated with this mount point, start
187 * over.
189 if (vp->v_mount != vfsp)
190 goto loop;
192 * Don't interfere with concurrent scans of this FS.
194 if (vismarker(vp))
195 continue;
197 * Skip the vnode/inode if inaccessible, or if the
198 * atime is clean.
200 mutex_enter(&vp->v_interlock);
201 zp = VTOZ(vp);
202 if (zp == NULL || vp->v_type == VNON ||
203 (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
204 zp->z_atime_dirty == 0 || zp->z_unlinked) {
205 mutex_exit(&vp->v_interlock);
206 continue;
208 vmark(mvp, vp);
209 mutex_exit(&mntvnode_lock);
210 error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK);
211 if (error) {
212 mutex_enter(&mntvnode_lock);
213 nvp = vunmark(mvp);
214 if (error == ENOENT) {
215 goto loop;
217 continue;
219 tx = dmu_tx_create(zfsvfs->z_os);
220 dmu_tx_hold_bonus(tx, zp->z_id);
221 error = dmu_tx_assign(tx, TXG_WAIT);
222 if (error) {
223 dmu_tx_abort(tx);
224 } else {
225 dmu_buf_will_dirty(zp->z_dbuf, tx);
226 mutex_enter(&zp->z_lock);
227 zp->z_atime_dirty = 0;
228 mutex_exit(&zp->z_lock);
229 dmu_tx_commit(tx);
231 vput(vp);
232 mutex_enter(&mntvnode_lock);
233 nvp = vunmark(mvp);
235 mutex_exit(&mntvnode_lock);
238 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
239 * to sync metadata, which they would otherwise cache indefinitely.
240 * Semantically, the only requirement is that the sync be initiated.
241 * The DMU syncs out txgs frequently, so there's nothing to do.
243 if ((flag & MNT_LAZY) != 0)
244 return (0);
246 if (vfsp != NULL) {
248 * Sync a specific filesystem.
252 ZFS_ENTER(zfsvfs);
253 if (zfsvfs->z_log != NULL)
254 zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
255 else
256 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
257 ZFS_EXIT(zfsvfs);
258 } else {
260 * Sync all ZFS filesystems. This is what happens when you
261 * run sync(1M). Unlike other filesystems, ZFS honors the
262 * request by waiting for all pools to commit all dirty data.
264 spa_sync_allpools();
267 vnfree(nvp);
269 return (0);
272 static int
273 zfs_create_unique_device(dev_t *dev)
275 major_t new_major;
277 do {
278 ASSERT3U(zfs_minor, <=, MAXMIN);
279 minor_t start = zfs_minor;
280 do {
281 mutex_enter(&zfs_dev_mtx);
282 if (zfs_minor >= MAXMIN) {
284 * If we're still using the real major
285 * keep out of /dev/zfs and /dev/zvol minor
286 * number space. If we're using a getudev()'ed
287 * major number, we can use all of its minors.
289 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
290 zfs_minor = ZFS_MIN_MINOR;
291 else
292 zfs_minor = 0;
293 } else {
294 zfs_minor++;
296 *dev = makedevice(zfs_major, zfs_minor);
297 mutex_exit(&zfs_dev_mtx);
298 } while (vfs_devismounted(*dev) && zfs_minor != start);
299 break;
300 #ifndef __NetBSD__
301 if (zfs_minor == start) {
303 * We are using all ~262,000 minor numbers for the
304 * current major number. Create a new major number.
306 if ((new_major = getudev()) == (major_t)-1) {
307 cmn_err(CE_WARN,
308 "zfs_mount: Can't get unique major "
309 "device number.");
310 return (-1);
312 mutex_enter(&zfs_dev_mtx);
313 zfs_major = new_major;
314 zfs_minor = 0;
316 mutex_exit(&zfs_dev_mtx);
317 } else {
318 break;
320 /* CONSTANTCONDITION */
321 #endif
322 } while (1);
324 return (0);
327 static void
328 atime_changed_cb(void *arg, uint64_t newval)
330 zfsvfs_t *zfsvfs = arg;
332 if (newval == TRUE) {
333 zfsvfs->z_atime = TRUE;
334 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
335 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
336 } else {
337 zfsvfs->z_atime = FALSE;
338 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
339 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
343 static void
344 xattr_changed_cb(void *arg, uint64_t newval)
346 zfsvfs_t *zfsvfs = arg;
348 if (newval == TRUE) {
349 /* XXX locking on vfs_flag? */
350 #ifdef TODO
351 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
352 #endif
353 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
354 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
355 } else {
356 /* XXX locking on vfs_flag? */
357 #ifdef TODO
358 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
359 #endif
360 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
361 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
365 static void
366 blksz_changed_cb(void *arg, uint64_t newval)
368 zfsvfs_t *zfsvfs = arg;
370 if (newval < SPA_MINBLOCKSIZE ||
371 newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
372 newval = SPA_MAXBLOCKSIZE;
374 zfsvfs->z_max_blksz = newval;
375 zfsvfs->z_vfs->vfs_bsize = newval;
378 static void
379 readonly_changed_cb(void *arg, uint64_t newval)
381 zfsvfs_t *zfsvfs = arg;
383 if (newval) {
384 /* XXX locking on vfs_flag? */
385 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
386 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
387 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
388 } else {
389 /* XXX locking on vfs_flag? */
390 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
391 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
392 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
396 static void
397 devices_changed_cb(void *arg, uint64_t newval)
399 zfsvfs_t *zfsvfs = arg;
401 if (newval == FALSE) {
402 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
403 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
404 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
405 } else {
406 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
407 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
408 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
412 static void
413 setuid_changed_cb(void *arg, uint64_t newval)
415 zfsvfs_t *zfsvfs = arg;
417 if (newval == FALSE) {
418 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
419 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
420 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
421 } else {
422 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
423 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
424 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
428 static void
429 exec_changed_cb(void *arg, uint64_t newval)
431 zfsvfs_t *zfsvfs = arg;
433 if (newval == FALSE) {
434 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
435 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
436 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
437 } else {
438 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
439 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
440 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
445 * The nbmand mount option can be changed at mount time.
446 * We can't allow it to be toggled on live file systems or incorrect
447 * behavior may be seen from cifs clients
449 * This property isn't registered via dsl_prop_register(), but this callback
450 * will be called when a file system is first mounted
452 static void
453 nbmand_changed_cb(void *arg, uint64_t newval)
455 zfsvfs_t *zfsvfs = arg;
456 if (newval == FALSE) {
457 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
458 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
459 } else {
460 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
461 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
465 static void
466 snapdir_changed_cb(void *arg, uint64_t newval)
468 zfsvfs_t *zfsvfs = arg;
470 zfsvfs->z_show_ctldir = newval;
473 static void
474 vscan_changed_cb(void *arg, uint64_t newval)
476 zfsvfs_t *zfsvfs = arg;
478 zfsvfs->z_vscan = newval;
481 static void
482 acl_mode_changed_cb(void *arg, uint64_t newval)
484 zfsvfs_t *zfsvfs = arg;
486 zfsvfs->z_acl_mode = newval;
489 static void
490 acl_inherit_changed_cb(void *arg, uint64_t newval)
492 zfsvfs_t *zfsvfs = arg;
494 zfsvfs->z_acl_inherit = newval;
497 static int
498 zfs_register_callbacks(vfs_t *vfsp)
500 struct dsl_dataset *ds = NULL;
501 objset_t *os = NULL;
502 zfsvfs_t *zfsvfs = NULL;
503 uint64_t nbmand;
504 int readonly, do_readonly = B_FALSE;
505 int setuid, do_setuid = B_FALSE;
506 int exec, do_exec = B_FALSE;
507 int devices, do_devices = B_FALSE;
508 int xattr, do_xattr = B_FALSE;
509 int atime, do_atime = B_FALSE;
510 int error = 0;
512 ASSERT(vfsp);
513 zfsvfs = vfsp->vfs_data;
514 ASSERT(zfsvfs);
515 os = zfsvfs->z_os;
518 * The act of registering our callbacks will destroy any mount
519 * options we may have. In order to enable temporary overrides
520 * of mount options, we stash away the current values and
521 * restore them after we register the callbacks.
523 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
524 readonly = B_TRUE;
525 do_readonly = B_TRUE;
526 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
527 readonly = B_FALSE;
528 do_readonly = B_TRUE;
530 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
531 devices = B_FALSE;
532 setuid = B_FALSE;
533 do_devices = B_TRUE;
534 do_setuid = B_TRUE;
535 } else {
536 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
537 devices = B_FALSE;
538 do_devices = B_TRUE;
539 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
540 devices = B_TRUE;
541 do_devices = B_TRUE;
544 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
545 setuid = B_FALSE;
546 do_setuid = B_TRUE;
547 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
548 setuid = B_TRUE;
549 do_setuid = B_TRUE;
552 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
553 exec = B_FALSE;
554 do_exec = B_TRUE;
555 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
556 exec = B_TRUE;
557 do_exec = B_TRUE;
559 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
560 xattr = B_FALSE;
561 do_xattr = B_TRUE;
562 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
563 xattr = B_TRUE;
564 do_xattr = B_TRUE;
566 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
567 atime = B_FALSE;
568 do_atime = B_TRUE;
569 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
570 atime = B_TRUE;
571 do_atime = B_TRUE;
575 * nbmand is a special property. It can only be changed at
576 * mount time.
578 * This is weird, but it is documented to only be changeable
579 * at mount time.
581 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
582 nbmand = B_FALSE;
583 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
584 nbmand = B_TRUE;
585 } else {
586 char osname[MAXNAMELEN];
588 dmu_objset_name(os, osname);
589 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
590 NULL)) {
591 return (error);
596 * Register property callbacks.
598 * It would probably be fine to just check for i/o error from
599 * the first prop_register(), but I guess I like to go
600 * overboard...
602 ds = dmu_objset_ds(os);
603 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
604 error = error ? error : dsl_prop_register(ds,
605 "xattr", xattr_changed_cb, zfsvfs);
606 error = error ? error : dsl_prop_register(ds,
607 "recordsize", blksz_changed_cb, zfsvfs);
608 error = error ? error : dsl_prop_register(ds,
609 "readonly", readonly_changed_cb, zfsvfs);
610 error = error ? error : dsl_prop_register(ds,
611 "devices", devices_changed_cb, zfsvfs);
612 error = error ? error : dsl_prop_register(ds,
613 "setuid", setuid_changed_cb, zfsvfs);
614 error = error ? error : dsl_prop_register(ds,
615 "exec", exec_changed_cb, zfsvfs);
616 error = error ? error : dsl_prop_register(ds,
617 "snapdir", snapdir_changed_cb, zfsvfs);
618 error = error ? error : dsl_prop_register(ds,
619 "aclmode", acl_mode_changed_cb, zfsvfs);
620 error = error ? error : dsl_prop_register(ds,
621 "aclinherit", acl_inherit_changed_cb, zfsvfs);
622 error = error ? error : dsl_prop_register(ds,
623 "vscan", vscan_changed_cb, zfsvfs);
624 if (error)
625 goto unregister;
628 * Invoke our callbacks to restore temporary mount options.
630 if (do_readonly)
631 readonly_changed_cb(zfsvfs, readonly);
632 if (do_setuid)
633 setuid_changed_cb(zfsvfs, setuid);
634 if (do_exec)
635 exec_changed_cb(zfsvfs, exec);
636 if (do_devices)
637 devices_changed_cb(zfsvfs, devices);
638 if (do_xattr)
639 xattr_changed_cb(zfsvfs, xattr);
640 if (do_atime)
641 atime_changed_cb(zfsvfs, atime);
643 nbmand_changed_cb(zfsvfs, nbmand);
645 return (0);
647 unregister:
649 * We may attempt to unregister some callbacks that are not
650 * registered, but this is OK; it will simply return ENOMSG,
651 * which we will ignore.
653 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
654 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
655 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
656 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
657 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
658 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
659 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
660 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
661 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
662 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
663 zfsvfs);
664 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
665 return (error);
669 static int
670 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
672 int error;
674 error = zfs_register_callbacks(zfsvfs->z_vfs);
675 if (error)
676 return (error);
679 * Set the objset user_ptr to track its zfsvfs.
681 mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
682 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
683 mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
686 * If we are not mounting (ie: online recv), then we don't
687 * have to worry about replaying the log as we blocked all
688 * operations out since we closed the ZIL.
690 if (mounting) {
691 boolean_t readonly;
694 * During replay we remove the read only flag to
695 * allow replays to succeed.
697 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
698 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
701 * Parse and replay the intent log.
703 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
704 zfs_replay_vector, zfs_unlinked_drain);
706 zfs_unlinked_drain(zfsvfs);
707 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
710 if (!zil_disable)
711 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
713 return (0);
716 static void
717 zfs_freezfsvfs(zfsvfs_t *zfsvfs)
720 mutex_destroy(&zfsvfs->z_znodes_lock);
721 mutex_destroy(&zfsvfs->z_online_recv_lock);
722 list_destroy(&zfsvfs->z_all_znodes);
723 rrw_destroy(&zfsvfs->z_teardown_lock);
724 rw_destroy(&zfsvfs->z_teardown_inactive_lock);
725 rw_destroy(&zfsvfs->z_fuid_lock);
726 kmem_free(zfsvfs, sizeof (zfsvfs_t));
729 static int
730 zfs_domount(vfs_t *vfsp, char *osname)
732 dev_t mount_dev;
733 uint64_t recordsize, readonly;
734 int error = 0;
735 int mode;
736 zfsvfs_t *zfsvfs;
737 znode_t *zp = NULL;
739 ASSERT(vfsp);
740 ASSERT(osname);
743 * Initialize the zfs-specific filesystem structure.
744 * Should probably make this a kmem cache, shuffle fields,
745 * and just bzero up to z_hold_mtx[].
747 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
748 zfsvfs->z_vfs = vfsp;
749 zfsvfs->z_parent = zfsvfs;
750 zfsvfs->z_assign = TXG_NOWAIT;
751 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
752 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
754 dprintf("Creating zfsvfs %p\n", zfsvfs);
755 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
756 mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
757 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
758 offsetof(znode_t, z_link_node));
759 rrw_init(&zfsvfs->z_teardown_lock);
760 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
761 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
763 /* Initialize the generic filesystem structure. */
764 vfsp->vfs_data = NULL;
766 if (zfs_create_unique_device(&mount_dev) == -1) {
767 error = ENODEV;
768 goto out;
770 ASSERT(vfs_devismounted(mount_dev) == 0);
772 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
773 NULL))
774 goto out;
776 vfsp->vfs_bsize = DEV_BSIZE;
777 vfsp->vfs_flag |= VFS_NOTRUNC;
778 vfsp->vfs_data = zfsvfs;
780 if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
781 goto out;
783 mode = DS_MODE_OWNER;
784 if (readonly)
785 mode |= DS_MODE_READONLY;
787 error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
788 if (error == EROFS) {
789 mode = DS_MODE_OWNER | DS_MODE_READONLY;
790 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
791 &zfsvfs->z_os);
794 if (error)
795 goto out;
797 if (error = zfs_init_fs(zfsvfs, &zp))
798 goto out;
800 dprintf("zfs_domount vrele before vfsp->vfs_count %d\n", vfsp->vfs_count);
801 /* The call to zfs_init_fs leaves the vnode held, release it here. */
802 VN_RELE(ZTOV(zp));
804 dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count);
806 * Set features for file system.
808 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
809 if (zfsvfs->z_use_fuids) {
810 vfs_set_feature(vfsp, VFSFT_XVATTR);
811 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
812 vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
813 vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
815 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
816 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
817 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
818 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
819 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
820 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
821 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
824 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
825 uint64_t pval;
826 ASSERT(mode & DS_MODE_READONLY);
827 atime_changed_cb(zfsvfs, B_FALSE);
828 readonly_changed_cb(zfsvfs, B_TRUE);
829 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
830 goto out;
831 xattr_changed_cb(zfsvfs, pval);
832 zfsvfs->z_issnap = B_TRUE;
833 } else {
834 error = zfsvfs_setup(zfsvfs, B_TRUE);
837 dprintf("zfs_vfsops.c zfs_domount called\n");
838 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
840 if (!zfsvfs->z_issnap)
841 zfsctl_create(zfsvfs);
842 out:
843 if (error) {
844 if (zfsvfs->z_os)
845 dmu_objset_close(zfsvfs->z_os);
846 zfs_freezfsvfs(zfsvfs);
847 } else {
848 atomic_add_32(&zfs_active_fs_count, 1);
850 return (error);
853 void
854 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
856 objset_t *os = zfsvfs->z_os;
857 struct dsl_dataset *ds;
860 * Unregister properties.
862 if (!dmu_objset_is_snapshot(os)) {
863 ds = dmu_objset_ds(os);
864 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
865 zfsvfs) == 0);
867 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
868 zfsvfs) == 0);
870 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
871 zfsvfs) == 0);
873 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
874 zfsvfs) == 0);
876 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
877 zfsvfs) == 0);
879 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
880 zfsvfs) == 0);
882 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
883 zfsvfs) == 0);
885 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
886 zfsvfs) == 0);
888 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
889 zfsvfs) == 0);
891 VERIFY(dsl_prop_unregister(ds, "aclinherit",
892 acl_inherit_changed_cb, zfsvfs) == 0);
894 VERIFY(dsl_prop_unregister(ds, "vscan",
895 vscan_changed_cb, zfsvfs) == 0);
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * str:    NUL-terminated string to parse.
 * objnum: receives the parsed value on success; left untouched on error.
 *
 * Returns 0 on success, or EINVAL if the string contains a character
 * that is not a decimal digit or if the value does not fit in 64 bits.
 * An empty string parses as 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	while (*str) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (EINVAL);

		digit = (uint64_t)(*str++ - '0');

		/* Refuse to wrap around silently: num * 10 + digit must fit. */
		if (num > (UINT64_MAX - digit) / 10)
			return (EINVAL);

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
919 * The boot path passed from the boot loader is in the form of
920 * "rootpool-name/root-filesystem-object-number'. Convert this
921 * string to a dataset name: "rootpool-name/root-filesystem-name".
923 static int
924 zfs_parse_bootfs(char *bpath, char *outpath)
926 char *slashp;
927 uint64_t objnum;
928 int error;
930 if (*bpath == 0 || *bpath == '/')
931 return (EINVAL);
933 (void) strcpy(outpath, bpath);
935 slashp = strchr(bpath, '/');
937 /* if no '/', just return the pool name */
938 if (slashp == NULL) {
939 return (0);
942 /* if not a number, just return the root dataset name */
943 if (str_to_uint64(slashp+1, &objnum)) {
944 return (0);
947 *slashp = '\0';
948 error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
949 *slashp = '/';
951 return (error);
954 #ifndef __NetBSD__
955 static int
956 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
958 int error = 0;
959 static int zfsrootdone = 0;
960 zfsvfs_t *zfsvfs = NULL;
961 znode_t *zp = NULL;
962 vnode_t *vp = NULL;
963 char *zfs_bootfs;
964 char *zfs_devid;
966 ASSERT(vfsp);
969 * The filesystem that we mount as root is defined in the
970 * boot property "zfs-bootfs" with a format of
971 * "poolname/root-dataset-objnum".
973 if (why == ROOT_INIT) {
974 if (zfsrootdone++)
975 return (EBUSY);
977 * the process of doing a spa_load will require the
978 * clock to be set before we could (for example) do
979 * something better by looking at the timestamp on
980 * an uberblock, so just set it to -1.
982 clkset(-1);
984 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
985 cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
986 "bootfs name");
987 return (EINVAL);
989 zfs_devid = spa_get_bootprop("diskdevid");
990 error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
991 if (zfs_devid)
992 spa_free_bootprop(zfs_devid);
993 if (error) {
994 spa_free_bootprop(zfs_bootfs);
995 cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
996 error);
997 return (error);
999 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1000 spa_free_bootprop(zfs_bootfs);
1001 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1002 error);
1003 return (error);
1006 spa_free_bootprop(zfs_bootfs);
1008 if (error = vfs_lock(vfsp))
1009 return (error);
1011 if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1012 cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1013 goto out;
1016 zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1017 ASSERT(zfsvfs);
1018 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1019 cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1020 goto out;
1023 vp = ZTOV(zp);
1024 mutex_enter(&vp->v_lock);
1025 vp->v_flag |= VROOT;
1026 mutex_exit(&vp->v_lock);
1027 rootvp = vp;
1030 * Leave rootvp held. The root file system is never unmounted.
1033 vfs_add((struct vnode *)0, vfsp,
1034 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1035 out:
1036 vfs_unlock(vfsp);
1037 return (error);
1038 } else if (why == ROOT_REMOUNT) {
1039 readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1040 vfsp->vfs_flag |= VFS_REMOUNT;
1042 /* refresh mount options */
1043 zfs_unregister_callbacks(vfsp->vfs_data);
1044 return (zfs_register_callbacks(vfsp));
1046 } else if (why == ROOT_UNMOUNT) {
1047 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1048 (void) zfs_sync(vfsp, 0, 0);
1049 return (0);
1053 * if "why" is equal to anything else other than ROOT_INIT,
1054 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1056 return (ENOTSUP);
1058 #endif /*__NetBSD__ */
1060 /*ARGSUSED*/
1061 static int
1062 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
1064 char *osname;
1065 pathname_t spn;
1066 vnode_t *mvp = vfsp->mnt_vnodecovered;
1067 struct mounta *uap = data;
1068 int error = 0;
1069 int canwrite;
1070 cred_t *cr;
1072 crget(cr);
1073 dprintf("zfs_vfsops.c zfs_mount called\n");
1074 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
1075 if (mvp->v_type != VDIR)
1076 return (ENOTDIR);
1078 mutex_enter(&mvp->v_interlock);
1079 if ((uap->flags & MS_REMOUNT) == 0 &&
1080 (uap->flags & MS_OVERLAY) == 0 &&
1081 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1082 mutex_exit(&mvp->v_interlock);
1083 return (EBUSY);
1085 mutex_exit(&mvp->v_interlock);
1088 * ZFS does not support passing unparsed data in via MS_DATA.
1089 * Users should use the MS_OPTIONSTR interface; this means
1090 * that all option parsing is already done and the options struct
1091 * can be interrogated.
1093 if ((uap->flags & MS_DATA) && uap->datalen > 0)
1094 return (EINVAL);
1096 osname = PNBUF_GET();
1098 strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
1101 * Check for mount privilege?
1103 * If we don't have privilege then see if
1104 * we have local permission to allow it
1106 error = secpolicy_fs_mount(cr, mvp, vfsp);
1107 if (error) {
1108 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
1109 if (error == 0) {
1110 vattr_t vattr;
1113 * Make sure user is the owner of the mount point
1114 * or has sufficient privileges.
1117 vattr.va_mask = AT_UID;
1119 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1120 goto out;
1123 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1124 VOP_ACCESS(mvp, VWRITE, cr) != 0) {
1125 error = EPERM;
1126 goto out;
1129 /* XXX NetBSD secpolicy_fs_mount_clearopts(cr, vfsp);*/
1130 } else {
1131 goto out;
1136 * Refuse to mount a filesystem if we are in a local zone and the
1137 * dataset is not visible.
1139 if (!INGLOBALZONE(curproc) &&
1140 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1141 error = EPERM;
1142 goto out;
1145 dprintf("uap->flags %d -- mflag %d -- MS_REMOUNT %d -- MTN_UPDATE %d\n", uap->flags, uap->mflag, MS_REMOUNT, MNT_UPDATE);
1147 * When doing a remount, we simply refresh our temporary properties
1148 * according to those options set in the current VFS options.
1150 if (uap->flags & MS_REMOUNT) {
1151 /* refresh mount options */
1152 zfs_unregister_callbacks(vfsp->vfs_data);
1153 error = zfs_register_callbacks(vfsp);
1154 goto out;
1157 /* Mark ZFS as MP SAFE */
1158 vfsp->mnt_iflag |= IMNT_MPSAFE;
1160 error = zfs_domount(vfsp, osname);
1162 vfs_getnewfsid(vfsp);
1164 /* setup zfs mount info */
1165 strlcpy(vfsp->mnt_stat.f_mntfromname, osname,
1166 sizeof(vfsp->mnt_stat.f_mntfromname));
1167 set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
1168 UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
1171 out:
1172 PNBUF_PUT(osname);
1173 return (error);
1176 static int
1177 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
1179 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1180 dev_t dev;
1181 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1183 ZFS_ENTER(zfsvfs);
1185 dmu_objset_space(zfsvfs->z_os,
1186 &refdbytes, &availbytes, &usedobjs, &availobjs);
1189 * The underlying storage pool actually uses multiple block sizes.
1190 * We report the fragsize as the smallest block size we support,
1191 * and we report our blocksize as the filesystem's maximum blocksize.
1193 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1194 statp->f_bsize = zfsvfs->z_max_blksz;
1197 * The following report "total" blocks of various kinds in the
1198 * file system, but reported in terms of f_frsize - the
1199 * "fragment" size.
1202 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1203 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1204 statp->f_bavail = statp->f_bfree; /* no root reservation */
1207 * statvfs() should really be called statufs(), because it assumes
1208 * static metadata. ZFS doesn't preallocate files, so the best
1209 * we can do is report the max that could possibly fit in f_files,
1210 * and that minus the number actually used in f_ffree.
1211 * For f_ffree, report the smaller of the number of object available
1212 * and the number of blocks (each object will take at least a block).
1214 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1215 statp->f_favail = statp->f_ffree; /* no "root reservation" */
1216 statp->f_files = statp->f_ffree + usedobjs;
1218 statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
1221 * We're a zfs filesystem.
1223 (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
1224 (void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1225 sizeof(statp->f_mntfromname));
1226 (void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1227 sizeof(statp->f_mntonname));
1229 statp->f_namemax = ZFS_MAXNAMELEN;
1232 * We have all of 32 characters to stuff a string here.
1233 * Is there anything useful we could/should provide?
1235 #ifndef __NetBSD__
1236 bzero(statp->f_fstr, sizeof (statp->f_fstr));
1237 #endif
1238 ZFS_EXIT(zfsvfs);
1239 return (0);
1242 static int
1243 zfs_root(vfs_t *vfsp, vnode_t **vpp)
1245 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1246 znode_t *rootzp;
1247 int error;
1249 ZFS_ENTER(zfsvfs);
1250 dprintf("zfs_root called\n");
1251 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1252 if (error == 0)
1253 *vpp = ZTOV(rootzp);
1254 dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp);
1255 ZFS_EXIT(zfsvfs);
1256 return (error);
1260 * Teardown the zfsvfs::z_os.
1262 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1263 * and 'z_teardown_inactive_lock' held.
1265 static int
1266 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1268 znode_t *zp;
1270 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1272 if (!unmounting) {
1274 * We purge the parent filesystem's vfsp as the parent
1275 * filesystem and all of its snapshots have their vnode's
1276 * v_vfsp set to the parent's filesystem's vfsp. Note,
1277 * 'z_parent' is self referential for non-snapshots.
1279 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1283 * Close the zil. NB: Can't close the zil while zfs_inactive
1284 * threads are blocked as zil_close can call zfs_inactive.
1286 if (zfsvfs->z_log) {
1287 zil_close(zfsvfs->z_log);
1288 zfsvfs->z_log = NULL;
1291 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1294 * If we are not unmounting (ie: online recv) and someone already
1295 * unmounted this file system while we were doing the switcheroo,
1296 * or a reopen of z_os failed then just bail out now.
1298 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1299 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1300 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1301 return (EIO);
1305 * At this point there are no vops active, and any new vops will
1306 * fail with EIO since we have z_teardown_lock for writer (only
1307 * relavent for forced unmount).
1309 * Release all holds on dbufs.
1311 mutex_enter(&zfsvfs->z_znodes_lock);
1312 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1313 zp = list_next(&zfsvfs->z_all_znodes, zp))
1314 if (zp->z_dbuf) {
1315 ASSERT(ZTOV(zp)->v_count > 0);
1316 zfs_znode_dmu_fini(zp);
1318 mutex_exit(&zfsvfs->z_znodes_lock);
1321 * If we are unmounting, set the unmounted flag and let new vops
1322 * unblock. zfs_inactive will have the unmounted behavior, and all
1323 * other vops will fail with EIO.
1325 if (unmounting) {
1326 zfsvfs->z_unmounted = B_TRUE;
1327 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1328 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1332 * z_os will be NULL if there was an error in attempting to reopen
1333 * zfsvfs, so just return as the properties had already been
1334 * unregistered and cached data had been evicted before.
1336 if (zfsvfs->z_os == NULL)
1337 return (0);
1340 * Unregister properties.
1342 zfs_unregister_callbacks(zfsvfs);
1345 * Evict cached data
1347 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
1348 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1349 (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1352 return (0);
1355 /*ARGSUSED*/
1356 static int
1357 zfs_umount(vfs_t *vfsp, int fflag)
1359 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1360 objset_t *os;
1361 int ret, flags = 0;
1362 cred_t *cr;
1364 vnode_t *vpp;
1365 int counter;
1367 counter = 0;
1369 dprintf("ZFS_UMOUNT called\n");
1371 /*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) {
1372 printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp);
1373 vprint("ZFS vfsp vnode list", vpp);
1374 counter++;
1375 } */
1377 crget(cr);
1378 #ifdef TODO
1379 ret = secpolicy_fs_unmount(cr, vfsp);
1380 if (ret) {
1381 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1382 ZFS_DELEG_PERM_MOUNT, cr);
1383 if (ret)
1384 return (ret);
1386 #endif
1388 * We purge the parent filesystem's vfsp as the parent filesystem
1389 * and all of its snapshots have their vnode's v_vfsp set to the
1390 * parent's filesystem's vfsp. Note, 'z_parent' is self
1391 * referential for non-snapshots.
1393 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1396 * Unmount any snapshots mounted under .zfs before unmounting the
1397 * dataset itself.
1399 if (zfsvfs->z_ctldir != NULL &&
1400 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1401 return (ret);
1404 #if 0
1405 if (!(fflag & MS_FORCE)) {
1407 * Check the number of active vnodes in the file system.
1408 * Our count is maintained in the vfs structure, but the
1409 * number is off by 1 to indicate a hold on the vfs
1410 * structure itself.
1412 * The '.zfs' directory maintains a reference of its
1413 * own, and any active references underneath are
1414 * reflected in the vnode count.
1416 if (zfsvfs->z_ctldir == NULL) {
1417 if (vfsp->vfs_count > 1){
1418 return (EBUSY);
1420 } else {
1421 if (vfsp->vfs_count > 2 ||
1422 zfsvfs->z_ctldir->v_count > 1) {
1423 return (EBUSY);
1427 #endif
1428 vfsp->vfs_flag |= VFS_UNMOUNTED;
1430 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1431 os = zfsvfs->z_os;
1434 * z_os will be NULL if there was an error in
1435 * attempting to reopen zfsvfs.
1437 if (os != NULL) {
1439 * Unset the objset user_ptr.
1441 mutex_enter(&os->os->os_user_ptr_lock);
1442 dmu_objset_set_user(os, NULL);
1443 mutex_exit(&os->os->os_user_ptr_lock);
1446 * Finally release the objset
1448 dmu_objset_close(os);
1452 * We can now safely destroy the '.zfs' directory node.
1454 if (zfsvfs->z_ctldir != NULL)
1455 zfsctl_destroy(zfsvfs);
1457 if (fflag & MS_FORCE)
1458 flags |= FORCECLOSE;
1460 ret = vflush(vfsp, NULL, 0);
1461 if (ret != 0)
1462 return ret;
1464 return (0);
1467 static int
1468 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp)
1470 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1471 znode_t *zp;
1472 int err;
1474 dprintf("zfs_vget called\n");
1475 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
1477 ZFS_ENTER(zfsvfs);
1478 err = zfs_zget(zfsvfs, ino, &zp);
1479 if (err == 0 && zp->z_unlinked) {
1480 VN_RELE(ZTOV(zp));
1481 err = EINVAL;
1483 if (err != 0)
1484 *vpp = NULL;
1485 else {
1486 *vpp = ZTOV(zp);
1487 /* XXX NetBSD how to get flags for vn_lock ? */
1488 vn_lock(*vpp, 0);
1490 ZFS_EXIT(zfsvfs);
1491 return (err);
1494 static int
1495 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
1497 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1498 znode_t *zp;
1499 uint64_t object = 0;
1500 uint64_t fid_gen = 0;
1501 uint64_t gen_mask;
1502 uint64_t zp_gen;
1503 int i, err;
1505 *vpp = NULL;
1507 dprintf("zfs_fhtovp called\n");
1508 dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
1510 ZFS_ENTER(zfsvfs);
1512 if (fidp->fid_len == LONG_FID_LEN) {
1513 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1514 uint64_t objsetid = 0;
1515 uint64_t setgen = 0;
1517 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1518 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1520 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1521 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1523 ZFS_EXIT(zfsvfs);
1525 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1526 if (err)
1527 return (EINVAL);
1528 ZFS_ENTER(zfsvfs);
1531 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1532 zfid_short_t *zfid = (zfid_short_t *)fidp;
1534 for (i = 0; i < sizeof (zfid->zf_object); i++)
1535 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1537 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1538 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1539 } else {
1540 ZFS_EXIT(zfsvfs);
1541 return (EINVAL);
1544 /* A zero fid_gen means we are in the .zfs control directories */
1545 if (fid_gen == 0 &&
1546 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1547 *vpp = zfsvfs->z_ctldir;
1548 ASSERT(*vpp != NULL);
1549 if (object == ZFSCTL_INO_SNAPDIR) {
1550 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1551 0, NULL, NULL, NULL, NULL, NULL) == 0);
1552 } else {
1553 VN_HOLD(*vpp);
1555 ZFS_EXIT(zfsvfs);
1556 /* XXX: LK_RETRY? */
1557 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1558 return (0);
1561 gen_mask = -1ULL >> (64 - 8 * i);
1563 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1564 if (err = zfs_zget(zfsvfs, object, &zp)) {
1565 ZFS_EXIT(zfsvfs);
1566 return (err);
1568 zp_gen = zp->z_phys->zp_gen & gen_mask;
1569 if (zp_gen == 0)
1570 zp_gen = 1;
1571 if (zp->z_unlinked || zp_gen != fid_gen) {
1572 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1573 VN_RELE(ZTOV(zp));
1574 ZFS_EXIT(zfsvfs);
1575 return (EINVAL);
1578 *vpp = ZTOV(zp);
1579 /* XXX: LK_RETRY? */
1580 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1581 ZFS_EXIT(zfsvfs);
1582 return (0);
1586 * Block out VOPs and close zfsvfs_t::z_os
1588 * Note, if successful, then we return with the 'z_teardown_lock' and
1589 * 'z_teardown_inactive_lock' write held.
1592 zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1594 int error;
1596 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1597 return (error);
1599 *mode = zfsvfs->z_os->os_mode;
1600 dmu_objset_name(zfsvfs->z_os, name);
1601 dmu_objset_close(zfsvfs->z_os);
1603 return (0);
1607 * Reopen zfsvfs_t::z_os and release VOPs.
1610 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1612 int err;
1614 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1615 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1617 err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1618 if (err) {
1619 zfsvfs->z_os = NULL;
1620 } else {
1621 znode_t *zp;
1623 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1626 * Attempt to re-establish all the active znodes with
1627 * their dbufs. If a zfs_rezget() fails, then we'll let
1628 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1629 * when they try to use their znode.
1631 mutex_enter(&zfsvfs->z_znodes_lock);
1632 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1633 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1634 (void) zfs_rezget(zp);
1636 mutex_exit(&zfsvfs->z_znodes_lock);
1640 /* release the VOPs */
1641 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1642 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1644 if (err) {
1646 * Since we couldn't reopen zfsvfs::z_os, force
1647 * unmount this file system.
1649 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1650 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp);
1652 return (err);
1655 static void
1656 zfs_freevfs(vfs_t *vfsp)
1658 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1659 int i;
1661 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1662 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1664 zfs_fuid_destroy(zfsvfs);
1665 zfs_freezfsvfs(zfsvfs);
1667 atomic_add_32(&zfs_active_fs_count, -1);
1671 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
1672 * so we can't safely do any non-idempotent initialization here.
1673 * Leave that to zfs_init() and zfs_fini(), which are called
1674 * from the module's _init() and _fini() entry points.
1676 /*ARGSUSED*/
1678 zfs_vfsinit(int fstype, char *name)
1680 int error;
1682 zfsfstype = fstype;
1685 * Setup vfsops and vnodeops tables.
1687 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
1689 error = zfs_create_op_tables();
1690 if (error) {
1691 zfs_remove_op_tables();
1692 cmn_err(CE_WARN, "zfs: bad vnode ops template");
1693 vfs_freevfsops_by_type(zfsfstype);
1694 return (error);
1697 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1698 mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL);
1701 * Unique major number for all zfs mounts.
1702 * If we run out of 32-bit minors, we'll getudev() another major.
1704 zfs_major = ddi_name_to_major(ZFS_DRIVER);
1705 zfs_minor = ZFS_MIN_MINOR;
1707 return (0);
1711 zfs_vfsfini(void)
1713 int err;
1715 err = vfs_detach(&zfs_vfsops_template);
1716 if (err != 0)
1717 return err;
1719 mutex_destroy(&zfs_debug_mtx);
1720 mutex_destroy(&zfs_dev_mtx);
1722 return 0;
/*
 * Module-level initialization: set up the .zfs control directory
 * support first, then the znode layer.
 */
void
zfs_init(void)
{
	/* .zfs directory structures */
	zfsctl_init();

	/* znode cache, vnode ops, etc... */
	zfs_znode_init();
}
/*
 * Module-level teardown: finish the .zfs control directory support,
 * then the znode layer.
 */
void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();
}
1747 zfs_busy(void)
1749 return (zfs_active_fs_count != 0);
1753 zfs_set_version(const char *name, uint64_t newvers)
1755 int error;
1756 objset_t *os;
1757 dmu_tx_t *tx;
1758 uint64_t curvers;
1761 * XXX for now, require that the filesystem be unmounted. Would
1762 * be nice to find the zfsvfs_t and just update that if
1763 * possible.
1766 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1767 return (EINVAL);
1769 error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1770 if (error)
1771 return (error);
1773 error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1774 8, 1, &curvers);
1775 if (error)
1776 goto out;
1777 if (newvers < curvers) {
1778 error = EINVAL;
1779 goto out;
1782 tx = dmu_tx_create(os);
1783 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1784 error = dmu_tx_assign(tx, TXG_WAIT);
1785 if (error) {
1786 dmu_tx_abort(tx);
1787 goto out;
1789 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1790 &newvers, tx);
1792 spa_history_internal_log(LOG_DS_UPGRADE,
1793 dmu_objset_spa(os), tx, CRED(),
1794 "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1795 dmu_objset_id(os));
1796 dmu_tx_commit(tx);
1798 out:
1799 dmu_objset_close(os);
1800 return (error);
1804 * Read a property stored within the master node.
1807 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1809 const char *pname;
1810 int error = ENOENT;
1813 * Look up the file system's value for the property. For the
1814 * version property, we look up a slightly different string.
1816 if (prop == ZFS_PROP_VERSION)
1817 pname = ZPL_VERSION_STR;
1818 else
1819 pname = zfs_prop_to_name(prop);
1821 if (os != NULL)
1822 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1824 if (error == ENOENT) {
1825 /* No value set, use the default value */
1826 switch (prop) {
1827 case ZFS_PROP_VERSION:
1828 *value = ZPL_VERSION;
1829 break;
1830 case ZFS_PROP_NORMALIZE:
1831 case ZFS_PROP_UTF8ONLY:
1832 *value = 0;
1833 break;
1834 case ZFS_PROP_CASE:
1835 *value = ZFS_CASE_SENSITIVE;
1836 break;
1837 default:
1838 return (error);
1840 error = 0;
1842 return (error);
1845 static int
1846 zfs_start(vfs_t *vfsp, int flags)
1849 return (0);
#ifdef TODO
/*
 * Module linkage definitions.  Compiled only when TODO is defined.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,
	zfs_vfsinit,
	VSW_HASPROTO |
	    VSW_CANRWRO |
	    VSW_CANREMOUNT |
	    VSW_VOLATILEDEV |
	    VSW_STATS |
	    VSW_XID,
	&zfs_mntopts
};

struct modlfs zfs_modlfs = {
	&mod_fsops,
	"ZFS filesystem version " SPA_VERSION_STRING,
	&vfw
};
#endif