4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
28 * ZFS control directory (a.k.a. ".zfs")
30 * This directory provides a common location for all ZFS meta-objects.
31 * Currently, this is only the 'snapshot' directory, but this may expand in the
32 * future. The elements are built using the GFS primitives, as the hierarchy
33 * does not actually exist on disk.
35 * For 'snapshot', we don't want to have all snapshots always mounted, because
36 * this would take up a huge amount of space in /etc/mnttab. We have three
39 * ctldir ------> snapshotdir -------> snapshot
45 * The 'snapshot' node contains just enough information to lookup '..' and act
46 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
47 * perform an automount of the underlying filesystem and return the
48 * corresponding vnode.
50 * All mounts are handled automatically by the kernel, but unmounts are
51 * (currently) handled from user land. The main reason is that there is no
52 * reliable way to auto-unmount the filesystem when it's "no longer in use".
53 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
54 * unmounts any snapshots within the snapshot directory.
56 * The '.zfs', '.zfs/snapshot', and all directories created under
57 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
58 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
60 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
61 * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
62 * However, vnodes within these mounted on file systems have their v_vfsp
63 * fields set to the head filesystem to make NFS happy (see
64 * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
65 * so that it cannot be freed until all snapshots have been unmounted.
68 #include <sys/types.h>
69 #include <sys/param.h>
70 #include <sys/libkern.h>
71 #include <sys/dirent.h>
72 #include <sys/zfs_context.h>
73 #include <sys/zfs_ctldir.h>
74 #include <sys/zfs_ioctl.h>
75 #include <sys/zfs_vfsops.h>
76 #include <sys/namei.h>
79 #include <sys/dsl_dataset.h>
80 #include <sys/dsl_destroy.h>
81 #include <sys/dsl_deleg.h>
82 #include <sys/mount.h>
84 #include <sys/sysproto.h>
86 #include "zfs_namecheck.h"
88 #include <sys/kernel.h>
89 #include <sys/ccompat.h>
91 /* Common access mode for all virtual directories under the ctldir */
92 const uint16_t zfsctl_ctldir_mode
= S_IRUSR
| S_IXUSR
| S_IRGRP
| S_IXGRP
|
96 * "Synthetic" filesystem implementation.
100 * Assert that A implies B.
102 #define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
104 static MALLOC_DEFINE(M_SFSNODES
, "sfs_nodes", "synthetic-fs nodes");
106 typedef struct sfs_node
{
107 char sn_name
[ZFS_MAX_DATASET_NAME_LEN
];
108 uint64_t sn_parent_id
;
113 * Check the parent's ID as well as the node's to account for a chance
114 * that IDs originating from different domains (snapshot IDs, artificial
115 * IDs, znode IDs) may clash.
118 sfs_compare_ids(struct vnode
*vp
, void *arg
)
120 sfs_node_t
*n1
= vp
->v_data
;
121 sfs_node_t
*n2
= arg
;
124 equal
= n1
->sn_id
== n2
->sn_id
&&
125 n1
->sn_parent_id
== n2
->sn_parent_id
;
127 /* Zero means equality. */
132 sfs_vnode_get(const struct mount
*mp
, int flags
, uint64_t parent_id
,
133 uint64_t id
, struct vnode
**vpp
)
139 search
.sn_parent_id
= parent_id
;
140 err
= vfs_hash_get(mp
, (uint32_t)id
, flags
, curthread
, vpp
,
141 sfs_compare_ids
, &search
);
146 sfs_vnode_insert(struct vnode
*vp
, int flags
, uint64_t parent_id
,
147 uint64_t id
, struct vnode
**vpp
)
151 KASSERT(vp
->v_data
!= NULL
, ("sfs_vnode_insert with NULL v_data"));
152 err
= vfs_hash_insert(vp
, (uint32_t)id
, flags
, curthread
, vpp
,
153 sfs_compare_ids
, vp
->v_data
);
158 sfs_vnode_remove(struct vnode
*vp
)
163 typedef void sfs_vnode_setup_fn(vnode_t
*vp
, void *arg
);
166 sfs_vgetx(struct mount
*mp
, int flags
, uint64_t parent_id
, uint64_t id
,
167 const char *tag
, struct vop_vector
*vops
,
168 sfs_vnode_setup_fn setup
, void *arg
,
174 error
= sfs_vnode_get(mp
, flags
, parent_id
, id
, vpp
);
175 if (error
!= 0 || *vpp
!= NULL
) {
176 KASSERT_IMPLY(error
== 0, (*vpp
)->v_data
!= NULL
,
177 "sfs vnode with no data");
181 /* Allocate a new vnode/inode. */
182 error
= getnewvnode(tag
, mp
, vops
, &vp
);
189 * Exclusively lock the vnode while it's being constructed.
191 lockmgr(vp
->v_vnlock
, LK_EXCLUSIVE
, NULL
);
192 error
= insmntque(vp
, mp
);
200 error
= sfs_vnode_insert(vp
, flags
, parent_id
, id
, vpp
);
201 if (error
!= 0 || *vpp
!= NULL
) {
202 KASSERT_IMPLY(error
== 0, (*vpp
)->v_data
!= NULL
,
203 "sfs vnode with no data");
207 #if __FreeBSD_version >= 1400077
208 vn_set_state(vp
, VSTATE_CONSTRUCTED
);
216 sfs_print_node(sfs_node_t
*node
)
218 printf("\tname = %s\n", node
->sn_name
);
219 printf("\tparent_id = %ju\n", (uintmax_t)node
->sn_parent_id
);
220 printf("\tid = %ju\n", (uintmax_t)node
->sn_id
);
224 sfs_alloc_node(size_t size
, const char *name
, uint64_t parent_id
, uint64_t id
)
226 struct sfs_node
*node
;
228 KASSERT(strlen(name
) < sizeof (node
->sn_name
),
229 ("sfs node name is too long"));
230 KASSERT(size
>= sizeof (*node
), ("sfs node size is too small"));
231 node
= malloc(size
, M_SFSNODES
, M_WAITOK
| M_ZERO
);
232 strlcpy(node
->sn_name
, name
, sizeof (node
->sn_name
));
233 node
->sn_parent_id
= parent_id
;
240 sfs_destroy_node(sfs_node_t
*node
)
242 free(node
, M_SFSNODES
);
246 sfs_reclaim_vnode(vnode_t
*vp
)
250 sfs_vnode_remove(vp
);
257 sfs_readdir_common(uint64_t parent_id
, uint64_t id
, struct vop_readdir_args
*ap
,
258 zfs_uio_t
*uio
, off_t
*offp
)
263 /* Reset ncookies for subsequent use of vfs_read_dirent. */
264 if (ap
->a_ncookies
!= NULL
)
267 if (zfs_uio_resid(uio
) < sizeof (entry
))
268 return (SET_ERROR(EINVAL
));
270 if (zfs_uio_offset(uio
) < 0)
271 return (SET_ERROR(EINVAL
));
272 if (zfs_uio_offset(uio
) == 0) {
274 entry
.d_type
= DT_DIR
;
275 entry
.d_name
[0] = '.';
276 entry
.d_name
[1] = '\0';
278 entry
.d_reclen
= sizeof (entry
);
279 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(uio
));
281 return (SET_ERROR(error
));
284 if (zfs_uio_offset(uio
) < sizeof (entry
))
285 return (SET_ERROR(EINVAL
));
286 if (zfs_uio_offset(uio
) == sizeof (entry
)) {
287 entry
.d_fileno
= parent_id
;
288 entry
.d_type
= DT_DIR
;
289 entry
.d_name
[0] = '.';
290 entry
.d_name
[1] = '.';
291 entry
.d_name
[2] = '\0';
293 entry
.d_reclen
= sizeof (entry
);
294 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(uio
));
296 return (SET_ERROR(error
));
300 *offp
= 2 * sizeof (entry
);
306 * .zfs inode namespace
308 * We need to generate unique inode numbers for all files and directories
309 * within the .zfs pseudo-filesystem. We use the following scheme:
314 * .zfs/snapshot/<snap> objectid(snap)
316 #define ZFSCTL_INO_SNAP(id) (id)
318 static struct vop_vector zfsctl_ops_root
;
319 static struct vop_vector zfsctl_ops_snapdir
;
320 static struct vop_vector zfsctl_ops_snapshot
;
333 zfsctl_is_node(vnode_t
*vp
)
335 return (vn_matchops(vp
, zfsctl_ops_root
) ||
336 vn_matchops(vp
, zfsctl_ops_snapdir
) ||
337 vn_matchops(vp
, zfsctl_ops_snapshot
));
341 typedef struct zfsctl_root
{
349 * Create the '.zfs' directory.
352 zfsctl_create(zfsvfs_t
*zfsvfs
)
354 zfsctl_root_t
*dot_zfs
;
359 ASSERT3P(zfsvfs
->z_ctldir
, ==, NULL
);
361 snapdir
= sfs_alloc_node(sizeof (*snapdir
), "snapshot", ZFSCTL_INO_ROOT
,
363 dot_zfs
= (zfsctl_root_t
*)sfs_alloc_node(sizeof (*dot_zfs
), ".zfs", 0,
365 dot_zfs
->snapdir
= snapdir
;
367 VERIFY0(VFS_ROOT(zfsvfs
->z_vfs
, LK_EXCLUSIVE
, &rvp
));
368 VERIFY0(sa_lookup(VTOZ(rvp
)->z_sa_hdl
, SA_ZPL_CRTIME(zfsvfs
),
369 &crtime
, sizeof (crtime
)));
370 ZFS_TIME_DECODE(&dot_zfs
->cmtime
, crtime
);
373 zfsvfs
->z_ctldir
= dot_zfs
;
377 * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
378 * The nodes must not have any associated vnodes by now as they should be
382 zfsctl_destroy(zfsvfs_t
*zfsvfs
)
384 sfs_destroy_node(zfsvfs
->z_ctldir
->snapdir
);
385 sfs_destroy_node((sfs_node_t
*)zfsvfs
->z_ctldir
);
386 zfsvfs
->z_ctldir
= NULL
;
390 zfsctl_fs_root_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
393 return (VFS_ROOT(mp
, flags
, vpp
));
397 zfsctl_common_vnode_setup(vnode_t
*vp
, void *arg
)
399 ASSERT_VOP_ELOCKED(vp
, __func__
);
401 /* We support shared locking. */
408 zfsctl_root_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
414 node
= ((zfsvfs_t
*)mp
->mnt_data
)->z_ctldir
;
415 err
= sfs_vgetx(mp
, flags
, 0, ZFSCTL_INO_ROOT
, "zfs", &zfsctl_ops_root
,
416 zfsctl_common_vnode_setup
, node
, vpp
);
421 zfsctl_snapdir_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
427 node
= ((zfsvfs_t
*)mp
->mnt_data
)->z_ctldir
->snapdir
;
428 err
= sfs_vgetx(mp
, flags
, ZFSCTL_INO_ROOT
, ZFSCTL_INO_SNAPDIR
, "zfs",
429 &zfsctl_ops_snapdir
, zfsctl_common_vnode_setup
, node
, vpp
);
434 * Given a root znode, retrieve the associated .zfs directory.
435 * Add a hold to the vnode and return it.
438 zfsctl_root(zfsvfs_t
*zfsvfs
, int flags
, vnode_t
**vpp
)
442 error
= zfsctl_root_vnode(zfsvfs
->z_vfs
, NULL
, flags
, vpp
);
447 * Common open routine. Disallow any write access.
450 zfsctl_common_open(struct vop_open_args
*ap
)
452 int flags
= ap
->a_mode
;
455 return (SET_ERROR(EACCES
));
461 * Common close routine. Nothing to do here.
464 zfsctl_common_close(struct vop_close_args
*ap
)
471 * Common access routine. Disallow writes.
474 zfsctl_common_access(struct vop_access_args
*ap
)
476 accmode_t accmode
= ap
->a_accmode
;
478 if (accmode
& VWRITE
)
479 return (SET_ERROR(EACCES
));
484 * Common getattr function. Fill in basic information.
487 zfsctl_common_getattr(vnode_t
*vp
, vattr_t
*vap
)
498 * We are a purely virtual object, so we have no
499 * blocksize or allocated blocks.
505 vap
->va_mode
= zfsctl_ctldir_mode
;
508 * We live in the now (for atime).
512 /* FreeBSD: Reset chflags(2) flags. */
515 vap
->va_nodeid
= node
->sn_id
;
517 /* At least '.' and '..'. */
521 #ifndef _OPENSOLARIS_SYS_VNODE_H_
522 struct vop_fid_args
{
529 zfsctl_common_fid(struct vop_fid_args
*ap
)
531 vnode_t
*vp
= ap
->a_vp
;
532 fid_t
*fidp
= (void *)ap
->a_fid
;
533 sfs_node_t
*node
= vp
->v_data
;
534 uint64_t object
= node
->sn_id
;
538 zfid
= (zfid_short_t
*)fidp
;
539 zfid
->zf_len
= SHORT_FID_LEN
;
541 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
542 zfid
->zf_object
[i
] = (uint8_t)(object
>> (8 * i
));
544 /* .zfs nodes always have a generation number of 0 */
545 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
551 #ifndef _SYS_SYSPROTO_H_
552 struct vop_reclaim_args
{
559 zfsctl_common_reclaim(struct vop_reclaim_args
*ap
)
561 vnode_t
*vp
= ap
->a_vp
;
563 (void) sfs_reclaim_vnode(vp
);
567 #ifndef _SYS_SYSPROTO_H_
568 struct vop_print_args
{
574 zfsctl_common_print(struct vop_print_args
*ap
)
576 sfs_print_node(ap
->a_vp
->v_data
);
580 #ifndef _SYS_SYSPROTO_H_
581 struct vop_getattr_args
{
584 struct ucred
*a_cred
;
589 * Get root directory attributes.
592 zfsctl_root_getattr(struct vop_getattr_args
*ap
)
594 struct vnode
*vp
= ap
->a_vp
;
595 struct vattr
*vap
= ap
->a_vap
;
596 zfsctl_root_t
*node
= vp
->v_data
;
598 zfsctl_common_getattr(vp
, vap
);
599 vap
->va_ctime
= node
->cmtime
;
600 vap
->va_mtime
= vap
->va_ctime
;
601 vap
->va_birthtime
= vap
->va_ctime
;
602 vap
->va_nlink
+= 1; /* snapdir */
603 vap
->va_size
= vap
->va_nlink
;
608 * When we lookup "." we still can be asked to lock it
609 * differently, can't we?
612 zfsctl_relock_dot(vnode_t
*dvp
, int ltype
)
615 if (ltype
!= VOP_ISLOCKED(dvp
)) {
616 if (ltype
== LK_EXCLUSIVE
)
617 vn_lock(dvp
, LK_UPGRADE
| LK_RETRY
);
618 else /* if (ltype == LK_SHARED) */
619 vn_lock(dvp
, LK_DOWNGRADE
| LK_RETRY
);
621 /* Relocking for the "." case may have left us with a reclaimed vnode. */
622 if (VN_IS_DOOMED(dvp
)) {
624 return (SET_ERROR(ENOENT
));
631 * Special case the handling of "..".
634 zfsctl_root_lookup(struct vop_lookup_args
*ap
)
636 struct componentname
*cnp
= ap
->a_cnp
;
637 vnode_t
*dvp
= ap
->a_dvp
;
638 vnode_t
**vpp
= ap
->a_vpp
;
639 int flags
= ap
->a_cnp
->cn_flags
;
640 int lkflags
= ap
->a_cnp
->cn_lkflags
;
641 int nameiop
= ap
->a_cnp
->cn_nameiop
;
644 ASSERT3S(dvp
->v_type
, ==, VDIR
);
646 if ((flags
& ISLASTCN
) != 0 && nameiop
!= LOOKUP
)
647 return (SET_ERROR(ENOTSUP
));
649 if (cnp
->cn_namelen
== 1 && *cnp
->cn_nameptr
== '.') {
650 err
= zfsctl_relock_dot(dvp
, lkflags
& LK_TYPE_MASK
);
653 } else if ((flags
& ISDOTDOT
) != 0) {
654 err
= vn_vget_ino_gen(dvp
, zfsctl_fs_root_vnode
, NULL
,
656 } else if (strncmp(cnp
->cn_nameptr
, "snapshot", cnp
->cn_namelen
) == 0) {
657 err
= zfsctl_snapdir_vnode(dvp
->v_mount
, NULL
, lkflags
, vpp
);
659 err
= SET_ERROR(ENOENT
);
667 zfsctl_root_readdir(struct vop_readdir_args
*ap
)
670 vnode_t
*vp
= ap
->a_vp
;
671 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
672 zfsctl_root_t
*node
= vp
->v_data
;
674 int *eofp
= ap
->a_eofflag
;
678 zfs_uio_init(&uio
, ap
->a_uio
);
680 ASSERT3S(vp
->v_type
, ==, VDIR
);
683 * FIXME: this routine only ever emits 3 entries and does not tolerate
684 * being called with a buffer too small to handle all of them.
686 * The check below facilitates the idiom of repeating calls until the
687 * count to return is 0.
689 if (zfs_uio_offset(&uio
) == 3 * sizeof (entry
)) {
693 error
= sfs_readdir_common(zfsvfs
->z_root
, ZFSCTL_INO_ROOT
, ap
, &uio
,
696 if (error
== ENAMETOOLONG
) /* ran out of destination space */
700 if (zfs_uio_offset(&uio
) != dots_offset
)
701 return (SET_ERROR(EINVAL
));
703 _Static_assert(sizeof (node
->snapdir
->sn_name
) <= sizeof (entry
.d_name
),
704 "node->snapdir->sn_name too big for entry.d_name");
705 entry
.d_fileno
= node
->snapdir
->sn_id
;
706 entry
.d_type
= DT_DIR
;
707 strcpy(entry
.d_name
, node
->snapdir
->sn_name
);
708 entry
.d_namlen
= strlen(entry
.d_name
);
709 entry
.d_reclen
= sizeof (entry
);
710 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(&uio
));
712 if (error
== ENAMETOOLONG
)
714 return (SET_ERROR(error
));
722 zfsctl_root_vptocnp(struct vop_vptocnp_args
*ap
)
724 static const char dotzfs_name
[4] = ".zfs";
728 if (*ap
->a_buflen
< sizeof (dotzfs_name
))
729 return (SET_ERROR(ENOMEM
));
731 error
= vn_vget_ino_gen(ap
->a_vp
, zfsctl_fs_root_vnode
, NULL
,
734 return (SET_ERROR(error
));
738 *ap
->a_buflen
-= sizeof (dotzfs_name
);
739 memcpy(ap
->a_buf
+ *ap
->a_buflen
, dotzfs_name
, sizeof (dotzfs_name
));
744 zfsctl_common_pathconf(struct vop_pathconf_args
*ap
)
747 * We care about ACL variables so that user land utilities like ls
748 * can display them correctly. Since the ctldir's st_dev is set to be
749 * the same as the parent dataset, we must support all variables that
752 switch (ap
->a_name
) {
754 *ap
->a_retval
= MIN(LONG_MAX
, ZFS_LINK_MAX
);
757 case _PC_FILESIZEBITS
:
761 case _PC_MIN_HOLE_SIZE
:
762 *ap
->a_retval
= (int)SPA_MINBLOCKSIZE
;
765 case _PC_ACL_EXTENDED
:
773 case _PC_ACL_PATH_MAX
:
774 *ap
->a_retval
= ACL_MAX_ENTRIES
;
778 *ap
->a_retval
= NAME_MAX
;
782 return (vop_stdpathconf(ap
));
787 * Returns a trivial ACL
790 zfsctl_common_getacl(struct vop_getacl_args
*ap
)
794 if (ap
->a_type
!= ACL_TYPE_NFS4
)
797 acl_nfs4_sync_acl_from_mode(ap
->a_aclp
, zfsctl_ctldir_mode
, 0);
799 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
800 * attributes. That is not the case for the ctldir, so we must clear
801 * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
802 * aren't supported by the ctldir.
804 for (i
= 0; i
< ap
->a_aclp
->acl_cnt
; i
++) {
805 struct acl_entry
*entry
;
806 entry
= &(ap
->a_aclp
->acl_entry
[i
]);
807 entry
->ae_perm
&= ~(ACL_WRITE_ACL
| ACL_WRITE_OWNER
|
808 ACL_WRITE_ATTRIBUTES
| ACL_WRITE_NAMED_ATTRS
|
809 ACL_READ_NAMED_ATTRS
);
815 static struct vop_vector zfsctl_ops_root
= {
816 .vop_default
= &default_vnodeops
,
817 .vop_fplookup_vexec
= VOP_EAGAIN
,
818 .vop_fplookup_symlink
= VOP_EAGAIN
,
819 .vop_open
= zfsctl_common_open
,
820 .vop_close
= zfsctl_common_close
,
821 .vop_ioctl
= VOP_EINVAL
,
822 .vop_getattr
= zfsctl_root_getattr
,
823 .vop_access
= zfsctl_common_access
,
824 .vop_readdir
= zfsctl_root_readdir
,
825 .vop_lookup
= zfsctl_root_lookup
,
826 .vop_inactive
= VOP_NULL
,
827 .vop_reclaim
= zfsctl_common_reclaim
,
828 .vop_fid
= zfsctl_common_fid
,
829 .vop_print
= zfsctl_common_print
,
830 .vop_vptocnp
= zfsctl_root_vptocnp
,
831 .vop_pathconf
= zfsctl_common_pathconf
,
832 .vop_getacl
= zfsctl_common_getacl
,
833 #if __FreeBSD_version >= 1400043
834 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
837 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root
);
840 zfsctl_snapshot_zname(vnode_t
*vp
, const char *name
, int len
, char *zname
)
842 objset_t
*os
= ((zfsvfs_t
*)((vp
)->v_vfsp
->vfs_data
))->z_os
;
844 dmu_objset_name(os
, zname
);
845 if (strlen(zname
) + 1 + strlen(name
) >= len
)
846 return (SET_ERROR(ENAMETOOLONG
));
847 (void) strcat(zname
, "@");
848 (void) strcat(zname
, name
);
853 zfsctl_snapshot_lookup(vnode_t
*vp
, const char *name
, uint64_t *id
)
855 objset_t
*os
= ((zfsvfs_t
*)((vp
)->v_vfsp
->vfs_data
))->z_os
;
858 err
= dsl_dataset_snap_lookup(dmu_objset_ds(os
), name
, id
);
863 * Given a vnode get a root vnode of a filesystem mounted on top of
864 * the vnode, if any. The root vnode is referenced and locked.
865 * If no filesystem is mounted then the original vnode remains referenced
866 * and locked. If any error happens the original vnode is unlocked and
870 zfsctl_mounted_here(vnode_t
**vpp
, int flags
)
875 ASSERT_VOP_LOCKED(*vpp
, __func__
);
876 ASSERT3S((*vpp
)->v_type
, ==, VDIR
);
878 if ((mp
= (*vpp
)->v_mountedhere
) != NULL
) {
879 err
= vfs_busy(mp
, 0);
880 KASSERT(err
== 0, ("vfs_busy(mp, 0) failed with %d", err
));
881 KASSERT(vrefcnt(*vpp
) > 1, ("unreferenced mountpoint"));
883 err
= VFS_ROOT(mp
, flags
, vpp
);
887 return (EJUSTRETURN
);
891 const char *snap_name
;
893 } snapshot_setup_arg_t
;
896 zfsctl_snapshot_vnode_setup(vnode_t
*vp
, void *arg
)
898 snapshot_setup_arg_t
*ssa
= arg
;
901 ASSERT_VOP_ELOCKED(vp
, __func__
);
903 node
= sfs_alloc_node(sizeof (sfs_node_t
),
904 ssa
->snap_name
, ZFSCTL_INO_SNAPDIR
, ssa
->snap_id
);
905 zfsctl_common_vnode_setup(vp
, node
);
907 /* We have to support recursive locking. */
912 * Lookup entry point for the 'snapshot' directory. Try to open the
913 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
914 * Perform a mount of the associated dataset on top of the vnode.
915 * There are four possibilities:
916 * - the snapshot node and vnode do not exist
917 * - the snapshot vnode is covered by the mounted snapshot
918 * - the snapshot vnode is not covered yet, the mount operation is in progress
919 * - the snapshot vnode is not covered, because the snapshot has been unmounted
920 * The last two states are transient and should be relatively short-lived.
923 zfsctl_snapdir_lookup(struct vop_lookup_args
*ap
)
925 vnode_t
*dvp
= ap
->a_dvp
;
926 vnode_t
**vpp
= ap
->a_vpp
;
927 struct componentname
*cnp
= ap
->a_cnp
;
928 char name
[NAME_MAX
+ 1];
929 char fullname
[ZFS_MAX_DATASET_NAME_LEN
];
931 size_t mountpoint_len
;
932 zfsvfs_t
*zfsvfs
= dvp
->v_vfsp
->vfs_data
;
934 int nameiop
= cnp
->cn_nameiop
;
935 int lkflags
= cnp
->cn_lkflags
;
936 int flags
= cnp
->cn_flags
;
939 ASSERT3S(dvp
->v_type
, ==, VDIR
);
941 if ((flags
& ISLASTCN
) != 0 && nameiop
!= LOOKUP
)
942 return (SET_ERROR(ENOTSUP
));
944 if (cnp
->cn_namelen
== 1 && *cnp
->cn_nameptr
== '.') {
945 err
= zfsctl_relock_dot(dvp
, lkflags
& LK_TYPE_MASK
);
950 if (flags
& ISDOTDOT
) {
951 err
= vn_vget_ino_gen(dvp
, zfsctl_root_vnode
, NULL
, lkflags
,
956 if (cnp
->cn_namelen
>= sizeof (name
))
957 return (SET_ERROR(ENAMETOOLONG
));
959 strlcpy(name
, ap
->a_cnp
->cn_nameptr
, ap
->a_cnp
->cn_namelen
+ 1);
960 err
= zfsctl_snapshot_lookup(dvp
, name
, &snap_id
);
962 return (SET_ERROR(ENOENT
));
965 snapshot_setup_arg_t ssa
;
967 ssa
.snap_name
= name
;
968 ssa
.snap_id
= snap_id
;
969 err
= sfs_vgetx(dvp
->v_mount
, LK_SHARED
, ZFSCTL_INO_SNAPDIR
,
970 snap_id
, "zfs", &zfsctl_ops_snapshot
,
971 zfsctl_snapshot_vnode_setup
, &ssa
, vpp
);
975 /* Check if a new vnode has just been created. */
976 if (VOP_ISLOCKED(*vpp
) == LK_EXCLUSIVE
)
980 * Check if a snapshot is already mounted on top of the vnode.
982 err
= zfsctl_mounted_here(vpp
, lkflags
);
983 if (err
!= EJUSTRETURN
)
987 * If the vnode is not covered, then either the mount operation
988 * is in progress or the snapshot has already been unmounted
989 * but the vnode hasn't been inactivated and reclaimed yet.
990 * We can try to re-use the vnode in the latter case.
993 if (((*vpp
)->v_iflag
& VI_MOUNT
) == 0) {
996 * Upgrade to exclusive lock in order to:
997 * - avoid race conditions
998 * - satisfy the contract of mount_snapshot()
1000 err
= VOP_LOCK(*vpp
, LK_TRYUPGRADE
);
1008 * In this state we can loop on uncontested locks and starve
1009 * the thread doing the lengthy, non-trivial mount operation.
1010 * So, yield to prevent that from happening.
1013 kern_yield(PRI_USER
);
1016 VERIFY0(zfsctl_snapshot_zname(dvp
, name
, sizeof (fullname
), fullname
));
1018 mountpoint_len
= strlen(dvp
->v_vfsp
->mnt_stat
.f_mntonname
) +
1019 strlen("/" ZFS_CTLDIR_NAME
"/snapshot/") + strlen(name
) + 1;
1020 mountpoint
= kmem_alloc(mountpoint_len
, KM_SLEEP
);
1021 (void) snprintf(mountpoint
, mountpoint_len
,
1022 "%s/" ZFS_CTLDIR_NAME
"/snapshot/%s",
1023 dvp
->v_vfsp
->mnt_stat
.f_mntonname
, name
);
1025 err
= mount_snapshot(curthread
, vpp
, "zfs", mountpoint
, fullname
, 0,
1027 kmem_free(mountpoint
, mountpoint_len
);
1030 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
1032 * This is where we lie about our v_vfsp in order to
1033 * make .zfs/snapshot/<snapname> accessible over NFS
1034 * without requiring manual mounts of <snapname>.
1036 ASSERT3P(VTOZ(*vpp
)->z_zfsvfs
, !=, zfsvfs
);
1037 VTOZ(*vpp
)->z_zfsvfs
->z_parent
= zfsvfs
;
1039 /* Clear the root flag (set via VFS_ROOT) as well. */
1040 (*vpp
)->v_vflag
&= ~VV_ROOT
;
1049 zfsctl_snapdir_readdir(struct vop_readdir_args
*ap
)
1051 char snapname
[ZFS_MAX_DATASET_NAME_LEN
];
1052 struct dirent entry
;
1053 vnode_t
*vp
= ap
->a_vp
;
1054 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
1056 int *eofp
= ap
->a_eofflag
;
1060 zfs_uio_init(&uio
, ap
->a_uio
);
1062 ASSERT3S(vp
->v_type
, ==, VDIR
);
1064 error
= sfs_readdir_common(ZFSCTL_INO_ROOT
, ZFSCTL_INO_SNAPDIR
, ap
,
1065 &uio
, &dots_offset
);
1067 if (error
== ENAMETOOLONG
) /* ran out of destination space */
1072 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1078 cookie
= zfs_uio_offset(&uio
) - dots_offset
;
1080 dsl_pool_config_enter(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1081 error
= dmu_snapshot_list_next(zfsvfs
->z_os
, sizeof (snapname
),
1082 snapname
, &id
, &cookie
, NULL
);
1083 dsl_pool_config_exit(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1085 if (error
== ENOENT
) {
1090 zfs_exit(zfsvfs
, FTAG
);
1094 entry
.d_fileno
= id
;
1095 entry
.d_type
= DT_DIR
;
1096 strcpy(entry
.d_name
, snapname
);
1097 entry
.d_namlen
= strlen(entry
.d_name
);
1098 entry
.d_reclen
= sizeof (entry
);
1099 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(&uio
));
1101 if (error
== ENAMETOOLONG
)
1103 zfs_exit(zfsvfs
, FTAG
);
1104 return (SET_ERROR(error
));
1106 zfs_uio_setoffset(&uio
, cookie
+ dots_offset
);
1108 __builtin_unreachable();
1112 zfsctl_snapdir_getattr(struct vop_getattr_args
*ap
)
1114 vnode_t
*vp
= ap
->a_vp
;
1115 vattr_t
*vap
= ap
->a_vap
;
1116 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
1118 uint64_t snap_count
;
1121 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1123 ds
= dmu_objset_ds(zfsvfs
->z_os
);
1124 zfsctl_common_getattr(vp
, vap
);
1125 vap
->va_ctime
= dmu_objset_snap_cmtime(zfsvfs
->z_os
);
1126 vap
->va_mtime
= vap
->va_ctime
;
1127 vap
->va_birthtime
= vap
->va_ctime
;
1128 if (dsl_dataset_phys(ds
)->ds_snapnames_zapobj
!= 0) {
1129 err
= zap_count(dmu_objset_pool(ds
->ds_objset
)->dp_meta_objset
,
1130 dsl_dataset_phys(ds
)->ds_snapnames_zapobj
, &snap_count
);
1132 zfs_exit(zfsvfs
, FTAG
);
1135 vap
->va_nlink
+= snap_count
;
1137 vap
->va_size
= vap
->va_nlink
;
1139 zfs_exit(zfsvfs
, FTAG
);
1143 static struct vop_vector zfsctl_ops_snapdir
= {
1144 .vop_default
= &default_vnodeops
,
1145 .vop_fplookup_vexec
= VOP_EAGAIN
,
1146 .vop_fplookup_symlink
= VOP_EAGAIN
,
1147 .vop_open
= zfsctl_common_open
,
1148 .vop_close
= zfsctl_common_close
,
1149 .vop_getattr
= zfsctl_snapdir_getattr
,
1150 .vop_access
= zfsctl_common_access
,
1151 .vop_readdir
= zfsctl_snapdir_readdir
,
1152 .vop_lookup
= zfsctl_snapdir_lookup
,
1153 .vop_reclaim
= zfsctl_common_reclaim
,
1154 .vop_fid
= zfsctl_common_fid
,
1155 .vop_print
= zfsctl_common_print
,
1156 .vop_pathconf
= zfsctl_common_pathconf
,
1157 .vop_getacl
= zfsctl_common_getacl
,
1158 #if __FreeBSD_version >= 1400043
1159 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
1162 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir
);
1166 zfsctl_snapshot_inactive(struct vop_inactive_args
*ap
)
1168 vnode_t
*vp
= ap
->a_vp
;
1175 zfsctl_snapshot_reclaim(struct vop_reclaim_args
*ap
)
1177 vnode_t
*vp
= ap
->a_vp
;
1178 void *data
= vp
->v_data
;
1180 sfs_reclaim_vnode(vp
);
1181 sfs_destroy_node(data
);
1186 zfsctl_snapshot_vptocnp(struct vop_vptocnp_args
*ap
)
1198 len
= strlen(node
->sn_name
);
1199 if (*ap
->a_buflen
< len
)
1200 return (SET_ERROR(ENOMEM
));
1203 * Prevent unmounting of the snapshot while the vnode lock
1204 * is not held. That is not strictly required, but allows
1205 * us to assert that an uncovered snapshot vnode is never
1208 mp
= vp
->v_mountedhere
;
1210 return (SET_ERROR(ENOENT
));
1211 error
= vfs_busy(mp
, 0);
1212 KASSERT(error
== 0, ("vfs_busy(mp, 0) failed with %d", error
));
1215 * We can vput the vnode as we can now depend on the reference owned
1216 * by the busied mp. But we also need to hold the vnode, because
1217 * the reference may go after vfs_unbusy() which has to be called
1218 * before we can lock the vnode again.
1220 locked
= VOP_ISLOCKED(vp
);
1221 enum vgetstate vs
= vget_prep(vp
);
1224 /* Look up .zfs/snapshot, our parent. */
1225 error
= zfsctl_snapdir_vnode(vp
->v_mount
, NULL
, LK_SHARED
, &dvp
);
1229 *ap
->a_buflen
-= len
;
1230 memcpy(ap
->a_buf
+ *ap
->a_buflen
, node
->sn_name
, len
);
1233 vget_finish(vp
, locked
| LK_RETRY
, vs
);
1238 * These VP's should never see the light of day. They should always
1241 static struct vop_vector zfsctl_ops_snapshot
= {
1242 .vop_default
= NULL
, /* ensure very restricted access */
1243 .vop_fplookup_vexec
= VOP_EAGAIN
,
1244 .vop_fplookup_symlink
= VOP_EAGAIN
,
1245 .vop_open
= zfsctl_common_open
,
1246 .vop_close
= zfsctl_common_close
,
1247 .vop_inactive
= zfsctl_snapshot_inactive
,
1248 .vop_need_inactive
= vop_stdneed_inactive
,
1249 .vop_reclaim
= zfsctl_snapshot_reclaim
,
1250 .vop_vptocnp
= zfsctl_snapshot_vptocnp
,
1251 .vop_lock1
= vop_stdlock
,
1252 .vop_unlock
= vop_stdunlock
,
1253 .vop_islocked
= vop_stdislocked
,
1254 .vop_advlockpurge
= vop_stdadvlockpurge
, /* called by vgone */
1255 .vop_print
= zfsctl_common_print
,
1256 #if __FreeBSD_version >= 1400043
1257 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
1260 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot
);
1263 zfsctl_lookup_objset(vfs_t
*vfsp
, uint64_t objsetid
, zfsvfs_t
**zfsvfsp
)
1265 zfsvfs_t
*zfsvfs __unused
= vfsp
->vfs_data
;
1269 ASSERT3P(zfsvfs
->z_ctldir
, !=, NULL
);
1271 error
= sfs_vnode_get(vfsp
, LK_EXCLUSIVE
,
1272 ZFSCTL_INO_SNAPDIR
, objsetid
, &vp
);
1273 if (error
== 0 && vp
!= NULL
) {
1275 * XXX Probably need to at least reference, if not busy, the mp.
1277 if (vp
->v_mountedhere
!= NULL
)
1278 *zfsvfsp
= vp
->v_mountedhere
->mnt_data
;
1281 if (*zfsvfsp
== NULL
)
1282 return (SET_ERROR(EINVAL
));
1287 * Unmount any snapshots for the given filesystem. This is called from
1288 * zfs_umount() - if we have a ctldir, then go through and unmount all the
1292 zfsctl_umount_snapshots(vfs_t
*vfsp
, int fflags
, cred_t
*cr
)
1294 char snapname
[ZFS_MAX_DATASET_NAME_LEN
];
1295 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1301 ASSERT3P(zfsvfs
->z_ctldir
, !=, NULL
);
1307 dsl_pool_config_enter(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1308 error
= dmu_snapshot_list_next(zfsvfs
->z_os
, sizeof (snapname
),
1309 snapname
, &id
, &cookie
, NULL
);
1310 dsl_pool_config_exit(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1312 if (error
== ENOENT
)
1318 error
= sfs_vnode_get(vfsp
, LK_EXCLUSIVE
,
1319 ZFSCTL_INO_SNAPDIR
, id
, &vp
);
1320 if (error
!= 0 || vp
== NULL
)
1323 mp
= vp
->v_mountedhere
;
1326 * v_mountedhere being NULL means that the
1327 * (uncovered) vnode is in a transient state
1328 * (mounting or unmounting), so loop until it
1338 continue; /* no mountpoint, nothing to do */
1341 * The mount-point vnode is kept locked to avoid spurious EBUSY
1342 * from a concurrent umount.
1343 * The vnode lock must have recursive locking enabled.
1346 error
= dounmount(mp
, fflags
, curthread
);
1347 KASSERT_IMPLY(error
== 0, vrefcnt(vp
) == 1,
1348 ("extra references after unmount"));
1353 KASSERT_IMPLY((fflags
& MS_FORCE
) != 0, error
== 0,
1354 ("force unmounting failed"));
1359 zfsctl_snapshot_unmount(const char *snapname
, int flags __unused
)
1362 zfsvfs_t
*zfsvfs
= NULL
;
1364 if (strchr(snapname
, '@') == NULL
)
1367 int err
= getzfsvfs(snapname
, &zfsvfs
);
1369 ASSERT3P(zfsvfs
, ==, NULL
);
1372 vfsp
= zfsvfs
->z_vfs
;
1374 ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs
->z_os
)));
1378 return (dounmount(vfsp
, MS_FORCE
, curthread
));