4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
28 * ZFS control directory (a.k.a. ".zfs")
30 * This directory provides a common location for all ZFS meta-objects.
31 * Currently, this is only the 'snapshot' directory, but this may expand in the
32 * future. The elements are built using the GFS primitives, as the hierarchy
33 * does not actually exist on disk.
35 * For 'snapshot', we don't want to have all snapshots always mounted, because
36 * this would take up a huge amount of space in /etc/mnttab. We have three
39 * ctldir ------> snapshotdir -------> snapshot
45 * The 'snapshot' node contains just enough information to lookup '..' and act
46 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
47 * perform an automount of the underlying filesystem and return the
48 * corresponding vnode.
50 * All mounts are handled automatically by the kernel, but unmounts are
51 * (currently) handled from user land. The main reason is that there is no
52 * reliable way to auto-unmount the filesystem when it's "no longer in use".
53 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
54 * unmounts any snapshots within the snapshot directory.
56 * The '.zfs', '.zfs/snapshot', and all directories created under
57 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
58 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
60 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
61 * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
62 * However, vnodes within these mounted on file systems have their v_vfsp
63 * fields set to the head filesystem to make NFS happy (see
64 * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
65 * so that it cannot be freed until all snapshots have been unmounted.
68 #include <sys/types.h>
69 #include <sys/param.h>
70 #include <sys/libkern.h>
71 #include <sys/dirent.h>
72 #include <sys/zfs_context.h>
73 #include <sys/zfs_ctldir.h>
74 #include <sys/zfs_ioctl.h>
75 #include <sys/zfs_vfsops.h>
76 #include <sys/namei.h>
79 #include <sys/dsl_dataset.h>
80 #include <sys/dsl_destroy.h>
81 #include <sys/dsl_deleg.h>
82 #include <sys/mount.h>
84 #include <sys/sysproto.h>
86 #include "zfs_namecheck.h"
88 #include <sys/kernel.h>
89 #include <sys/ccompat.h>
91 /* Common access mode for all virtual directories under the ctldir */
92 const uint16_t zfsctl_ctldir_mode
= S_IRUSR
| S_IXUSR
| S_IRGRP
| S_IXGRP
|
96 * "Synthetic" filesystem implementation.
100 * Assert that A implies B.
102 #define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
104 static MALLOC_DEFINE(M_SFSNODES
, "sfs_nodes", "synthetic-fs nodes");
106 typedef struct sfs_node
{
107 char sn_name
[ZFS_MAX_DATASET_NAME_LEN
];
108 uint64_t sn_parent_id
;
113 * Check the parent's ID as well as the node's to account for a chance
114 * that IDs originating from different domains (snapshot IDs, artificial
115 * IDs, znode IDs) may clash.
118 sfs_compare_ids(struct vnode
*vp
, void *arg
)
120 sfs_node_t
*n1
= vp
->v_data
;
121 sfs_node_t
*n2
= arg
;
124 equal
= n1
->sn_id
== n2
->sn_id
&&
125 n1
->sn_parent_id
== n2
->sn_parent_id
;
127 /* Zero means equality. */
132 sfs_vnode_get(const struct mount
*mp
, int flags
, uint64_t parent_id
,
133 uint64_t id
, struct vnode
**vpp
)
139 search
.sn_parent_id
= parent_id
;
140 err
= vfs_hash_get(mp
, (uint32_t)id
, flags
, curthread
, vpp
,
141 sfs_compare_ids
, &search
);
146 sfs_vnode_insert(struct vnode
*vp
, int flags
, uint64_t parent_id
,
147 uint64_t id
, struct vnode
**vpp
)
151 KASSERT(vp
->v_data
!= NULL
, ("sfs_vnode_insert with NULL v_data"));
152 err
= vfs_hash_insert(vp
, (uint32_t)id
, flags
, curthread
, vpp
,
153 sfs_compare_ids
, vp
->v_data
);
158 sfs_vnode_remove(struct vnode
*vp
)
163 typedef void sfs_vnode_setup_fn(vnode_t
*vp
, void *arg
);
166 sfs_vgetx(struct mount
*mp
, int flags
, uint64_t parent_id
, uint64_t id
,
167 const char *tag
, struct vop_vector
*vops
,
168 sfs_vnode_setup_fn setup
, void *arg
,
174 error
= sfs_vnode_get(mp
, flags
, parent_id
, id
, vpp
);
175 if (error
!= 0 || *vpp
!= NULL
) {
176 KASSERT_IMPLY(error
== 0, (*vpp
)->v_data
!= NULL
,
177 "sfs vnode with no data");
181 /* Allocate a new vnode/inode. */
182 error
= getnewvnode(tag
, mp
, vops
, &vp
);
189 * Exclusively lock the vnode while it's being constructed.
191 lockmgr(vp
->v_vnlock
, LK_EXCLUSIVE
, NULL
);
192 error
= insmntque(vp
, mp
);
200 error
= sfs_vnode_insert(vp
, flags
, parent_id
, id
, vpp
);
201 if (error
!= 0 || *vpp
!= NULL
) {
202 KASSERT_IMPLY(error
== 0, (*vpp
)->v_data
!= NULL
,
203 "sfs vnode with no data");
207 #if __FreeBSD_version >= 1400077
208 vn_set_state(vp
, VSTATE_CONSTRUCTED
);
216 sfs_print_node(sfs_node_t
*node
)
218 printf("\tname = %s\n", node
->sn_name
);
219 printf("\tparent_id = %ju\n", (uintmax_t)node
->sn_parent_id
);
220 printf("\tid = %ju\n", (uintmax_t)node
->sn_id
);
224 sfs_alloc_node(size_t size
, const char *name
, uint64_t parent_id
, uint64_t id
)
226 struct sfs_node
*node
;
228 KASSERT(strlen(name
) < sizeof (node
->sn_name
),
229 ("sfs node name is too long"));
230 KASSERT(size
>= sizeof (*node
), ("sfs node size is too small"));
231 node
= malloc(size
, M_SFSNODES
, M_WAITOK
| M_ZERO
);
232 strlcpy(node
->sn_name
, name
, sizeof (node
->sn_name
));
233 node
->sn_parent_id
= parent_id
;
240 sfs_destroy_node(sfs_node_t
*node
)
242 free(node
, M_SFSNODES
);
246 sfs_reclaim_vnode(vnode_t
*vp
)
250 sfs_vnode_remove(vp
);
257 sfs_readdir_common(uint64_t parent_id
, uint64_t id
, struct vop_readdir_args
*ap
,
258 zfs_uio_t
*uio
, off_t
*offp
)
263 /* Reset ncookies for subsequent use of vfs_read_dirent. */
264 if (ap
->a_ncookies
!= NULL
)
267 if (zfs_uio_resid(uio
) < sizeof (entry
))
268 return (SET_ERROR(EINVAL
));
270 if (zfs_uio_offset(uio
) < 0)
271 return (SET_ERROR(EINVAL
));
272 if (zfs_uio_offset(uio
) == 0) {
274 entry
.d_type
= DT_DIR
;
275 entry
.d_name
[0] = '.';
276 entry
.d_name
[1] = '\0';
278 entry
.d_reclen
= sizeof (entry
);
279 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(uio
));
281 return (SET_ERROR(error
));
284 if (zfs_uio_offset(uio
) < sizeof (entry
))
285 return (SET_ERROR(EINVAL
));
286 if (zfs_uio_offset(uio
) == sizeof (entry
)) {
287 entry
.d_fileno
= parent_id
;
288 entry
.d_type
= DT_DIR
;
289 entry
.d_name
[0] = '.';
290 entry
.d_name
[1] = '.';
291 entry
.d_name
[2] = '\0';
293 entry
.d_reclen
= sizeof (entry
);
294 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(uio
));
296 return (SET_ERROR(error
));
300 *offp
= 2 * sizeof (entry
);
306 * .zfs inode namespace
308 * We need to generate unique inode numbers for all files and directories
309 * within the .zfs pseudo-filesystem. We use the following scheme:
314 * .zfs/snapshot/<snap> objectid(snap)
316 #define ZFSCTL_INO_SNAP(id) (id)
318 static struct vop_vector zfsctl_ops_root
;
319 static struct vop_vector zfsctl_ops_snapdir
;
320 static struct vop_vector zfsctl_ops_snapshot
;
333 zfsctl_is_node(vnode_t
*vp
)
335 return (vn_matchops(vp
, zfsctl_ops_root
) ||
336 vn_matchops(vp
, zfsctl_ops_snapdir
) ||
337 vn_matchops(vp
, zfsctl_ops_snapshot
));
341 typedef struct zfsctl_root
{
349 * Create the '.zfs' directory.
352 zfsctl_create(zfsvfs_t
*zfsvfs
)
354 zfsctl_root_t
*dot_zfs
;
359 ASSERT3P(zfsvfs
->z_ctldir
, ==, NULL
);
361 snapdir
= sfs_alloc_node(sizeof (*snapdir
), "snapshot", ZFSCTL_INO_ROOT
,
363 dot_zfs
= (zfsctl_root_t
*)sfs_alloc_node(sizeof (*dot_zfs
), ".zfs", 0,
365 dot_zfs
->snapdir
= snapdir
;
367 VERIFY0(VFS_ROOT(zfsvfs
->z_vfs
, LK_EXCLUSIVE
, &rvp
));
368 VERIFY0(sa_lookup(VTOZ(rvp
)->z_sa_hdl
, SA_ZPL_CRTIME(zfsvfs
),
369 &crtime
, sizeof (crtime
)));
370 ZFS_TIME_DECODE(&dot_zfs
->cmtime
, crtime
);
373 zfsvfs
->z_ctldir
= dot_zfs
;
377 * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
378 * The nodes must not have any associated vnodes by now as they should be
382 zfsctl_destroy(zfsvfs_t
*zfsvfs
)
384 sfs_destroy_node(zfsvfs
->z_ctldir
->snapdir
);
385 sfs_destroy_node((sfs_node_t
*)zfsvfs
->z_ctldir
);
386 zfsvfs
->z_ctldir
= NULL
;
390 zfsctl_fs_root_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
393 return (VFS_ROOT(mp
, flags
, vpp
));
397 zfsctl_common_vnode_setup(vnode_t
*vp
, void *arg
)
399 ASSERT_VOP_ELOCKED(vp
, __func__
);
401 /* We support shared locking. */
408 zfsctl_root_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
414 node
= ((zfsvfs_t
*)mp
->mnt_data
)->z_ctldir
;
415 err
= sfs_vgetx(mp
, flags
, 0, ZFSCTL_INO_ROOT
, "zfs", &zfsctl_ops_root
,
416 zfsctl_common_vnode_setup
, node
, vpp
);
421 zfsctl_snapdir_vnode(struct mount
*mp
, void *arg __unused
, int flags
,
427 node
= ((zfsvfs_t
*)mp
->mnt_data
)->z_ctldir
->snapdir
;
428 err
= sfs_vgetx(mp
, flags
, ZFSCTL_INO_ROOT
, ZFSCTL_INO_SNAPDIR
, "zfs",
429 &zfsctl_ops_snapdir
, zfsctl_common_vnode_setup
, node
, vpp
);
434 * Given a root znode, retrieve the associated .zfs directory.
435 * Add a hold to the vnode and return it.
438 zfsctl_root(zfsvfs_t
*zfsvfs
, int flags
, vnode_t
**vpp
)
442 error
= zfsctl_root_vnode(zfsvfs
->z_vfs
, NULL
, flags
, vpp
);
447 * Common open routine. Disallow any write access.
450 zfsctl_common_open(struct vop_open_args
*ap
)
452 int flags
= ap
->a_mode
;
455 return (SET_ERROR(EACCES
));
461 * Common close routine. Nothing to do here.
464 zfsctl_common_close(struct vop_close_args
*ap
)
471 * Common access routine. Disallow writes.
474 zfsctl_common_access(struct vop_access_args
*ap
)
476 accmode_t accmode
= ap
->a_accmode
;
478 if (accmode
& VWRITE
)
479 return (SET_ERROR(EACCES
));
484 * Common getattr function. Fill in basic information.
487 zfsctl_common_getattr(vnode_t
*vp
, vattr_t
*vap
)
498 * We are a purely virtual object, so we have no
499 * blocksize or allocated blocks.
505 vap
->va_mode
= zfsctl_ctldir_mode
;
508 * We live in the now (for atime).
512 /* FreeBSD: Reset chflags(2) flags. */
515 vap
->va_nodeid
= node
->sn_id
;
517 /* At least '.' and '..'. */
521 #ifndef _OPENSOLARIS_SYS_VNODE_H_
522 struct vop_fid_args
{
529 zfsctl_common_fid(struct vop_fid_args
*ap
)
531 vnode_t
*vp
= ap
->a_vp
;
532 fid_t
*fidp
= (void *)ap
->a_fid
;
533 sfs_node_t
*node
= vp
->v_data
;
534 uint64_t object
= node
->sn_id
;
538 zfid
= (zfid_short_t
*)fidp
;
539 zfid
->zf_len
= SHORT_FID_LEN
;
541 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
542 zfid
->zf_object
[i
] = (uint8_t)(object
>> (8 * i
));
544 /* .zfs nodes always have a generation number of 0 */
545 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
551 #ifndef _SYS_SYSPROTO_H_
552 struct vop_reclaim_args
{
559 zfsctl_common_reclaim(struct vop_reclaim_args
*ap
)
561 vnode_t
*vp
= ap
->a_vp
;
563 (void) sfs_reclaim_vnode(vp
);
567 #ifndef _SYS_SYSPROTO_H_
568 struct vop_print_args
{
574 zfsctl_common_print(struct vop_print_args
*ap
)
576 sfs_print_node(ap
->a_vp
->v_data
);
580 #ifndef _SYS_SYSPROTO_H_
581 struct vop_getattr_args
{
584 struct ucred
*a_cred
;
589 * Get root directory attributes.
592 zfsctl_root_getattr(struct vop_getattr_args
*ap
)
594 struct vnode
*vp
= ap
->a_vp
;
595 struct vattr
*vap
= ap
->a_vap
;
596 zfsctl_root_t
*node
= vp
->v_data
;
598 zfsctl_common_getattr(vp
, vap
);
599 vap
->va_ctime
= node
->cmtime
;
600 vap
->va_mtime
= vap
->va_ctime
;
601 vap
->va_birthtime
= vap
->va_ctime
;
602 vap
->va_nlink
+= 1; /* snapdir */
603 vap
->va_size
= vap
->va_nlink
;
608 * When we lookup "." we still can be asked to lock it
609 * differently, can't we?
612 zfsctl_relock_dot(vnode_t
*dvp
, int ltype
)
615 if (ltype
!= VOP_ISLOCKED(dvp
)) {
616 if (ltype
== LK_EXCLUSIVE
)
617 vn_lock(dvp
, LK_UPGRADE
| LK_RETRY
);
618 else /* if (ltype == LK_SHARED) */
619 vn_lock(dvp
, LK_DOWNGRADE
| LK_RETRY
);
621 /* Relocking for the "." case may have left us with a reclaimed vnode. */
622 if (VN_IS_DOOMED(dvp
)) {
624 return (SET_ERROR(ENOENT
));
631 * Special case the handling of "..".
634 zfsctl_root_lookup(struct vop_lookup_args
*ap
)
636 struct componentname
*cnp
= ap
->a_cnp
;
637 vnode_t
*dvp
= ap
->a_dvp
;
638 vnode_t
**vpp
= ap
->a_vpp
;
639 int flags
= ap
->a_cnp
->cn_flags
;
640 int lkflags
= ap
->a_cnp
->cn_lkflags
;
641 int nameiop
= ap
->a_cnp
->cn_nameiop
;
644 ASSERT3S(dvp
->v_type
, ==, VDIR
);
646 if ((flags
& ISLASTCN
) != 0 && nameiop
!= LOOKUP
)
647 return (SET_ERROR(ENOTSUP
));
649 if (cnp
->cn_namelen
== 1 && *cnp
->cn_nameptr
== '.') {
650 err
= zfsctl_relock_dot(dvp
, lkflags
& LK_TYPE_MASK
);
653 } else if ((flags
& ISDOTDOT
) != 0) {
654 err
= vn_vget_ino_gen(dvp
, zfsctl_fs_root_vnode
, NULL
,
656 } else if (strncmp(cnp
->cn_nameptr
, "snapshot", cnp
->cn_namelen
) == 0) {
657 err
= zfsctl_snapdir_vnode(dvp
->v_mount
, NULL
, lkflags
, vpp
);
659 err
= SET_ERROR(ENOENT
);
667 zfsctl_root_readdir(struct vop_readdir_args
*ap
)
670 vnode_t
*vp
= ap
->a_vp
;
671 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
672 zfsctl_root_t
*node
= vp
->v_data
;
674 int *eofp
= ap
->a_eofflag
;
678 zfs_uio_init(&uio
, ap
->a_uio
);
680 ASSERT3S(vp
->v_type
, ==, VDIR
);
683 * FIXME: this routine only ever emits 3 entries and does not tolerate
684 * being called with a buffer too small to handle all of them.
686 * The check below facilitates the idiom of repeating calls until the
687 * count to return is 0.
689 if (zfs_uio_offset(&uio
) == 3 * sizeof (entry
)) {
693 error
= sfs_readdir_common(zfsvfs
->z_root
, ZFSCTL_INO_ROOT
, ap
, &uio
,
696 if (error
== ENAMETOOLONG
) /* ran out of destination space */
700 if (zfs_uio_offset(&uio
) != dots_offset
)
701 return (SET_ERROR(EINVAL
));
703 _Static_assert(sizeof (node
->snapdir
->sn_name
) <= sizeof (entry
.d_name
),
704 "node->snapdir->sn_name too big for entry.d_name");
705 entry
.d_fileno
= node
->snapdir
->sn_id
;
706 entry
.d_type
= DT_DIR
;
707 strcpy(entry
.d_name
, node
->snapdir
->sn_name
);
708 entry
.d_namlen
= strlen(entry
.d_name
);
709 entry
.d_reclen
= sizeof (entry
);
710 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(&uio
));
712 if (error
== ENAMETOOLONG
)
714 return (SET_ERROR(error
));
722 zfsctl_root_vptocnp(struct vop_vptocnp_args
*ap
)
724 static const char dotzfs_name
[4] = ".zfs";
728 if (*ap
->a_buflen
< sizeof (dotzfs_name
))
729 return (SET_ERROR(ENOMEM
));
731 error
= vn_vget_ino_gen(ap
->a_vp
, zfsctl_fs_root_vnode
, NULL
,
734 return (SET_ERROR(error
));
738 *ap
->a_buflen
-= sizeof (dotzfs_name
);
739 memcpy(ap
->a_buf
+ *ap
->a_buflen
, dotzfs_name
, sizeof (dotzfs_name
));
744 zfsctl_common_pathconf(struct vop_pathconf_args
*ap
)
747 * We care about ACL variables so that user land utilities like ls
748 * can display them correctly. Since the ctldir's st_dev is set to be
749 * the same as the parent dataset, we must support all variables that
752 switch (ap
->a_name
) {
754 *ap
->a_retval
= MIN(LONG_MAX
, ZFS_LINK_MAX
);
757 case _PC_FILESIZEBITS
:
761 case _PC_MIN_HOLE_SIZE
:
762 *ap
->a_retval
= (int)SPA_MINBLOCKSIZE
;
765 case _PC_ACL_EXTENDED
:
773 case _PC_ACL_PATH_MAX
:
774 *ap
->a_retval
= ACL_MAX_ENTRIES
;
778 *ap
->a_retval
= NAME_MAX
;
782 return (vop_stdpathconf(ap
));
787 * Returns a trivial ACL
790 zfsctl_common_getacl(struct vop_getacl_args
*ap
)
794 if (ap
->a_type
!= ACL_TYPE_NFS4
)
797 acl_nfs4_sync_acl_from_mode(ap
->a_aclp
, zfsctl_ctldir_mode
, 0);
799 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
800 * attributes. That is not the case for the ctldir, so we must clear
801 * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
802 * aren't supported by the ctldir.
804 for (i
= 0; i
< ap
->a_aclp
->acl_cnt
; i
++) {
805 struct acl_entry
*entry
;
806 entry
= &(ap
->a_aclp
->acl_entry
[i
]);
807 entry
->ae_perm
&= ~(ACL_WRITE_ACL
| ACL_WRITE_OWNER
|
808 ACL_WRITE_ATTRIBUTES
| ACL_WRITE_NAMED_ATTRS
|
809 ACL_READ_NAMED_ATTRS
);
815 static struct vop_vector zfsctl_ops_root
= {
816 .vop_default
= &default_vnodeops
,
817 .vop_fplookup_vexec
= VOP_EAGAIN
,
818 .vop_fplookup_symlink
= VOP_EAGAIN
,
819 .vop_open
= zfsctl_common_open
,
820 .vop_close
= zfsctl_common_close
,
821 .vop_ioctl
= VOP_EINVAL
,
822 .vop_getattr
= zfsctl_root_getattr
,
823 .vop_access
= zfsctl_common_access
,
824 .vop_readdir
= zfsctl_root_readdir
,
825 .vop_lookup
= zfsctl_root_lookup
,
826 .vop_inactive
= VOP_NULL
,
827 .vop_reclaim
= zfsctl_common_reclaim
,
828 .vop_fid
= zfsctl_common_fid
,
829 .vop_print
= zfsctl_common_print
,
830 .vop_vptocnp
= zfsctl_root_vptocnp
,
831 .vop_pathconf
= zfsctl_common_pathconf
,
832 .vop_getacl
= zfsctl_common_getacl
,
833 #if __FreeBSD_version >= 1400043
834 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
837 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root
);
840 zfsctl_snapshot_zname(vnode_t
*vp
, const char *name
, int len
, char *zname
)
842 objset_t
*os
= ((zfsvfs_t
*)((vp
)->v_vfsp
->vfs_data
))->z_os
;
844 dmu_objset_name(os
, zname
);
845 if (strlen(zname
) + 1 + strlen(name
) >= len
)
846 return (SET_ERROR(ENAMETOOLONG
));
847 (void) strcat(zname
, "@");
848 (void) strcat(zname
, name
);
853 zfsctl_snapshot_lookup(vnode_t
*vp
, const char *name
, uint64_t *id
)
855 objset_t
*os
= ((zfsvfs_t
*)((vp
)->v_vfsp
->vfs_data
))->z_os
;
858 err
= dsl_dataset_snap_lookup(dmu_objset_ds(os
), name
, id
);
863 * Given a vnode get a root vnode of a filesystem mounted on top of
864 * the vnode, if any. The root vnode is referenced and locked.
865 * If no filesystem is mounted then the original vnode remains referenced
866 * and locked. If any error happens the original vnode is unlocked and
870 zfsctl_mounted_here(vnode_t
**vpp
, int flags
)
875 ASSERT_VOP_LOCKED(*vpp
, __func__
);
876 ASSERT3S((*vpp
)->v_type
, ==, VDIR
);
878 if ((mp
= (*vpp
)->v_mountedhere
) != NULL
) {
879 err
= vfs_busy(mp
, 0);
880 KASSERT(err
== 0, ("vfs_busy(mp, 0) failed with %d", err
));
881 KASSERT(vrefcnt(*vpp
) > 1, ("unreferenced mountpoint"));
883 err
= VFS_ROOT(mp
, flags
, vpp
);
887 return (EJUSTRETURN
);
891 const char *snap_name
;
893 } snapshot_setup_arg_t
;
896 zfsctl_snapshot_vnode_setup(vnode_t
*vp
, void *arg
)
898 snapshot_setup_arg_t
*ssa
= arg
;
901 ASSERT_VOP_ELOCKED(vp
, __func__
);
903 node
= sfs_alloc_node(sizeof (sfs_node_t
),
904 ssa
->snap_name
, ZFSCTL_INO_SNAPDIR
, ssa
->snap_id
);
905 zfsctl_common_vnode_setup(vp
, node
);
907 /* We have to support recursive locking. */
912 * Lookup entry point for the 'snapshot' directory. Try to open the
913 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
914 * Perform a mount of the associated dataset on top of the vnode.
915 * There are four possibilities:
916 * - the snapshot node and vnode do not exist
917 * - the snapshot vnode is covered by the mounted snapshot
918 * - the snapshot vnode is not covered yet, the mount operation is in progress
919 * - the snapshot vnode is not covered, because the snapshot has been unmounted
920 * The last two states are transient and should be relatively short-lived.
923 zfsctl_snapdir_lookup(struct vop_lookup_args
*ap
)
925 vnode_t
*dvp
= ap
->a_dvp
;
926 vnode_t
**vpp
= ap
->a_vpp
;
927 struct componentname
*cnp
= ap
->a_cnp
;
928 char name
[NAME_MAX
+ 1];
929 char fullname
[ZFS_MAX_DATASET_NAME_LEN
];
931 size_t mountpoint_len
;
932 zfsvfs_t
*zfsvfs
= dvp
->v_vfsp
->vfs_data
;
934 int nameiop
= cnp
->cn_nameiop
;
935 int lkflags
= cnp
->cn_lkflags
;
936 int flags
= cnp
->cn_flags
;
939 ASSERT3S(dvp
->v_type
, ==, VDIR
);
941 if ((flags
& ISLASTCN
) != 0 && nameiop
!= LOOKUP
)
942 return (SET_ERROR(ENOTSUP
));
944 if (cnp
->cn_namelen
== 1 && *cnp
->cn_nameptr
== '.') {
945 err
= zfsctl_relock_dot(dvp
, lkflags
& LK_TYPE_MASK
);
950 if (flags
& ISDOTDOT
) {
951 err
= vn_vget_ino_gen(dvp
, zfsctl_root_vnode
, NULL
, lkflags
,
956 if (cnp
->cn_namelen
>= sizeof (name
))
957 return (SET_ERROR(ENAMETOOLONG
));
959 strlcpy(name
, ap
->a_cnp
->cn_nameptr
, ap
->a_cnp
->cn_namelen
+ 1);
960 err
= zfsctl_snapshot_lookup(dvp
, name
, &snap_id
);
962 return (SET_ERROR(ENOENT
));
965 snapshot_setup_arg_t ssa
;
967 ssa
.snap_name
= name
;
968 ssa
.snap_id
= snap_id
;
969 err
= sfs_vgetx(dvp
->v_mount
, LK_SHARED
, ZFSCTL_INO_SNAPDIR
,
970 snap_id
, "zfs", &zfsctl_ops_snapshot
,
971 zfsctl_snapshot_vnode_setup
, &ssa
, vpp
);
975 /* Check if a new vnode has just been created. */
976 if (VOP_ISLOCKED(*vpp
) == LK_EXCLUSIVE
)
980 * Check if a snapshot is already mounted on top of the vnode.
982 err
= zfsctl_mounted_here(vpp
, lkflags
);
983 if (err
!= EJUSTRETURN
)
987 * If the vnode is not covered, then either the mount operation
988 * is in progress or the snapshot has already been unmounted
989 * but the vnode hasn't been inactivated and reclaimed yet.
990 * We can try to re-use the vnode in the latter case.
993 if (((*vpp
)->v_iflag
& VI_MOUNT
) == 0) {
996 * Upgrade to exclusive lock in order to:
997 * - avoid race conditions
998 * - satisfy the contract of mount_snapshot()
1000 err
= VOP_LOCK(*vpp
, LK_TRYUPGRADE
);
1008 * In this state we can loop on uncontested locks and starve
1009 * the thread doing the lengthy, non-trivial mount operation.
1010 * So, yield to prevent that from happening.
1013 kern_yield(PRI_USER
);
1016 VERIFY0(zfsctl_snapshot_zname(dvp
, name
, sizeof (fullname
), fullname
));
1018 mountpoint_len
= strlen(dvp
->v_vfsp
->mnt_stat
.f_mntonname
) +
1019 strlen("/" ZFS_CTLDIR_NAME
"/snapshot/") + strlen(name
) + 1;
1020 mountpoint
= kmem_alloc(mountpoint_len
, KM_SLEEP
);
1021 (void) snprintf(mountpoint
, mountpoint_len
,
1022 "%s/" ZFS_CTLDIR_NAME
"/snapshot/%s",
1023 dvp
->v_vfsp
->mnt_stat
.f_mntonname
, name
);
1025 err
= mount_snapshot(curthread
, vpp
, "zfs", mountpoint
, fullname
, 0,
1027 kmem_free(mountpoint
, mountpoint_len
);
1030 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
1032 * This is where we lie about our v_vfsp in order to
1033 * make .zfs/snapshot/<snapname> accessible over NFS
1034 * without requiring manual mounts of <snapname>.
1036 ASSERT3P(VTOZ(*vpp
)->z_zfsvfs
, !=, zfsvfs
);
1037 VTOZ(*vpp
)->z_zfsvfs
->z_parent
= zfsvfs
;
1039 /* Clear the root flag (set via VFS_ROOT) as well. */
1040 (*vpp
)->v_vflag
&= ~VV_ROOT
;
1049 zfsctl_snapdir_readdir(struct vop_readdir_args
*ap
)
1051 char snapname
[ZFS_MAX_DATASET_NAME_LEN
];
1052 struct dirent entry
;
1053 vnode_t
*vp
= ap
->a_vp
;
1054 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
1056 int *eofp
= ap
->a_eofflag
;
1060 zfs_uio_init(&uio
, ap
->a_uio
);
1062 ASSERT3S(vp
->v_type
, ==, VDIR
);
1064 error
= sfs_readdir_common(ZFSCTL_INO_ROOT
, ZFSCTL_INO_SNAPDIR
, ap
,
1065 &uio
, &dots_offset
);
1067 if (error
== ENAMETOOLONG
) /* ran out of destination space */
1072 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1078 cookie
= zfs_uio_offset(&uio
) - dots_offset
;
1080 dsl_pool_config_enter(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1081 error
= dmu_snapshot_list_next(zfsvfs
->z_os
, sizeof (snapname
),
1082 snapname
, &id
, &cookie
, NULL
);
1083 dsl_pool_config_exit(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1085 if (error
== ENOENT
) {
1090 zfs_exit(zfsvfs
, FTAG
);
1094 entry
.d_fileno
= id
;
1095 entry
.d_type
= DT_DIR
;
1096 strcpy(entry
.d_name
, snapname
);
1097 entry
.d_namlen
= strlen(entry
.d_name
);
1098 entry
.d_reclen
= sizeof (entry
);
1099 error
= vfs_read_dirent(ap
, &entry
, zfs_uio_offset(&uio
));
1101 if (error
== ENAMETOOLONG
)
1103 zfs_exit(zfsvfs
, FTAG
);
1104 return (SET_ERROR(error
));
1106 zfs_uio_setoffset(&uio
, cookie
+ dots_offset
);
1108 __builtin_unreachable();
1112 zfsctl_snapdir_getattr(struct vop_getattr_args
*ap
)
1114 vnode_t
*vp
= ap
->a_vp
;
1115 vattr_t
*vap
= ap
->a_vap
;
1116 zfsvfs_t
*zfsvfs
= vp
->v_vfsp
->vfs_data
;
1118 uint64_t snap_count
;
1121 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1123 ds
= dmu_objset_ds(zfsvfs
->z_os
);
1124 zfsctl_common_getattr(vp
, vap
);
1125 vap
->va_ctime
= dmu_objset_snap_cmtime(zfsvfs
->z_os
);
1126 vap
->va_mtime
= vap
->va_ctime
;
1127 vap
->va_birthtime
= vap
->va_ctime
;
1128 if (dsl_dataset_phys(ds
)->ds_snapnames_zapobj
!= 0) {
1129 err
= zap_count(dmu_objset_pool(ds
->ds_objset
)->dp_meta_objset
,
1130 dsl_dataset_phys(ds
)->ds_snapnames_zapobj
, &snap_count
);
1132 zfs_exit(zfsvfs
, FTAG
);
1135 vap
->va_nlink
+= snap_count
;
1137 vap
->va_size
= vap
->va_nlink
;
1139 zfs_exit(zfsvfs
, FTAG
);
1143 static struct vop_vector zfsctl_ops_snapdir
= {
1144 .vop_default
= &default_vnodeops
,
1145 .vop_fplookup_vexec
= VOP_EAGAIN
,
1146 .vop_fplookup_symlink
= VOP_EAGAIN
,
1147 .vop_open
= zfsctl_common_open
,
1148 .vop_close
= zfsctl_common_close
,
1149 .vop_getattr
= zfsctl_snapdir_getattr
,
1150 .vop_access
= zfsctl_common_access
,
1151 .vop_readdir
= zfsctl_snapdir_readdir
,
1152 .vop_lookup
= zfsctl_snapdir_lookup
,
1153 .vop_reclaim
= zfsctl_common_reclaim
,
1154 .vop_fid
= zfsctl_common_fid
,
1155 .vop_print
= zfsctl_common_print
,
1156 .vop_pathconf
= zfsctl_common_pathconf
,
1157 .vop_getacl
= zfsctl_common_getacl
,
1158 #if __FreeBSD_version >= 1400043
1159 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
1162 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir
);
1166 zfsctl_snapshot_inactive(struct vop_inactive_args
*ap
)
1168 vnode_t
*vp
= ap
->a_vp
;
1175 zfsctl_snapshot_reclaim(struct vop_reclaim_args
*ap
)
1177 vnode_t
*vp
= ap
->a_vp
;
1178 void *data
= vp
->v_data
;
1180 sfs_reclaim_vnode(vp
);
1181 sfs_destroy_node(data
);
1186 zfsctl_snapshot_vptocnp(struct vop_vptocnp_args
*ap
)
1198 len
= strlen(node
->sn_name
);
1199 if (*ap
->a_buflen
< len
)
1200 return (SET_ERROR(ENOMEM
));
1203 * Prevent unmounting of the snapshot while the vnode lock
1204 * is not held. That is not strictly required, but allows
1205 * us to assert that an uncovered snapshot vnode is never
1208 mp
= vp
->v_mountedhere
;
1210 return (SET_ERROR(ENOENT
));
1211 error
= vfs_busy(mp
, 0);
1212 KASSERT(error
== 0, ("vfs_busy(mp, 0) failed with %d", error
));
1215 * We can vput the vnode as we can now depend on the reference owned
1216 * by the busied mp. But we also need to hold the vnode, because
1217 * the reference may go after vfs_unbusy() which has to be called
1218 * before we can lock the vnode again.
1220 locked
= VOP_ISLOCKED(vp
);
1221 enum vgetstate vs
= vget_prep(vp
);
1224 /* Look up .zfs/snapshot, our parent. */
1225 error
= zfsctl_snapdir_vnode(vp
->v_mount
, NULL
, LK_SHARED
, &dvp
);
1229 *ap
->a_buflen
-= len
;
1230 memcpy(ap
->a_buf
+ *ap
->a_buflen
, node
->sn_name
, len
);
1233 vget_finish(vp
, locked
| LK_RETRY
, vs
);
1238 * These VP's should never see the light of day. They should always
1241 static struct vop_vector zfsctl_ops_snapshot
= {
1242 .vop_default
= NULL
, /* ensure very restricted access */
1243 .vop_fplookup_vexec
= VOP_EAGAIN
,
1244 .vop_fplookup_symlink
= VOP_EAGAIN
,
1245 .vop_open
= zfsctl_common_open
,
1246 .vop_close
= zfsctl_common_close
,
1247 .vop_inactive
= zfsctl_snapshot_inactive
,
1248 .vop_need_inactive
= vop_stdneed_inactive
,
1249 .vop_reclaim
= zfsctl_snapshot_reclaim
,
1250 .vop_vptocnp
= zfsctl_snapshot_vptocnp
,
1251 .vop_lock1
= vop_stdlock
,
1252 .vop_unlock
= vop_stdunlock
,
1253 .vop_islocked
= vop_stdislocked
,
1254 .vop_advlockpurge
= vop_stdadvlockpurge
, /* called by vgone */
1255 .vop_print
= zfsctl_common_print
,
1256 #if __FreeBSD_version >= 1400043
1257 .vop_add_writecount
= vop_stdadd_writecount_nomsync
,
1260 VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot
);
1263 zfsctl_lookup_objset(vfs_t
*vfsp
, uint64_t objsetid
, zfsvfs_t
**zfsvfsp
)
1265 zfsvfs_t
*zfsvfs __unused
= vfsp
->vfs_data
;
1269 ASSERT3P(zfsvfs
->z_ctldir
, !=, NULL
);
1271 error
= sfs_vnode_get(vfsp
, LK_EXCLUSIVE
,
1272 ZFSCTL_INO_SNAPDIR
, objsetid
, &vp
);
1273 if (error
== 0 && vp
!= NULL
) {
1275 * XXX Probably need to at least reference, if not busy, the mp.
1277 if (vp
->v_mountedhere
!= NULL
)
1278 *zfsvfsp
= vp
->v_mountedhere
->mnt_data
;
1281 if (*zfsvfsp
== NULL
)
1282 return (SET_ERROR(EINVAL
));
1287 * Unmount any snapshots for the given filesystem. This is called from
1288 * zfs_umount() - if we have a ctldir, then go through and unmount all the
1292 zfsctl_umount_snapshots(vfs_t
*vfsp
, int fflags
, cred_t
*cr
)
1294 char snapname
[ZFS_MAX_DATASET_NAME_LEN
];
1295 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1301 ASSERT3P(zfsvfs
->z_ctldir
, !=, NULL
);
1307 dsl_pool_config_enter(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1308 error
= dmu_snapshot_list_next(zfsvfs
->z_os
, sizeof (snapname
),
1309 snapname
, &id
, &cookie
, NULL
);
1310 dsl_pool_config_exit(dmu_objset_pool(zfsvfs
->z_os
), FTAG
);
1312 if (error
== ENOENT
)
1318 error
= sfs_vnode_get(vfsp
, LK_EXCLUSIVE
,
1319 ZFSCTL_INO_SNAPDIR
, id
, &vp
);
1320 if (error
!= 0 || vp
== NULL
)
1323 mp
= vp
->v_mountedhere
;
1326 * v_mountedhere being NULL means that the
1327 * (uncovered) vnode is in a transient state
1328 * (mounting or unmounting), so loop until it
1338 continue; /* no mountpoint, nothing to do */
1341 * The mount-point vnode is kept locked to avoid spurious EBUSY
1342 * from a concurrent umount.
1343 * The vnode lock must have recursive locking enabled.
1346 error
= dounmount(mp
, fflags
, curthread
);
1347 KASSERT_IMPLY(error
== 0, vrefcnt(vp
) == 1,
1348 ("extra references after unmount"));
1353 KASSERT_IMPLY((fflags
& MS_FORCE
) != 0, error
== 0,
1354 ("force unmounting failed"));
1359 zfsctl_snapshot_unmount(const char *snapname
, int flags __unused
)
1362 zfsvfs_t
*zfsvfs
= NULL
;
1364 if (strchr(snapname
, '@') == NULL
)
1367 int err
= getzfsvfs(snapname
, &zfsvfs
);
1369 ASSERT3P(zfsvfs
, ==, NULL
);
1372 vfsp
= zfsvfs
->z_vfs
;
1374 ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs
->z_os
)));
1378 return (dounmount(vfsp
, MS_FORCE
, curthread
));