module/os/freebsd/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  24  * All rights reserved.
  25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  28  */
  29
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/kernel.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/acl.h>
  39 #include <sys/vnode.h>
  40 #include <sys/vfs.h>
  41 #include <sys/mntent.h>
  42 #include <sys/mount.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/zfs_znode.h>
  45 #include <sys/zfs_vnops.h>
  46 #include <sys/zfs_dir.h>
  47 #include <sys/zil.h>
  48 #include <sys/fs/zfs.h>
  49 #include <sys/dmu.h>
  50 #include <sys/dsl_prop.h>
  51 #include <sys/dsl_dataset.h>
  52 #include <sys/dsl_deleg.h>
  53 #include <sys/spa.h>
  54 #include <sys/zap.h>
  55 #include <sys/sa.h>
  56 #include <sys/sa_impl.h>
  57 #include <sys/policy.h>
  58 #include <sys/atomic.h>
  59 #include <sys/zfs_ioctl.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/dmu_objset.h>
  64 #include <sys/dsl_dir.h>
  65 #include <sys/jail.h>
  66 #include <sys/osd.h>
  67 #include <ufs/ufs/quota.h>
  68 #include <sys/zfs_quota.h>
  69
  70 #include "zfs_comutil.h"
  71
/*
 * Fallback definitions for MNTK_* constants.  Each one is only defined
 * here when the headers included above did not already provide it.
 */
#ifndef MNTK_VMSETSIZE_BUG
#define MNTK_VMSETSIZE_BUG      0
#endif
#ifndef MNTK_NOMSYNC
#define MNTK_NOMSYNC    8
#endif
  78
/*
 * Global mutex named "zfs_debug"; MTX_SYSINIT arranges for it to be
 * initialized as a default (MTX_DEF) mutex.
 */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl tree; the knobs below hang off this node. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner: read-write integer knob, see description string. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
        "File system owners can perform privileged operation on file systems");

/* vfs.zfs.debug: debug level, declared CTLFLAG_RWTUN. */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
        "Debug level");
  91
/* ZFS-specific jail parameters. */
struct zfs_jailparam {
        int mount_snapshot;
};

/* Parameter set with mount_snapshot disabled (presumably the defaults). */
static struct zfs_jailparam zfs_jailparam0 = {
        .mount_snapshot = 0,
};

/*
 * Slot number associated with the jail parameters.
 * NOTE(review): presumably an osd(9) slot, given <sys/osd.h> is included;
 * the code that assigns it is not visible here -- confirm.
 */
static int zfs_jailparam_slot;

/* Jail parameter node "zfs" and its integer parameter "mount_snapshot". */
SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
        "Allow mounting snapshots in the .zfs directory for unjailed datasets");
 105
/*
 * Read-only vfs.zfs.version.{acl,spa,zpl} sysctls.  Each exports the
 * compile-time version constant stored in the static variable beside it.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
        "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
        "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
        "ZPL_VERSION");
 116
/*
 * Forward declarations of the file-local VFS operations.
 *
 * zfs_quotactl() has two signatures: starting with __FreeBSD_version
 * 1400018 it takes an additional "bool *mp_busy" argument.
 */
#if __FreeBSD_version >= 1400018
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
    bool *mp_busy);
#else
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
#endif
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);
 133
/*
 * VFS operations vector for ZFS.  Note that .vfs_root is the generic
 * vfs_cache_root and zfs_root is installed as .vfs_cachedroot.
 */
struct vfsops zfs_vfsops = {
        .vfs_mount =            zfs_mount,
        .vfs_unmount =          zfs_umount,
        .vfs_root =             vfs_cache_root,
        .vfs_cachedroot =       zfs_root,
        .vfs_statfs =           zfs_statfs,
        .vfs_vget =             zfs_vget,
        .vfs_sync =             zfs_sync,
        .vfs_checkexp =         zfs_checkexp,
        .vfs_fhtovp =           zfs_fhtovp,
        .vfs_quotactl =         zfs_quotactl,
};

/*
 * Register the vector under the name "zfs".  VFCF_CROSS_COPY_FILE_RANGE
 * is added to the flags only when the system headers define it.
 */
#ifdef VFCF_CROSS_COPY_FILE_RANGE
VFS_SET(zfs_vfsops, zfs,
    VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
#else
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
#endif
 153
/*
 * Count of active ZFS file systems.
 * Keeping this count is necessary to prevent our module from being
 * unloaded after a forced unmount (umount -f).
 */
static uint32_t zfs_active_fs_count = 0;
 160
 161 int
 162 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 163     char *setpoint)
 164 {
 165         int error;
 166         zfsvfs_t *zfvp;
 167         vfs_t *vfsp;
 168         objset_t *os;
 169         uint64_t tmp = *val;
 170
 171         error = dmu_objset_from_ds(ds, &os);
 172         if (error != 0)
 173                 return (error);
 174
 175         error = getzfsvfs_impl(os, &zfvp);
 176         if (error != 0)
 177                 return (error);
 178         if (zfvp == NULL)
 179                 return (ENOENT);
 180         vfsp = zfvp->z_vfs;
 181         switch (zfs_prop) {
 182         case ZFS_PROP_ATIME:
 183                 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 184                         tmp = 0;
 185                 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 186                         tmp = 1;
 187                 break;
 188         case ZFS_PROP_DEVICES:
 189                 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
 190                         tmp = 0;
 191                 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
 192                         tmp = 1;
 193                 break;
 194         case ZFS_PROP_EXEC:
 195                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 196                         tmp = 0;
 197                 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 198                         tmp = 1;
 199                 break;
 200         case ZFS_PROP_SETUID:
 201                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 202                         tmp = 0;
 203                 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 204                         tmp = 1;
 205                 break;
 206         case ZFS_PROP_READONLY:
 207                 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 208                         tmp = 0;
 209                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
 210                         tmp = 1;
 211                 break;
 212         case ZFS_PROP_XATTR:
 213                 if (zfvp->z_flags & ZSB_XATTR)
 214                         tmp = zfvp->z_xattr;
 215                 break;
 216         case ZFS_PROP_NBMAND:
 217                 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
 218                         tmp = 0;
 219                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
 220                         tmp = 1;
 221                 break;
 222         default:
 223                 vfs_unbusy(vfsp);
 224                 return (ENOENT);
 225         }
 226
 227         vfs_unbusy(vfsp);
 228         if (tmp != *val) {
 229                 if (setpoint)
 230                         (void) strcpy(setpoint, "temporary");
 231                 *val = tmp;
 232         }
 233         return (0);
 234 }
 235
 236 static int
 237 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
 238 {
 239         int error = 0;
 240         char buf[32];
 241         uint64_t usedobj, quotaobj;
 242         uint64_t quota, used = 0;
 243         timespec_t now;
 244
 245         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 246         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 247
 248         if (quotaobj == 0 || zfsvfs->z_replay) {
 249                 error = ENOENT;
 250                 goto done;
 251         }
 252         (void) sprintf(buf, "%llx", (longlong_t)id);
 253         if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
 254             buf, sizeof (quota), 1, &quota)) != 0) {
 255                 dprintf("%s(%d): quotaobj lookup failed\n",
 256                     __FUNCTION__, __LINE__);
 257                 goto done;
 258         }
 259         /*
 260          * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
 261          * So we set them to be the same.
 262          */
 263         dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
 264         error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
 265         if (error && error != ENOENT) {
 266                 dprintf("%s(%d):  usedobj failed; %d\n",
 267                     __FUNCTION__, __LINE__, error);
 268                 goto done;
 269         }
 270         dqp->dqb_curblocks = btodb(used);
 271         dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
 272         vfs_timestamp(&now);
 273         /*
 274          * Setting this to 0 causes FreeBSD quota(8) to print
 275          * the number of days since the epoch, which isn't
 276          * particularly useful.
 277          */
 278         dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
 279 done:
 280         return (error);
 281 }
 282
/*
 * VFS quotactl entry point (installed as .vfs_quotactl in zfs_vfsops).
 *
 * cmds packs the quota command (cmds >> SUBCMDSHIFT) and the quota type
 * (cmds & SUBCMDMASK).  An id of -1 is replaced by the calling thread's
 * real uid (USRQUOTA) or real gid (GRPQUOTA).
 *
 * Commands handled by the final switch:
 *      Q_GETQUOTASIZE  copies the int value 64 out to arg
 *      Q_QUOTAON       does nothing and returns 0
 *      Q_QUOTAOFF      returns ENOTSUP
 *      Q_SETQUOTA      copies a struct dqblk64 in from arg and sets the
 *                      quota from its dqb_bhardlimit
 *      Q_GETQUOTA      fills a struct dqblk64 via zfs_getquota() and
 *                      copies it out to arg
 * Every other command, including Q_SETQUOTA32 and Q_GETQUOTA32, ends in
 * EINVAL.
 *
 * On __FreeBSD_version < 1400018 the Q_QUOTAON/Q_QUOTAOFF paths below call
 * vfs_unbusy() on the mount themselves.  Newer versions pass mp_busy
 * instead; this function never reads or writes it.
 *
 * Returns 0 or an errno value.
 */
static int
#if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
#else
zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
#endif
{
        zfsvfs_t *zfsvfs = vfsp->vfs_data;
        struct thread *td;
        int cmd, type, error = 0;
        int bitsize;
        zfs_userquota_prop_t quota_type;
        struct dqblk64 dqblk = { 0 };

        td = curthread;
        /* Split the packed argument into command and quota type. */
        cmd = cmds >> SUBCMDSHIFT;
        type = cmds & SUBCMDMASK;

        /*
         * NOTE(review): on pre-1400018 kernels this early return does not
         * call vfs_unbusy() for Q_QUOTAON/Q_QUOTAOFF, unlike the paths
         * below -- confirm whether the caller expects it.
         */
        if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
                return (error);
        /* id == -1 means "the caller": substitute the real uid/gid. */
        if (id == -1) {
                switch (type) {
                case USRQUOTA:
                        id = td->td_ucred->cr_ruid;
                        break;
                case GRPQUOTA:
                        id = td->td_ucred->cr_rgid;
                        break;
                default:
                        error = EINVAL;
#if __FreeBSD_version < 1400018
                        if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
                                vfs_unbusy(vfsp);
#endif
                        goto done;
                }
        }
        /*
         * Map BSD type to:
         * ZFS_PROP_USERUSED,
         * ZFS_PROP_USERQUOTA,
         * ZFS_PROP_GROUPUSED,
         * ZFS_PROP_GROUPQUOTA
         *
         * quota_type is only assigned for the set/get commands with a
         * user or group type.  The EINVAL stored here is not checked
         * directly; it is overwritten by the final switch unless the
         * MAXQUOTAS test below bails out first.
         */
        switch (cmd) {
        case Q_SETQUOTA:
        case Q_SETQUOTA32:
                if (type == USRQUOTA)
                        quota_type = ZFS_PROP_USERQUOTA;
                else if (type == GRPQUOTA)
                        quota_type = ZFS_PROP_GROUPQUOTA;
                else
                        error = EINVAL;
                break;
        case Q_GETQUOTA:
        case Q_GETQUOTA32:
                if (type == USRQUOTA)
                        quota_type = ZFS_PROP_USERUSED;
                else if (type == GRPQUOTA)
                        quota_type = ZFS_PROP_GROUPUSED;
                else
                        error = EINVAL;
                break;
        }

        /*
         * Depending on the cmd, we may need to get
         * the ruid and domain (see fuidstr_to_sid?),
         * the fuid (how?), or other information.
         * Create fuid using zfs_fuid_create(zfsvfs, id,
         * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
         * I think I can use just the id?
         *
         * Look at zfs_id_overquota() to look up a quota.
         * zap_lookup(something, quotaobj, fuidstring,
         *     sizeof (long long), 1, &quota)
         *
         * See zfs_set_userquota() to set a quota.
         */
        /*
         * Reject quota types outside [0, MAXQUOTAS).
         * NOTE(review): on pre-1400018 kernels this path also skips
         * vfs_unbusy() for Q_QUOTAON/Q_QUOTAOFF -- confirm.
         */
        if ((uint32_t)type >= MAXQUOTAS) {
                error = EINVAL;
                goto done;
        }

        switch (cmd) {
        case Q_GETQUOTASIZE:
                /* Report that 64-bit quota values are used. */
                bitsize = 64;
                error = copyout(&bitsize, arg, sizeof (int));
                break;
        case Q_QUOTAON:
                // As far as I can tell, you can't turn quotas on or off on zfs
                error = 0;
#if __FreeBSD_version < 1400018
                vfs_unbusy(vfsp);
#endif
                break;
        case Q_QUOTAOFF:
                error = ENOTSUP;
#if __FreeBSD_version < 1400018
                vfs_unbusy(vfsp);
#endif
                break;
        case Q_SETQUOTA:
                /* Only the hard block limit of the request is applied. */
                error = copyin(arg, &dqblk, sizeof (dqblk));
                if (error == 0)
                        error = zfs_set_userquota(zfsvfs, quota_type,
                            "", id, dbtob(dqblk.dqb_bhardlimit));
                break;
        case Q_GETQUOTA:
                error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
                if (error == 0)
                        error = copyout(&dqblk, arg, sizeof (dqblk));
                break;
        default:
                error = EINVAL;
                break;
        }
done:
        zfs_exit(zfsvfs, FTAG);
        return (error);
}
 404
 405
 406 boolean_t
 407 zfs_is_readonly(zfsvfs_t *zfsvfs)
 408 {
 409         return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
 410 }
 411
 412 static int
 413 zfs_sync(vfs_t *vfsp, int waitfor)
 414 {
 415
 416         /*
 417          * Data integrity is job one.  We don't want a compromised kernel
 418          * writing to the storage pool, so we never sync during panic.
 419          */
 420         if (panicstr)
 421                 return (0);
 422
 423         /*
 424          * Ignore the system syncher.  ZFS already commits async data
 425          * at zfs_txg_timeout intervals.
 426          */
 427         if (waitfor == MNT_LAZY)
 428                 return (0);
 429
 430         if (vfsp != NULL) {
 431                 /*
 432                  * Sync a specific filesystem.
 433                  */
 434                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 435                 dsl_pool_t *dp;
 436                 int error;
 437
 438                 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 439                         return (error);
 440                 dp = dmu_objset_pool(zfsvfs->z_os);
 441
 442                 /*
 443                  * If the system is shutting down, then skip any
 444                  * filesystems which may exist on a suspended pool.
 445                  */
 446                 if (rebooting && spa_suspended(dp->dp_spa)) {
 447                         zfs_exit(zfsvfs, FTAG);
 448                         return (0);
 449                 }
 450
 451                 if (zfsvfs->z_log != NULL)
 452                         zil_commit(zfsvfs->z_log, 0);
 453
 454                 zfs_exit(zfsvfs, FTAG);
 455         } else {
 456                 /*
 457                  * Sync all ZFS filesystems.  This is what happens when you
 458                  * run sync(8).  Unlike other filesystems, ZFS honors the
 459                  * request by waiting for all pools to commit all dirty data.
 460                  */
 461                 spa_sync_allpools();
 462         }
 463
 464         return (0);
 465 }
 466
 467 static void
 468 atime_changed_cb(void *arg, uint64_t newval)
 469 {
 470         zfsvfs_t *zfsvfs = arg;
 471
 472         if (newval == TRUE) {
 473                 zfsvfs->z_atime = TRUE;
 474                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 475                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 476                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 477         } else {
 478                 zfsvfs->z_atime = FALSE;
 479                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 480                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 481                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 482         }
 483 }
 484
 485 static void
 486 xattr_changed_cb(void *arg, uint64_t newval)
 487 {
 488         zfsvfs_t *zfsvfs = arg;
 489
 490         if (newval == ZFS_XATTR_OFF) {
 491                 zfsvfs->z_flags &= ~ZSB_XATTR;
 492         } else {
 493                 zfsvfs->z_flags |= ZSB_XATTR;
 494
 495                 if (newval == ZFS_XATTR_SA)
 496                         zfsvfs->z_xattr_sa = B_TRUE;
 497                 else
 498                         zfsvfs->z_xattr_sa = B_FALSE;
 499         }
 500 }
 501
/*
 * Property callback for "recordsize" (see zfs_register_callbacks()).
 * arg is the zfsvfs_t.  Stores the new block size as z_max_blksz and as
 * the mount's reported f_iosize.
 */
static void
blksz_changed_cb(void *arg, uint64_t newval)
{
        zfsvfs_t *zfsvfs = arg;
        /*
         * The new size must lie between SPA_MINBLOCKSIZE and the pool's
         * maximum block size, and must pass ISP2().
         */
        ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
        ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
        ASSERT(ISP2(newval));

        zfsvfs->z_max_blksz = newval;
        zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
 513
 514 static void
 515 readonly_changed_cb(void *arg, uint64_t newval)
 516 {
 517         zfsvfs_t *zfsvfs = arg;
 518
 519         if (newval) {
 520                 /* XXX locking on vfs_flag? */
 521                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 522                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 523                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 524         } else {
 525                 /* XXX locking on vfs_flag? */
 526                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 527                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 528                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 529         }
 530 }
 531
 532 static void
 533 setuid_changed_cb(void *arg, uint64_t newval)
 534 {
 535         zfsvfs_t *zfsvfs = arg;
 536
 537         if (newval == FALSE) {
 538                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 539                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 540                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 541         } else {
 542                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 543                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 544                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 545         }
 546 }
 547
 548 static void
 549 exec_changed_cb(void *arg, uint64_t newval)
 550 {
 551         zfsvfs_t *zfsvfs = arg;
 552
 553         if (newval == FALSE) {
 554                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 555                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 556                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 557         } else {
 558                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 559                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 560                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 561         }
 562 }
 563
 564 /*
 565  * The nbmand mount option can be changed at mount time.
 566  * We can't allow it to be toggled on live file systems or incorrect
 567  * behavior may be seen from cifs clients
 568  *
 569  * This property isn't registered via dsl_prop_register(), but this callback
 570  * will be called when a file system is first mounted
 571  */
 572 static void
 573 nbmand_changed_cb(void *arg, uint64_t newval)
 574 {
 575         zfsvfs_t *zfsvfs = arg;
 576         if (newval == FALSE) {
 577                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 578                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 579         } else {
 580                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 581                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 582         }
 583 }
 584
 585 static void
 586 snapdir_changed_cb(void *arg, uint64_t newval)
 587 {
 588         zfsvfs_t *zfsvfs = arg;
 589
 590         zfsvfs->z_show_ctldir = newval;
 591 }
 592
 593 static void
 594 acl_mode_changed_cb(void *arg, uint64_t newval)
 595 {
 596         zfsvfs_t *zfsvfs = arg;
 597
 598         zfsvfs->z_acl_mode = newval;
 599 }
 600
 601 static void
 602 acl_inherit_changed_cb(void *arg, uint64_t newval)
 603 {
 604         zfsvfs_t *zfsvfs = arg;
 605
 606         zfsvfs->z_acl_inherit = newval;
 607 }
 608
 609 static void
 610 acl_type_changed_cb(void *arg, uint64_t newval)
 611 {
 612         zfsvfs_t *zfsvfs = arg;
 613
 614         zfsvfs->z_acl_type = newval;
 615 }
 616
 617 static void
 618 longname_changed_cb(void *arg, uint64_t newval)
 619 {
 620         zfsvfs_t *zfsvfs = arg;
 621
 622         zfsvfs->z_longname = newval;
 623 }
 624
/*
 * Register the dataset property callbacks for a mounted file system and
 * re-apply any mount options that were given explicitly.
 *
 * vfsp must be non-NULL and carry a zfsvfs_t in vfs_data.
 *
 * Sequence:
 *  1. Remember which of ro/rw, setuid, exec, xattr and atime were set as
 *     mount options (the do_* flags) and with which value.
 *  2. Under the pool configuration lock, determine nbmand and register
 *     the property callbacks.
 *  3. Invoke the callbacks by hand for every remembered option, then
 *     apply nbmand.
 *
 * Returns 0 on success, EOPNOTSUPP for a snapshot objset, the error from
 * dsl_prop_get_int_ds(), or the first error from dsl_prop_register() (in
 * which case all callbacks registered for this zfsvfs are unregistered).
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
        struct dsl_dataset *ds = NULL;
        objset_t *os = NULL;
        zfsvfs_t *zfsvfs = NULL;
        uint64_t nbmand;
        boolean_t readonly = B_FALSE;
        boolean_t do_readonly = B_FALSE;
        boolean_t setuid = B_FALSE;
        boolean_t do_setuid = B_FALSE;
        boolean_t exec = B_FALSE;
        boolean_t do_exec = B_FALSE;
        /*
         * NOTE(review): xattr is declared boolean_t but is assigned
         * ZFS_XATTR_* values below.
         */
        boolean_t xattr = B_FALSE;
        boolean_t atime = B_FALSE;
        boolean_t do_atime = B_FALSE;
        boolean_t do_xattr = B_FALSE;
        int error = 0;

        ASSERT3P(vfsp, !=, NULL);
        zfsvfs = vfsp->vfs_data;
        ASSERT3P(zfsvfs, !=, NULL);
        os = zfsvfs->z_os;

        /*
         * This function can be called for a snapshot when we update snapshot's
         * mount point, which isn't really supported.
         */
        if (dmu_objset_is_snapshot(os))
                return (EOPNOTSUPP);

        /*
         * The act of registering our callbacks will destroy any mount
         * options we may have.  In order to enable temporary overrides
         * of mount options, we stash away the current values and
         * restore them after we register the callbacks.
         */
        /* A pool that is not writeable forces read-only. */
        if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
            !spa_writeable(dmu_objset_spa(os))) {
                readonly = B_TRUE;
                do_readonly = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
                readonly = B_FALSE;
                do_readonly = B_TRUE;
        }
        if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
                setuid = B_FALSE;
                do_setuid = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
                setuid = B_TRUE;
                do_setuid = B_TRUE;
        }
        if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
                exec = B_FALSE;
                do_exec = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
                exec = B_TRUE;
                do_exec = B_TRUE;
        }
        /*
         * The xattr options also record the chosen mode in z_xattr.
         * MNTOPT_XATTR and MNTOPT_DIRXATTR both select ZFS_XATTR_DIR.
         */
        if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
                zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
                do_xattr = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
                zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
                do_xattr = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
                zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
                do_xattr = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
                zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
                do_xattr = B_TRUE;
        }
        if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
                atime = B_FALSE;
                do_atime = B_TRUE;
        } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
                atime = B_TRUE;
                do_atime = B_TRUE;
        }

        /*
         * We need to enter pool configuration here, so that we can use
         * dsl_prop_get_int_ds() to handle the special nbmand property below.
         * dsl_prop_get_integer() can not be used, because it has to acquire
         * spa_namespace_lock and we can not do that because we already hold
         * z_teardown_lock.  The problem is that spa_write_cachefile() is called
         * with spa_namespace_lock held and the function calls ZFS vnode
         * operations to write the cache file and thus z_teardown_lock is
         * acquired after spa_namespace_lock.
         */
        ds = dmu_objset_ds(os);
        dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

        /*
         * nbmand is a special property.  It can only be changed at
         * mount time.
         *
         * This is weird, but it is documented to only be changeable
         * at mount time.
         */
        /* Mount options win; otherwise fall back to the dataset property. */
        if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
                nbmand = B_FALSE;
        } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
                nbmand = B_TRUE;
        } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
                dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
                return (error);
        }

        /*
         * Register property callbacks.
         *
         * It would probably be fine to just check for i/o error from
         * the first prop_register(), but I guess I like to go
         * overboard...
         */
        /* Once error is non-zero, the remaining registrations are skipped. */
        error = dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
            zfsvfs);
        error = error ? error : dsl_prop_register(ds,
            zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs);
        dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
        if (error)
                goto unregister;

        /*
         * Invoke our callbacks to restore temporary mount options.
         */
        if (do_readonly)
                readonly_changed_cb(zfsvfs, readonly);
        if (do_setuid)
                setuid_changed_cb(zfsvfs, setuid);
        if (do_exec)
                exec_changed_cb(zfsvfs, exec);
        if (do_xattr)
                xattr_changed_cb(zfsvfs, xattr);
        if (do_atime)
                atime_changed_cb(zfsvfs, atime);

        /* nbmand has no registered callback, so it is always applied here. */
        nbmand_changed_cb(zfsvfs, nbmand);

        return (0);

unregister:
        /* Drop whatever callbacks were registered before the failure. */
        dsl_prop_unregister_all(ds, zfsvfs);
        return (error);
}
 790
 791 /*
 792  * Associate this zfsvfs with the given objset, which must be owned.
 793  * This will cache a bunch of on-disk state from the objset in the
 794  * zfsvfs.
 795  */
 796 static int
 797 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 798 {
 799         int error;
 800         uint64_t val;
 801
 802         zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 803         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 804         zfsvfs->z_os = os;
 805
 806         error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 807         if (error != 0)
 808                 return (error);
 809         if (zfsvfs->z_version >
 810             zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 811                 (void) printf("Can't mount a version %lld file system "
 812                     "on a version %lld pool\n. Pool must be upgraded to mount "
 813                     "this file system.", (u_longlong_t)zfsvfs->z_version,
 814                     (u_longlong_t)spa_version(dmu_objset_spa(os)));
 815                 return (SET_ERROR(ENOTSUP));
 816         }
 817         error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 818         if (error != 0)
 819                 return (error);
 820         zfsvfs->z_norm = (int)val;
 821
 822         error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 823         if (error != 0)
 824                 return (error);
 825         zfsvfs->z_utf8 = (val != 0);
 826
 827         error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 828         if (error != 0)
 829                 return (error);
 830         zfsvfs->z_case = (uint_t)val;
 831
 832         error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
 833         if (error != 0)
 834                 return (error);
 835         zfsvfs->z_acl_type = (uint_t)val;
 836
 837         /*
 838          * Fold case on file systems that are always or sometimes case
 839          * insensitive.
 840          */
 841         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 842             zfsvfs->z_case == ZFS_CASE_MIXED)
 843                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 844
 845         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 846         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 847
 848         uint64_t sa_obj = 0;
 849         if (zfsvfs->z_use_sa) {
 850                 /* should either have both of these objects or none */
 851                 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 852                     &sa_obj);
 853                 if (error != 0)
 854                         return (error);
 855
 856                 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
 857                 if (error == 0 && val == ZFS_XATTR_SA)
 858                         zfsvfs->z_xattr_sa = B_TRUE;
 859         }
 860
 861         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 862             &zfsvfs->z_attr_table);
 863         if (error != 0)
 864                 return (error);
 865
 866         if (zfsvfs->z_version >= ZPL_VERSION_SA)
 867                 sa_register_update_callback(os, zfs_sa_upgrade);
 868
 869         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 870             &zfsvfs->z_root);
 871         if (error != 0)
 872                 return (error);
 873         ASSERT3U(zfsvfs->z_root, !=, 0);
 874
 875         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 876             &zfsvfs->z_unlinkedobj);
 877         if (error != 0)
 878                 return (error);
 879
 880         error = zap_lookup(os, MASTER_NODE_OBJ,
 881             zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 882             8, 1, &zfsvfs->z_userquota_obj);
 883         if (error == ENOENT)
 884                 zfsvfs->z_userquota_obj = 0;
 885         else if (error != 0)
 886                 return (error);
 887
 888         error = zap_lookup(os, MASTER_NODE_OBJ,
 889             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 890             8, 1, &zfsvfs->z_groupquota_obj);
 891         if (error == ENOENT)
 892                 zfsvfs->z_groupquota_obj = 0;
 893         else if (error != 0)
 894                 return (error);
 895
 896         error = zap_lookup(os, MASTER_NODE_OBJ,
 897             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
 898             8, 1, &zfsvfs->z_projectquota_obj);
 899         if (error == ENOENT)
 900                 zfsvfs->z_projectquota_obj = 0;
 901         else if (error != 0)
 902                 return (error);
 903
 904         error = zap_lookup(os, MASTER_NODE_OBJ,
 905             zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
 906             8, 1, &zfsvfs->z_userobjquota_obj);
 907         if (error == ENOENT)
 908                 zfsvfs->z_userobjquota_obj = 0;
 909         else if (error != 0)
 910                 return (error);
 911
 912         error = zap_lookup(os, MASTER_NODE_OBJ,
 913             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
 914             8, 1, &zfsvfs->z_groupobjquota_obj);
 915         if (error == ENOENT)
 916                 zfsvfs->z_groupobjquota_obj = 0;
 917         else if (error != 0)
 918                 return (error);
 919
 920         error = zap_lookup(os, MASTER_NODE_OBJ,
 921             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
 922             8, 1, &zfsvfs->z_projectobjquota_obj);
 923         if (error == ENOENT)
 924                 zfsvfs->z_projectobjquota_obj = 0;
 925         else if (error != 0)
 926                 return (error);
 927
 928         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 929             &zfsvfs->z_fuid_obj);
 930         if (error == ENOENT)
 931                 zfsvfs->z_fuid_obj = 0;
 932         else if (error != 0)
 933                 return (error);
 934
 935         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 936             &zfsvfs->z_shares_dir);
 937         if (error == ENOENT)
 938                 zfsvfs->z_shares_dir = 0;
 939         else if (error != 0)
 940                 return (error);
 941
 942         /*
 943          * Only use the name cache if we are looking for a
 944          * name on a file system that does not require normalization
 945          * or case folding.  We can also look there if we happen to be
 946          * on a non-normalizing, mixed sensitivity file system IF we
 947          * are looking for the exact name (which is always the case on
 948          * FreeBSD).
 949          */
 950         zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
 951             ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
 952             !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
 953
 954         return (0);
 955 }
 956
 957 taskq_t *zfsvfs_taskq;
 958
 959 static void
 960 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
 961 {
 962
 963         zfs_unlinked_drain((zfsvfs_t *)context);
 964 }
 965
 966 int
 967 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
 968 {
 969         objset_t *os;
 970         zfsvfs_t *zfsvfs;
 971         int error;
 972         boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
 973
 974         /*
 975          * XXX: Fix struct statfs so this isn't necessary!
 976          *
 977          * The 'osname' is used as the filesystem's special node, which means
 978          * it must fit in statfs.f_mntfromname, or else it can't be
 979          * enumerated, so libzfs_mnttab_find() returns NULL, which causes
 980          * 'zfs unmount' to think it's not mounted when it is.
 981          */
 982         if (strlen(osname) >= MNAMELEN)
 983                 return (SET_ERROR(ENAMETOOLONG));
 984
 985         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 986
 987         error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
 988             &os);
 989         if (error != 0) {
 990                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 991                 return (error);
 992         }
 993
 994         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 995
 996         return (error);
 997 }
 998
 999
1000 int
1001 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1002 {
1003         int error;
1004
1005         zfsvfs->z_vfs = NULL;
1006         zfsvfs->z_parent = zfsvfs;
1007
1008         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1009         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1010         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1011             offsetof(znode_t, z_link_node));
1012         TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1013             zfsvfs_task_unlinked_drain, zfsvfs);
1014         ZFS_TEARDOWN_INIT(zfsvfs);
1015         ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1016         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1017         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1018                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1019
1020         error = zfsvfs_init(zfsvfs, os);
1021         if (error != 0) {
1022                 dmu_objset_disown(os, B_TRUE, zfsvfs);
1023                 *zfvp = NULL;
1024                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1025                 return (error);
1026         }
1027
1028         *zfvp = zfsvfs;
1029         return (0);
1030 }
1031
1032 static int
1033 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1034 {
1035         int error;
1036
1037         /*
1038          * Check for a bad on-disk format version now since we
1039          * lied about owning the dataset readonly before.
1040          */
1041         if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
1042             dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
1043                 return (SET_ERROR(EROFS));
1044
1045         error = zfs_register_callbacks(zfsvfs->z_vfs);
1046         if (error)
1047                 return (error);
1048
1049         /*
1050          * If we are not mounting (ie: online recv), then we don't
1051          * have to worry about replaying the log as we blocked all
1052          * operations out since we closed the ZIL.
1053          */
1054         if (mounting) {
1055                 boolean_t readonly;
1056
1057                 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
1058                 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
1059                 if (error)
1060                         return (error);
1061                 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1062                     &zfsvfs->z_kstat.dk_zil_sums);
1063
1064                 /*
1065                  * During replay we remove the read only flag to
1066                  * allow replays to succeed.
1067                  */
1068                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1069                 if (readonly != 0) {
1070                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1071                 } else {
1072                         dsl_dir_t *dd;
1073                         zap_stats_t zs;
1074
1075                         if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
1076                             &zs) == 0) {
1077                                 dataset_kstats_update_nunlinks_kstat(
1078                                     &zfsvfs->z_kstat, zs.zs_num_entries);
1079                                 dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
1080                                     "num_entries in unlinked set: %llu",
1081                                     (u_longlong_t)zs.zs_num_entries);
1082                         }
1083
1084                         zfs_unlinked_drain(zfsvfs);
1085                         dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1086                         dd->dd_activity_cancelled = B_FALSE;
1087                 }
1088
1089                 /*
1090                  * Parse and replay the intent log.
1091                  *
1092                  * Because of ziltest, this must be done after
1093                  * zfs_unlinked_drain().  (Further note: ziltest
1094                  * doesn't use readonly mounts, where
1095                  * zfs_unlinked_drain() isn't called.)  This is because
1096                  * ziltest causes spa_sync() to think it's committed,
1097                  * but actually it is not, so the intent log contains
1098                  * many txg's worth of changes.
1099                  *
1100                  * In particular, if object N is in the unlinked set in
1101                  * the last txg to actually sync, then it could be
1102                  * actually freed in a later txg and then reallocated
1103                  * in a yet later txg.  This would write a "create
1104                  * object N" record to the intent log.  Normally, this
1105                  * would be fine because the spa_sync() would have
1106                  * written out the fact that object N is free, before
1107                  * we could write the "create object N" intent log
1108                  * record.
1109                  *
1110                  * But when we are in ziltest mode, we advance the "open
1111                  * txg" without actually spa_sync()-ing the changes to
1112                  * disk.  So we would see that object N is still
1113                  * allocated and in the unlinked set, and there is an
1114                  * intent log record saying to allocate it.
1115                  */
1116                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1117                         if (zil_replay_disable) {
1118                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1119                         } else {
1120                                 boolean_t use_nc = zfsvfs->z_use_namecache;
1121                                 zfsvfs->z_use_namecache = B_FALSE;
1122                                 zfsvfs->z_replay = B_TRUE;
1123                                 zil_replay(zfsvfs->z_os, zfsvfs,
1124                                     zfs_replay_vector);
1125                                 zfsvfs->z_replay = B_FALSE;
1126                                 zfsvfs->z_use_namecache = use_nc;
1127                         }
1128                 }
1129
1130                 /* restore readonly bit */
1131                 if (readonly != 0)
1132                         zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1133         } else {
1134                 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
1135                 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1136                     &zfsvfs->z_kstat.dk_zil_sums);
1137         }
1138
1139         /*
1140          * Set the objset user_ptr to track its zfsvfs.
1141          */
1142         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1143         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1144         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1145
1146         return (0);
1147 }
1148
1149 void
1150 zfsvfs_free(zfsvfs_t *zfsvfs)
1151 {
1152         int i;
1153
1154         zfs_fuid_destroy(zfsvfs);
1155
1156         mutex_destroy(&zfsvfs->z_znodes_lock);
1157         mutex_destroy(&zfsvfs->z_lock);
1158         list_destroy(&zfsvfs->z_all_znodes);
1159         ZFS_TEARDOWN_DESTROY(zfsvfs);
1160         ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1161         rw_destroy(&zfsvfs->z_fuid_lock);
1162         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1163                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1164         dataset_kstats_destroy(&zfsvfs->z_kstat);
1165         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1166 }
1167
1168 static void
1169 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1170 {
1171         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1172         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1173 }
1174
1175 static int
1176 zfs_domount(vfs_t *vfsp, char *osname)
1177 {
1178         uint64_t recordsize, fsid_guid;
1179         int error = 0;
1180         zfsvfs_t *zfsvfs;
1181
1182         ASSERT3P(vfsp, !=, NULL);
1183         ASSERT3P(osname, !=, NULL);
1184
1185         error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
1186         if (error)
1187                 return (error);
1188         zfsvfs->z_vfs = vfsp;
1189
1190         if ((error = dsl_prop_get_integer(osname,
1191             "recordsize", &recordsize, NULL)))
1192                 goto out;
1193         zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1194         zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1195
1196         vfsp->vfs_data = zfsvfs;
1197         vfsp->mnt_flag |= MNT_LOCAL;
1198         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1199         vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1200         vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1201         /*
1202          * This can cause a loss of coherence between ARC and page cache
1203          * on ZoF - unclear if the problem is in FreeBSD or ZoF
1204          */
1205         vfsp->mnt_kern_flag |= MNTK_NO_IOPF;    /* vn_io_fault can be used */
1206         vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
1207         vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
1208
1209 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1210         vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
1211 #endif
1212         /*
1213          * The fsid is 64 bits, composed of an 8-bit fs type, which
1214          * separates our fsid from any other filesystem types, and a
1215          * 56-bit objset unique ID.  The objset unique ID is unique to
1216          * all objsets open on this system, provided by unique_create().
1217          * The 8-bit fs type must be put in the low bits of fsid[1]
1218          * because that's where other Solaris filesystems put it.
1219          */
1220         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1221         ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
1222         vfsp->vfs_fsid.val[0] = fsid_guid;
1223         vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
1224             (vfsp->mnt_vfc->vfc_typenum & 0xFF);
1225
1226         /*
1227          * Set features for file system.
1228          */
1229         zfs_set_fuid_feature(zfsvfs);
1230
1231         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1232                 uint64_t pval;
1233
1234                 atime_changed_cb(zfsvfs, B_FALSE);
1235                 readonly_changed_cb(zfsvfs, B_TRUE);
1236                 if ((error = dsl_prop_get_integer(osname,
1237                     "xattr", &pval, NULL)))
1238                         goto out;
1239                 xattr_changed_cb(zfsvfs, pval);
1240                 if ((error = dsl_prop_get_integer(osname,
1241                     "acltype", &pval, NULL)))
1242                         goto out;
1243                 acl_type_changed_cb(zfsvfs, pval);
1244                 zfsvfs->z_issnap = B_TRUE;
1245                 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1246
1247                 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1248                 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1249                 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1250         } else {
1251                 if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
1252                         goto out;
1253         }
1254
1255         vfs_mountedfrom(vfsp, osname);
1256
1257         if (!zfsvfs->z_issnap)
1258                 zfsctl_create(zfsvfs);
1259 out:
1260         if (error) {
1261                 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1262                 zfsvfs_free(zfsvfs);
1263         } else {
1264                 atomic_inc_32(&zfs_active_fs_count);
1265         }
1266
1267         return (error);
1268 }
1269
1270 static void
1271 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1272 {
1273         objset_t *os = zfsvfs->z_os;
1274
1275         if (!dmu_objset_is_snapshot(os))
1276                 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1277 }
1278
1279 static int
1280 getpoolname(const char *osname, char *poolname)
1281 {
1282         char *p;
1283
1284         p = strchr(osname, '/');
1285         if (p == NULL) {
1286                 if (strlen(osname) >= MAXNAMELEN)
1287                         return (ENAMETOOLONG);
1288                 (void) strcpy(poolname, osname);
1289         } else {
1290                 if (p - osname >= MAXNAMELEN)
1291                         return (ENAMETOOLONG);
1292                 (void) strlcpy(poolname, osname, p - osname + 1);
1293         }
1294         return (0);
1295 }
1296
/*
 * Parse the option prefix of a dataset name.  A leading '!' sets
 * '*checkpointrewind' and is stripped from 'name' in place; without it
 * 'name' is left untouched and '*checkpointrewind' is cleared.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	const bool has_bang = (name[0] == '!');

	if (has_bang) {
		/* Shift the rest of the string, NUL included, down by one. */
		memmove(name, name + 1, strlen(name + 1) + 1);
	}
	*checkpointrewind = has_bang;
}
1308
1309 static int
1310 zfs_mount(vfs_t *vfsp)
1311 {
1312         kthread_t       *td = curthread;
1313         vnode_t         *mvp = vfsp->mnt_vnodecovered;
1314         cred_t          *cr = td->td_ucred;
1315         char            *osname;
1316         int             error = 0;
1317         int             canwrite;
1318         bool            checkpointrewind, isctlsnap = false;
1319
1320         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1321                 return (SET_ERROR(EINVAL));
1322
1323         /*
1324          * If full-owner-access is enabled and delegated administration is
1325          * turned on, we must set nosuid.
1326          */
1327         if (zfs_super_owner &&
1328             dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1329                 secpolicy_fs_mount_clearopts(cr, vfsp);
1330         }
1331
1332         fetch_osname_options(osname, &checkpointrewind);
1333         isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
1334             strchr(osname, '@') != NULL);
1335
1336         /*
1337          * Check for mount privilege?
1338          *
1339          * If we don't have privilege then see if
1340          * we have local permission to allow it
1341          */
1342         error = secpolicy_fs_mount(cr, mvp, vfsp);
1343         if (error && isctlsnap) {
1344                 secpolicy_fs_mount_clearopts(cr, vfsp);
1345         } else if (error) {
1346                 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1347                         goto out;
1348
1349                 if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1350                         vattr_t         vattr;
1351
1352                         /*
1353                          * Make sure user is the owner of the mount point
1354                          * or has sufficient privileges.
1355                          */
1356
1357                         vattr.va_mask = AT_UID;
1358
1359                         vn_lock(mvp, LK_SHARED | LK_RETRY);
1360                         if (VOP_GETATTR(mvp, &vattr, cr)) {
1361                                 VOP_UNLOCK(mvp);
1362                                 goto out;
1363                         }
1364
1365                         if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1366                             VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1367                                 VOP_UNLOCK(mvp);
1368                                 goto out;
1369                         }
1370                         VOP_UNLOCK(mvp);
1371                 }
1372
1373                 secpolicy_fs_mount_clearopts(cr, vfsp);
1374         }
1375
1376         /*
1377          * Refuse to mount a filesystem if we are in a local zone and the
1378          * dataset is not visible.
1379          */
1380         if (!INGLOBALZONE(curproc) &&
1381             (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1382                 boolean_t mount_snapshot = B_FALSE;
1383
1384                 /*
1385                  * Snapshots may be mounted in .zfs for unjailed datasets
1386                  * if allowed by the jail param zfs.mount_snapshot.
1387                  */
1388                 if (isctlsnap) {
1389                         struct prison *pr;
1390                         struct zfs_jailparam *zjp;
1391
1392                         pr = curthread->td_ucred->cr_prison;
1393                         mtx_lock(&pr->pr_mtx);
1394                         zjp = osd_jail_get(pr, zfs_jailparam_slot);
1395                         mtx_unlock(&pr->pr_mtx);
1396                         if (zjp && zjp->mount_snapshot)
1397                                 mount_snapshot = B_TRUE;
1398                 }
1399                 if (!mount_snapshot) {
1400                         error = SET_ERROR(EPERM);
1401                         goto out;
1402                 }
1403         }
1404
1405         vfsp->vfs_flag |= MNT_NFS4ACLS;
1406
1407         /*
1408          * When doing a remount, we simply refresh our temporary properties
1409          * according to those options set in the current VFS options.
1410          */
1411         if (vfsp->vfs_flag & MS_REMOUNT) {
1412                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1413
1414                 /*
1415                  * Refresh mount options with z_teardown_lock blocking I/O while
1416                  * the filesystem is in an inconsistent state.
1417                  * The lock also serializes this code with filesystem
1418                  * manipulations between entry to zfs_suspend_fs() and return
1419                  * from zfs_resume_fs().
1420                  */
1421                 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1422                 zfs_unregister_callbacks(zfsvfs);
1423                 error = zfs_register_callbacks(vfsp);
1424                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1425                 goto out;
1426         }
1427
1428         /* Initial root mount: try hard to import the requested root pool. */
1429         if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1430             (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1431                 char pname[MAXNAMELEN];
1432
1433                 error = getpoolname(osname, pname);
1434                 if (error == 0)
1435                         error = spa_import_rootpool(pname, checkpointrewind);
1436                 if (error)
1437                         goto out;
1438         }
1439         DROP_GIANT();
1440         error = zfs_domount(vfsp, osname);
1441         PICKUP_GIANT();
1442
1443 out:
1444         return (error);
1445 }
1446
1447 static int
1448 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1449 {
1450         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1451         uint64_t refdbytes, availbytes, usedobjs, availobjs;
1452         int error;
1453
1454         statp->f_version = STATFS_VERSION;
1455
1456         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1457                 return (error);
1458
1459         dmu_objset_space(zfsvfs->z_os,
1460             &refdbytes, &availbytes, &usedobjs, &availobjs);
1461
1462         /*
1463          * The underlying storage pool actually uses multiple block sizes.
1464          * We report the fragsize as the smallest block size we support,
1465          * and we report our blocksize as the filesystem's maximum blocksize.
1466          */
1467         statp->f_bsize = SPA_MINBLOCKSIZE;
1468         statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1469
1470         /*
1471          * The following report "total" blocks of various kinds in the
1472          * file system, but reported in terms of f_frsize - the
1473          * "fragment" size.
1474          */
1475
1476         statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1477         statp->f_bfree = availbytes / statp->f_bsize;
1478         statp->f_bavail = statp->f_bfree; /* no root reservation */
1479
1480         /*
1481          * statvfs() should really be called statufs(), because it assumes
1482          * static metadata.  ZFS doesn't preallocate files, so the best
1483          * we can do is report the max that could possibly fit in f_files,
1484          * and that minus the number actually used in f_ffree.
1485          * For f_ffree, report the smaller of the number of object available
1486          * and the number of blocks (each object will take at least a block).
1487          */
1488         statp->f_ffree = MIN(availobjs, statp->f_bfree);
1489         statp->f_files = statp->f_ffree + usedobjs;
1490
1491         /*
1492          * We're a zfs filesystem.
1493          */
1494         strlcpy(statp->f_fstypename, "zfs",
1495             sizeof (statp->f_fstypename));
1496
1497         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1498             sizeof (statp->f_mntfromname));
1499         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1500             sizeof (statp->f_mntonname));
1501
1502         statp->f_namemax =
1503             zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1);
1504
1505         zfs_exit(zfsvfs, FTAG);
1506         return (0);
1507 }
1508
1509 static int
1510 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1511 {
1512         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1513         znode_t *rootzp;
1514         int error;
1515
1516         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1517                 return (error);
1518
1519         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1520         if (error == 0)
1521                 *vpp = ZTOV(rootzp);
1522
1523         zfs_exit(zfsvfs, FTAG);
1524
1525         if (error == 0) {
1526                 error = vn_lock(*vpp, flags);
1527                 if (error != 0) {
1528                         VN_RELE(*vpp);
1529                         *vpp = NULL;
1530                 }
1531         }
1532         return (error);
1533 }
1534
1535 /*
1536  * Teardown the zfsvfs::z_os.
1537  *
1538  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1539  * and 'z_teardown_inactive_lock' held.
1540  */
1541 static int
1542 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1543 {
1544         znode_t *zp;
1545         dsl_dir_t *dd;
1546
1547         /*
1548          * If someone has not already unmounted this file system,
1549          * drain the zrele_taskq to ensure all active references to the
1550          * zfsvfs_t have been handled only then can it be safely destroyed.
1551          */
1552         if (zfsvfs->z_os) {
1553                 /*
1554                  * If we're unmounting we have to wait for the list to
1555                  * drain completely.
1556                  *
1557                  * If we're not unmounting there's no guarantee the list
1558                  * will drain completely, but zreles run from the taskq
1559                  * may add the parents of dir-based xattrs to the taskq
1560                  * so we want to wait for these.
1561                  *
1562                  * We can safely check z_all_znodes for being empty because the
1563                  * VFS has already blocked operations which add to it.
1564                  */
1565                 int round = 0;
1566                 while (!list_is_empty(&zfsvfs->z_all_znodes)) {
1567                         taskq_wait_outstanding(dsl_pool_zrele_taskq(
1568                             dmu_objset_pool(zfsvfs->z_os)), 0);
1569                         if (++round > 1 && !unmounting)
1570                                 break;
1571                 }
1572         }
1573         ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1574
1575         if (!unmounting) {
1576                 /*
1577                  * We purge the parent filesystem's vfsp as the parent
1578                  * filesystem and all of its snapshots have their vnode's
1579                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
1580                  * 'z_parent' is self referential for non-snapshots.
1581                  */
1582 #ifdef FREEBSD_NAMECACHE
1583                 cache_purgevfs(zfsvfs->z_parent->z_vfs);
1584 #endif
1585         }
1586
1587         /*
1588          * Close the zil. NB: Can't close the zil while zfs_inactive
1589          * threads are blocked as zil_close can call zfs_inactive.
1590          */
1591         if (zfsvfs->z_log) {
1592                 zil_close(zfsvfs->z_log);
1593                 zfsvfs->z_log = NULL;
1594         }
1595
1596         ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
1597
1598         /*
1599          * If we are not unmounting (ie: online recv) and someone already
1600          * unmounted this file system while we were doing the switcheroo,
1601          * or a reopen of z_os failed then just bail out now.
1602          */
1603         if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1604                 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1605                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1606                 return (SET_ERROR(EIO));
1607         }
1608
1609         /*
1610          * At this point there are no vops active, and any new vops will
1611          * fail with EIO since we have z_teardown_lock for writer (only
1612          * relevant for forced unmount).
1613          *
1614          * Release all holds on dbufs.
1615          */
1616         mutex_enter(&zfsvfs->z_znodes_lock);
1617         for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1618             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1619                 if (zp->z_sa_hdl != NULL) {
1620                         zfs_znode_dmu_fini(zp);
1621                 }
1622         }
1623         mutex_exit(&zfsvfs->z_znodes_lock);
1624
1625         /*
1626          * If we are unmounting, set the unmounted flag and let new vops
1627          * unblock.  zfs_inactive will have the unmounted behavior, and all
1628          * other vops will fail with EIO.
1629          */
1630         if (unmounting) {
1631                 zfsvfs->z_unmounted = B_TRUE;
1632                 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1633                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1634         }
1635
1636         /*
1637          * z_os will be NULL if there was an error in attempting to reopen
1638          * zfsvfs, so just return as the properties had already been
1639          * unregistered and cached data had been evicted before.
1640          */
1641         if (zfsvfs->z_os == NULL)
1642                 return (0);
1643
1644         /*
1645          * Unregister properties.
1646          */
1647         zfs_unregister_callbacks(zfsvfs);
1648
1649         /*
1650          * Evict cached data. We must write out any dirty data before
1651          * disowning the dataset.
1652          */
1653         objset_t *os = zfsvfs->z_os;
1654         boolean_t os_dirty = B_FALSE;
1655         for (int t = 0; t < TXG_SIZE; t++) {
1656                 if (dmu_objset_is_dirty(os, t)) {
1657                         os_dirty = B_TRUE;
1658                         break;
1659                 }
1660         }
1661         if (!zfs_is_readonly(zfsvfs) && os_dirty)
1662                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1663         dmu_objset_evict_dbufs(zfsvfs->z_os);
1664         dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1665         dsl_dir_cancel_waiters(dd);
1666
1667         return (0);
1668 }
1669
1670 static int
1671 zfs_umount(vfs_t *vfsp, int fflag)
1672 {
1673         kthread_t *td = curthread;
1674         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1675         objset_t *os;
1676         cred_t *cr = td->td_ucred;
1677         int ret;
1678
1679         ret = secpolicy_fs_unmount(cr, vfsp);
1680         if (ret) {
1681                 if (dsl_deleg_access((char *)vfsp->vfs_resource,
1682                     ZFS_DELEG_PERM_MOUNT, cr))
1683                         return (ret);
1684         }
1685
1686         /*
1687          * Unmount any snapshots mounted under .zfs before unmounting the
1688          * dataset itself.
1689          */
1690         if (zfsvfs->z_ctldir != NULL) {
1691                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1692                         return (ret);
1693         }
1694
1695         if (fflag & MS_FORCE) {
1696                 /*
1697                  * Mark file system as unmounted before calling
1698                  * vflush(FORCECLOSE). This way we ensure no future vnops
1699                  * will be called and risk operating on DOOMED vnodes.
1700                  */
1701                 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1702                 zfsvfs->z_unmounted = B_TRUE;
1703                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1704         }
1705
1706         /*
1707          * Flush all the files.
1708          */
1709         ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1710         if (ret != 0)
1711                 return (ret);
1712         while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
1713             &zfsvfs->z_unlinked_drain_task, NULL) != 0)
1714                 taskqueue_drain(zfsvfs_taskq->tq_queue,
1715                     &zfsvfs->z_unlinked_drain_task);
1716
1717         VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
1718         os = zfsvfs->z_os;
1719
1720         /*
1721          * z_os will be NULL if there was an error in
1722          * attempting to reopen zfsvfs.
1723          */
1724         if (os != NULL) {
1725                 /*
1726                  * Unset the objset user_ptr.
1727                  */
1728                 mutex_enter(&os->os_user_ptr_lock);
1729                 dmu_objset_set_user(os, NULL);
1730                 mutex_exit(&os->os_user_ptr_lock);
1731
1732                 /*
1733                  * Finally release the objset
1734                  */
1735                 dmu_objset_disown(os, B_TRUE, zfsvfs);
1736         }
1737
1738         /*
1739          * We can now safely destroy the '.zfs' directory node.
1740          */
1741         if (zfsvfs->z_ctldir != NULL)
1742                 zfsctl_destroy(zfsvfs);
1743         zfs_freevfs(vfsp);
1744
1745         return (0);
1746 }
1747
1748 static int
1749 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1750 {
1751         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1752         znode_t         *zp;
1753         int             err;
1754
1755         /*
1756          * zfs_zget() can't operate on virtual entries like .zfs/ or
1757          * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1758          * This will make NFS to switch to LOOKUP instead of using VGET.
1759          */
1760         if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1761             (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1762                 return (EOPNOTSUPP);
1763
1764         if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1765                 return (err);
1766         err = zfs_zget(zfsvfs, ino, &zp);
1767         if (err == 0 && zp->z_unlinked) {
1768                 vrele(ZTOV(zp));
1769                 err = EINVAL;
1770         }
1771         if (err == 0)
1772                 *vpp = ZTOV(zp);
1773         zfs_exit(zfsvfs, FTAG);
1774         if (err == 0) {
1775                 err = vn_lock(*vpp, flags);
1776                 if (err != 0)
1777                         vrele(*vpp);
1778         }
1779         if (err != 0)
1780                 *vpp = NULL;
1781         return (err);
1782 }
1783
1784 static int
1785 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1786     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1787 {
1788         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1789
1790         /*
1791          * If this is regular file system vfsp is the same as
1792          * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1793          * zfsvfs->z_parent->z_vfs represents parent file system
1794          * which we have to use here, because only this file system
1795          * has mnt_export configured.
1796          */
1797         return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1798             credanonp, numsecflavors, secflavors));
1799 }
1800
1801 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1802         "struct fid bigger than SHORT_FID_LEN");
1803 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1804         "struct fid bigger than LONG_FID_LEN");
1805
1806 static int
1807 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1808 {
1809         struct componentname cn;
1810         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1811         znode_t         *zp;
1812         vnode_t         *dvp;
1813         uint64_t        object = 0;
1814         uint64_t        fid_gen = 0;
1815         uint64_t        setgen = 0;
1816         uint64_t        gen_mask;
1817         uint64_t        zp_gen;
1818         int             i, err;
1819
1820         *vpp = NULL;
1821
1822         if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1823                 return (err);
1824
1825         /*
1826          * On FreeBSD we can get snapshot's mount point or its parent file
1827          * system mount point depending if snapshot is already mounted or not.
1828          */
1829         if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1830                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1831                 uint64_t        objsetid = 0;
1832
1833                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1834                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1835
1836                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1837                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1838
1839                 zfs_exit(zfsvfs, FTAG);
1840
1841                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1842                 if (err)
1843                         return (SET_ERROR(EINVAL));
1844                 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1845                         return (err);
1846         }
1847
1848         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1849                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
1850
1851                 for (i = 0; i < sizeof (zfid->zf_object); i++)
1852                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1853
1854                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1855                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1856         } else {
1857                 zfs_exit(zfsvfs, FTAG);
1858                 return (SET_ERROR(EINVAL));
1859         }
1860
1861         if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1862                 zfs_exit(zfsvfs, FTAG);
1863                 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1864                     (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1865                 return (SET_ERROR(EINVAL));
1866         }
1867
1868         /*
1869          * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1870          * directory tree. If the object == zfsvfs->z_shares_dir, then
1871          * we are in the .zfs/shares directory tree.
1872          */
1873         if ((fid_gen == 0 &&
1874             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1875             (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1876                 zfs_exit(zfsvfs, FTAG);
1877                 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1878                 if (object == ZFSCTL_INO_SNAPDIR) {
1879                         cn.cn_nameptr = "snapshot";
1880                         cn.cn_namelen = strlen(cn.cn_nameptr);
1881                         cn.cn_nameiop = LOOKUP;
1882                         cn.cn_flags = ISLASTCN | LOCKLEAF;
1883                         cn.cn_lkflags = flags;
1884                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1885                         vput(dvp);
1886                 } else if (object == zfsvfs->z_shares_dir) {
1887                         /*
1888                          * XXX This branch must not be taken,
1889                          * if it is, then the lookup below will
1890                          * explode.
1891                          */
1892                         cn.cn_nameptr = "shares";
1893                         cn.cn_namelen = strlen(cn.cn_nameptr);
1894                         cn.cn_nameiop = LOOKUP;
1895                         cn.cn_flags = ISLASTCN;
1896                         cn.cn_lkflags = flags;
1897                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1898                         vput(dvp);
1899                 } else {
1900                         *vpp = dvp;
1901                 }
1902                 return (err);
1903         }
1904
1905         gen_mask = -1ULL >> (64 - 8 * i);
1906
1907         dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1908             (u_longlong_t)fid_gen,
1909             (u_longlong_t)gen_mask);
1910         if ((err = zfs_zget(zfsvfs, object, &zp))) {
1911                 zfs_exit(zfsvfs, FTAG);
1912                 return (err);
1913         }
1914         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1915             sizeof (uint64_t));
1916         zp_gen = zp_gen & gen_mask;
1917         if (zp_gen == 0)
1918                 zp_gen = 1;
1919         if (zp->z_unlinked || zp_gen != fid_gen) {
1920                 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1921                     (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1922                 vrele(ZTOV(zp));
1923                 zfs_exit(zfsvfs, FTAG);
1924                 return (SET_ERROR(EINVAL));
1925         }
1926
1927         *vpp = ZTOV(zp);
1928         zfs_exit(zfsvfs, FTAG);
1929         err = vn_lock(*vpp, flags);
1930         if (err == 0)
1931                 vnode_create_vobject(*vpp, zp->z_size, curthread);
1932         else
1933                 *vpp = NULL;
1934         return (err);
1935 }
1936
1937 /*
1938  * Block out VOPs and close zfsvfs_t::z_os
1939  *
1940  * Note, if successful, then we return with the 'z_teardown_lock' and
1941  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1942  * dataset and objset intact so that they can be atomically handed off during
1943  * a subsequent rollback or recv operation and the resume thereafter.
1944  */
1945 int
1946 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1947 {
1948         int error;
1949
1950         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1951                 return (error);
1952
1953         return (0);
1954 }
1955
1956 /*
1957  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1958  * is an invariant across any of the operations that can be performed while the
1959  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1960  * are the same: the relevant objset and associated dataset are owned by
1961  * zfsvfs, held, and long held on entry.
1962  */
1963 int
1964 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1965 {
1966         int err;
1967         znode_t *zp;
1968
1969         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
1970         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
1971
1972         /*
1973          * We already own this, so just update the objset_t, as the one we
1974          * had before may have been evicted.
1975          */
1976         objset_t *os;
1977         VERIFY3P(ds->ds_owner, ==, zfsvfs);
1978         VERIFY(dsl_dataset_long_held(ds));
1979         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1980         dsl_pool_config_enter(dp, FTAG);
1981         VERIFY0(dmu_objset_from_ds(ds, &os));
1982         dsl_pool_config_exit(dp, FTAG);
1983
1984         err = zfsvfs_init(zfsvfs, os);
1985         if (err != 0)
1986                 goto bail;
1987
1988         ds->ds_dir->dd_activity_cancelled = B_FALSE;
1989         VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
1990
1991         zfs_set_fuid_feature(zfsvfs);
1992
1993         /*
1994          * Attempt to re-establish all the active znodes with
1995          * their dbufs.  If a zfs_rezget() fails, then we'll let
1996          * any potential callers discover that via zfs_enter_verify_zp
1997          * when they try to use their znode.
1998          */
1999         mutex_enter(&zfsvfs->z_znodes_lock);
2000         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2001             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2002                 (void) zfs_rezget(zp);
2003         }
2004         mutex_exit(&zfsvfs->z_znodes_lock);
2005
2006 bail:
2007         /* release the VOPs */
2008         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2009         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2010
2011         if (err) {
2012                 /*
2013                  * Since we couldn't setup the sa framework, try to force
2014                  * unmount this file system.
2015                  */
2016                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
2017                         vfs_ref(zfsvfs->z_vfs);
2018                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2019                 }
2020         }
2021         return (err);
2022 }
2023
2024 static void
2025 zfs_freevfs(vfs_t *vfsp)
2026 {
2027         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2028
2029         zfsvfs_free(zfsvfs);
2030
2031         atomic_dec_32(&zfs_active_fs_count);
2032 }
2033
2034 #ifdef __i386__
2035 static int desiredvnodes_backup;
2036 #include <sys/vmmeter.h>
2037
2038
2039 #include <vm/vm_page.h>
2040 #include <vm/vm_object.h>
2041 #include <vm/vm_kern.h>
2042 #include <vm/vm_map.h>
2043 #endif
2044
/*
 * On i386 only: remember the current desiredvnodes and, if it still has its
 * default value, lower it to three quarters.  A no-op elsewhere.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
        int default_vnodes;

        desiredvnodes_backup = desiredvnodes;

        /*
         * Recompute the value the same way vntblinit() does.  When it
         * matches desiredvnodes the administrator has not tuned it, so
         * it is safe for us to tune it down.
         */
        default_vnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
            vm_kmem_size / (5 * (sizeof (struct vm_object) +
            sizeof (struct vnode))));
        if (desiredvnodes == default_vnodes) {
                desiredvnodes = (3 * default_vnodes) / 4;
        }
#endif
}
2065
/*
 * On i386 only: restore desiredvnodes to the value saved by
 * zfs_vnodes_adjust().  A no-op elsewhere.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
        desiredvnodes = desiredvnodes_backup;
#endif
}
2074
2075 static struct sx zfs_vnlru_lock;
2076 static struct vnode *zfs_vnlru_marker;
2077 static arc_prune_t *zfs_prune;
2078
2079 static void
2080 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2081 {
2082         if (nr_to_scan > INT_MAX)
2083                 nr_to_scan = INT_MAX;
2084         sx_xlock(&zfs_vnlru_lock);
2085         vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2086         sx_xunlock(&zfs_vnlru_lock);
2087 }
2088
2089 void
2090 zfs_init(void)
2091 {
2092
2093         printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2094
2095         /*
2096          * Initialize .zfs directory structures
2097          */
2098         zfsctl_init();
2099
2100         /*
2101          * Initialize znode cache, vnode ops, etc...
2102          */
2103         zfs_znode_init();
2104
2105         /*
2106          * Reduce number of vnodes. Originally number of vnodes is calculated
2107          * with UFS inode in mind. We reduce it here, because it's too big for
2108          * ZFS/i386.
2109          */
2110         zfs_vnodes_adjust();
2111
2112         dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
2113
2114         zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2115
2116         zfs_vnlru_marker = vnlru_alloc_marker();
2117         sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
2118         zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
2119 }
2120
2121 void
2122 zfs_fini(void)
2123 {
2124         arc_remove_prune_callback(zfs_prune);
2125         vnlru_free_marker(zfs_vnlru_marker);
2126         sx_destroy(&zfs_vnlru_lock);
2127
2128         taskq_destroy(zfsvfs_taskq);
2129         zfsctl_fini();
2130         zfs_znode_fini();
2131         zfs_vnodes_adjust_back();
2132 }
2133
2134 int
2135 zfs_busy(void)
2136 {
2137         return (zfs_active_fs_count != 0);
2138 }
2139
2140 /*
2141  * Release VOPs and unmount a suspended filesystem.
2142  */
2143 int
2144 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2145 {
2146         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2147         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2148
2149         /*
2150          * We already own this, so just hold and rele it to update the
2151          * objset_t, as the one we had before may have been evicted.
2152          */
2153         objset_t *os;
2154         VERIFY3P(ds->ds_owner, ==, zfsvfs);
2155         VERIFY(dsl_dataset_long_held(ds));
2156         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2157         dsl_pool_config_enter(dp, FTAG);
2158         VERIFY0(dmu_objset_from_ds(ds, &os));
2159         dsl_pool_config_exit(dp, FTAG);
2160         zfsvfs->z_os = os;
2161
2162         /* release the VOPs */
2163         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2164         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2165
2166         /*
2167          * Try to force unmount this file system.
2168          */
2169         (void) zfs_umount(zfsvfs->z_vfs, 0);
2170         zfsvfs->z_unmounted = B_TRUE;
2171         return (0);
2172 }
2173
2174 int
2175 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2176 {
2177         int error;
2178         objset_t *os = zfsvfs->z_os;
2179         dmu_tx_t *tx;
2180
2181         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2182                 return (SET_ERROR(EINVAL));
2183
2184         if (newvers < zfsvfs->z_version)
2185                 return (SET_ERROR(EINVAL));
2186
2187         if (zfs_spa_version_map(newvers) >
2188             spa_version(dmu_objset_spa(zfsvfs->z_os)))
2189                 return (SET_ERROR(ENOTSUP));
2190
2191         tx = dmu_tx_create(os);
2192         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2193         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2194                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2195                     ZFS_SA_ATTRS);
2196                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2197         }
2198         error = dmu_tx_assign(tx, TXG_WAIT);
2199         if (error) {
2200                 dmu_tx_abort(tx);
2201                 return (error);
2202         }
2203
2204         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2205             8, 1, &newvers, tx);
2206
2207         if (error) {
2208                 dmu_tx_commit(tx);
2209                 return (error);
2210         }
2211
2212         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2213                 uint64_t sa_obj;
2214
2215                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2216                     SPA_VERSION_SA);
2217                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2218                     DMU_OT_NONE, 0, tx);
2219
2220                 error = zap_add(os, MASTER_NODE_OBJ,
2221                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2222                 ASSERT0(error);
2223
2224                 VERIFY0(sa_set_sa_object(os, sa_obj));
2225                 sa_register_update_callback(os, zfs_sa_upgrade);
2226         }
2227
2228         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2229             "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2230             (uintmax_t)newvers);
2231         dmu_tx_commit(tx);
2232
2233         zfsvfs->z_version = newvers;
2234         os->os_version = newvers;
2235
2236         zfs_set_fuid_feature(zfsvfs);
2237
2238         return (0);
2239 }
2240
2241 /*
2242  * Return true if the corresponding vfs's unmounted flag is set.
2243  * Otherwise return false.
2244  * If this function returns true we know VFS unmount has been initiated.
2245  */
2246 boolean_t
2247 zfs_get_vfs_flag_unmounted(objset_t *os)
2248 {
2249         zfsvfs_t *zfvp;
2250         boolean_t unmounted = B_FALSE;
2251
2252         ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2253
2254         mutex_enter(&os->os_user_ptr_lock);
2255         zfvp = dmu_objset_get_user(os);
2256         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2257             (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2258                 unmounted = B_TRUE;
2259         mutex_exit(&os->os_user_ptr_lock);
2260
2261         return (unmounted);
2262 }
2263
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname for every mount affected by a rename from 'oldname'
 * to 'newname'.  An entry is affected when it equals 'oldname' or starts
 * with 'oldname' followed by '/' or '@'.  The mount list is walked under
 * mountlist_mtx.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
        char renamed[MAXPATHLEN];
        struct mount *mp;
        char *from;
        size_t prefixlen = strlen(oldname);

        mtx_lock(&mountlist_mtx);
        TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                from = mp->mnt_stat.f_mntfromname;

                if (strcmp(from, oldname) == 0) {
                        /* Exact match: the renamed dataset itself. */
                        (void) strlcpy(from, newname,
                            sizeof (mp->mnt_stat.f_mntfromname));
                } else if (strncmp(from, oldname, prefixlen) == 0 &&
                    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
                        /*
                         * Prefix match: swap the old prefix for the new
                         * name and keep the remainder.  The result is
                         * built in a scratch buffer because the source
                         * suffix lives in the destination.
                         */
                        (void) snprintf(renamed, sizeof (renamed), "%s%s",
                            newname, from + prefixlen);
                        (void) strlcpy(from, renamed,
                            sizeof (mp->mnt_stat.f_mntfromname));
                }
        }
        mtx_unlock(&mountlist_mtx);
}
#endif
2295
2296 /*
2297  * Find a prison with ZFS info.
2298  * Return the ZFS info and the (locked) prison.
2299  */
2300 static struct zfs_jailparam *
2301 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2302 {
2303         struct prison *pr;
2304         struct zfs_jailparam *zjp;
2305
2306         for (pr = spr; ; pr = pr->pr_parent) {
2307                 mtx_lock(&pr->pr_mtx);
2308                 if (pr == &prison0) {
2309                         zjp = &zfs_jailparam0;
2310                         break;
2311                 }
2312                 zjp = osd_jail_get(pr, zfs_jailparam_slot);
2313                 if (zjp != NULL)
2314                         break;
2315                 mtx_unlock(&pr->pr_mtx);
2316         }
2317         *prp = pr;
2318
2319         return (zjp);
2320 }
2321
2322 /*
2323  * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
2324  * ZFS info and lock the prison.
2325  */
2326 static void
2327 zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2328 {
2329         struct prison *ppr;
2330         struct zfs_jailparam *zjp, *nzjp;
2331         void **rsv;
2332
2333         /* If this prison already has ZFS info, return that. */
2334         zjp = zfs_jailparam_find(pr, &ppr);
2335         if (ppr == pr)
2336                 goto done;
2337
2338         /*
2339          * Allocate a new info record.  Then check again, in case something
2340          * changed during the allocation.
2341          */
2342         mtx_unlock(&ppr->pr_mtx);
2343         nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2344         rsv = osd_reserve(zfs_jailparam_slot);
2345         zjp = zfs_jailparam_find(pr, &ppr);
2346         if (ppr == pr) {
2347                 free(nzjp, M_PRISON);
2348                 osd_free_reserved(rsv);
2349                 goto done;
2350         }
2351         /* Inherit the initial values from the ancestor. */
2352         mtx_lock(&pr->pr_mtx);
2353         (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2354         (void) memcpy(nzjp, zjp, sizeof (*zjp));
2355         zjp = nzjp;
2356         mtx_unlock(&ppr->pr_mtx);
2357 done:
2358         if (zjpp != NULL)
2359                 *zjpp = zjp;
2360         else
2361                 mtx_unlock(&pr->pr_mtx);
2362 }
2363
2364 /*
2365  * Jail OSD methods for ZFS VFS info.
2366  */
2367 static int
2368 zfs_jailparam_create(void *obj, void *data)
2369 {
2370         struct prison *pr = obj;
2371         struct vfsoptlist *opts = data;
2372         int jsys;
2373
2374         if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2375             jsys == JAIL_SYS_INHERIT)
2376                 return (0);
2377         /*
2378          * Inherit a prison's initial values from its parent
2379          * (different from JAIL_SYS_INHERIT which also inherits changes).
2380          */
2381         zfs_jailparam_alloc(pr, NULL);
2382         return (0);
2383 }
2384
2385 static int
2386 zfs_jailparam_get(void *obj, void *data)
2387 {
2388         struct prison *ppr, *pr = obj;
2389         struct vfsoptlist *opts = data;
2390         struct zfs_jailparam *zjp;
2391         int jsys, error;
2392
2393         zjp = zfs_jailparam_find(pr, &ppr);
2394         jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2395         error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2396         if (error != 0 && error != ENOENT)
2397                 goto done;
2398         if (jsys == JAIL_SYS_NEW) {
2399                 error = vfs_setopt(opts, "zfs.mount_snapshot",
2400                     &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2401                 if (error != 0 && error != ENOENT)
2402                         goto done;
2403         } else {
2404                 /*
2405                  * If this prison is inheriting its ZFS info, report
2406                  * empty/zero parameters.
2407                  */
2408                 static int mount_snapshot = 0;
2409
2410                 error = vfs_setopt(opts, "zfs.mount_snapshot",
2411                     &mount_snapshot, sizeof (mount_snapshot));
2412                 if (error != 0 && error != ENOENT)
2413                         goto done;
2414         }
2415         error = 0;
2416 done:
2417         mtx_unlock(&ppr->pr_mtx);
2418         return (error);
2419 }
2420
2421 static int
2422 zfs_jailparam_set(void *obj, void *data)
2423 {
2424         struct prison *pr = obj;
2425         struct prison *ppr;
2426         struct vfsoptlist *opts = data;
2427         int error, jsys, mount_snapshot;
2428
2429         /* Set the parameters, which should be correct. */
2430         error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2431         if (error == ENOENT)
2432                 jsys = -1;
2433         error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2434             sizeof (mount_snapshot));
2435         if (error == ENOENT)
2436                 mount_snapshot = -1;
2437         else
2438                 jsys = JAIL_SYS_NEW;
2439         switch (jsys) {
2440         case JAIL_SYS_NEW:
2441         {
2442                 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2443                 struct zfs_jailparam *zjp;
2444
2445                 /*
2446                  * A child jail cannot have more permissions than its parent
2447                  */
2448                 if (pr->pr_parent != &prison0) {
2449                         zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2450                         mtx_unlock(&ppr->pr_mtx);
2451                         if (zjp->mount_snapshot < mount_snapshot) {
2452                                 return (EPERM);
2453                         }
2454                 }
2455                 zfs_jailparam_alloc(pr, &zjp);
2456                 if (mount_snapshot != -1)
2457                         zjp->mount_snapshot = mount_snapshot;
2458                 mtx_unlock(&pr->pr_mtx);
2459                 break;
2460         }
2461         case JAIL_SYS_INHERIT:
2462                 /* "zfs=inherit": inherit the parent's ZFS info. */
2463                 mtx_lock(&pr->pr_mtx);
2464                 osd_jail_del(pr, zfs_jailparam_slot);
2465                 mtx_unlock(&pr->pr_mtx);
2466                 break;
2467         case -1:
2468                 /*
2469                  * If the setting being changed is not ZFS related
2470                  * then do nothing.
2471                  */
2472                 break;
2473         }
2474
2475         return (0);
2476 }
2477
2478 static int
2479 zfs_jailparam_check(void *obj __unused, void *data)
2480 {
2481         struct vfsoptlist *opts = data;
2482         int error, jsys, mount_snapshot;
2483
2484         /* Check that the parameters are correct. */
2485         error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2486         if (error != ENOENT) {
2487                 if (error != 0)
2488                         return (error);
2489                 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2490                         return (EINVAL);
2491         }
2492         error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2493             sizeof (mount_snapshot));
2494         if (error != ENOENT) {
2495                 if (error != 0)
2496                         return (error);
2497                 if (mount_snapshot != 0 && mount_snapshot != 1)
2498                         return (EINVAL);
2499         }
2500         return (0);
2501 }
2502
2503 static void
2504 zfs_jailparam_destroy(void *data)
2505 {
2506
2507         free(data, M_PRISON);
2508 }
2509
2510 static void
2511 zfs_jailparam_sysinit(void *arg __unused)
2512 {
2513         struct prison *pr;
2514         osd_method_t  methods[PR_MAXMETHOD] = {
2515                 [PR_METHOD_CREATE] = zfs_jailparam_create,
2516                 [PR_METHOD_GET] = zfs_jailparam_get,
2517                 [PR_METHOD_SET] = zfs_jailparam_set,
2518                 [PR_METHOD_CHECK] = zfs_jailparam_check,
2519         };
2520
2521         zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2522         /* Copy the defaults to any existing prisons. */
2523         sx_slock(&allprison_lock);
2524         TAILQ_FOREACH(pr, &allprison, pr_list)
2525                 zfs_jailparam_alloc(pr, NULL);
2526         sx_sunlock(&allprison_lock);
2527 }
2528
2529 static void
2530 zfs_jailparam_sysuninit(void *arg __unused)
2531 {
2532
2533         osd_jail_deregister(zfs_jailparam_slot);
2534 }
2535
/*
 * Hook the jail parameter setup and teardown functions into the
 * SI_SUB_DRIVERS stage with SI_ORDER_ANY.
 */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysuninit, NULL);