kernel/fs/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Integros [integros.com]
  25  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  26  */
  27
  28 /* Portions Copyright 2010 Robert Milkowski */
  29
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/kmem.h>
  35 #include <sys/pathname.h>
  36 #include <sys/vnode.h>
  37 #include <sys/vfs.h>
  38 #include <sys/mntent.h>
  39 #include <sys/mount.h>
  40 #include <sys/cmn_err.h>
  41 #include "sys/fs_subr.h"
  42 #include <sys/zfs_znode.h>
  43 #include <sys/zfs_dir.h>
  44 #include <sys/zil.h>
  45 #include <sys/fs/zfs.h>
  46 #include <sys/dmu.h>
  47 #include <sys/dsl_prop.h>
  48 #include <sys/dsl_dataset.h>
  49 #include <sys/dsl_deleg.h>
  50 #include <sys/spa.h>
  51 #include <sys/zap.h>
  52 #include <sys/sa.h>
  53 #include <sys/sa_impl.h>
  54 #include <sys/varargs.h>
  55 #include <sys/policy.h>
  56 #include <sys/atomic.h>
  57 #include <sys/mkdev.h>
  58 #include <sys/modctl.h>
  59 #include <sys/refstr.h>
  60 #include <sys/zfs_ioctl.h>
  61 #include <sys/zfs_ctldir.h>
  62 #include <sys/zfs_fuid.h>
  63 #include <sys/bootconf.h>
  64 #include <sys/sunddi.h>
  65 #include <sys/dnlc.h>
  66 #include <sys/dmu_objset.h>
  67 #include <sys/spa_boot.h>
  68 #include "zfs_comutil.h"
  69
  70 int zfsfstype;
  71 static major_t zfs_major;
  72 static minor_t zfs_minor;
  73 static kmutex_t zfs_dev_mtx;
  74
  75 extern int sys_shutdown;
  76
  77 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
  78 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
  79 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
  80 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
  81 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
  82 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
  83 static void zfs_freevfs(vfs_t *vfsp);
  84
  85 static const struct vfsops zfs_vfsops = {
  86         .vfs_mount = zfs_mount,
  87         .vfs_mountroot = zfs_mountroot,
  88         .vfs_unmount = zfs_umount,
  89         .vfs_root = zfs_root,
  90         .vfs_statvfs = zfs_statvfs,
  91         .vfs_sync = zfs_sync,
  92         .vfs_vget = zfs_vget,
  93         .vfs_freevfs = zfs_freevfs,
  94 };
  95
  96 /*
  97  * We need to keep a count of active fs's.
  98  * This is necessary to prevent our module
  99  * from being unloaded after a umount -f
 100  */
 101 static uint32_t zfs_active_fs_count = 0;
 102
 103 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
 104 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
 105 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
 106 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
 107
 108 /*
 109  * MO_DEFAULT is not used since the default value is determined
 110  * by the equivalent property.
 111  */
 112 static mntopt_t mntopts[] = {
 113         { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
 114         { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
 115         { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
 116         { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
 117 };
 118
 119 static mntopts_t zfs_mntopts = {
 120         sizeof (mntopts) / sizeof (mntopt_t),
 121         mntopts
 122 };
 123
 124 /*ARGSUSED*/
 125 int
 126 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 127 {
 128         /*
 129          * Data integrity is job one.  We don't want a compromised kernel
 130          * writing to the storage pool, so we never sync during panic.
 131          */
 132         if (panicstr)
 133                 return (0);
 134
 135         /*
 136          * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
 137          * to sync metadata, which they would otherwise cache indefinitely.
 138          * Semantically, the only requirement is that the sync be initiated.
 139          * The DMU syncs out txgs frequently, so there's nothing to do.
 140          */
 141         if (flag & SYNC_ATTR)
 142                 return (0);
 143
 144         if (vfsp != NULL) {
 145                 /*
 146                  * Sync a specific filesystem.
 147                  */
 148                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 149                 dsl_pool_t *dp;
 150
 151                 ZFS_ENTER(zfsvfs);
 152                 dp = dmu_objset_pool(zfsvfs->z_os);
 153
 154                 /*
 155                  * If the system is shutting down, then skip any
 156                  * filesystems which may exist on a suspended pool.
 157                  */
 158                 if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 159                         ZFS_EXIT(zfsvfs);
 160                         return (0);
 161                 }
 162
 163                 if (zfsvfs->z_log != NULL)
 164                         zil_commit(zfsvfs->z_log, 0);
 165
 166                 ZFS_EXIT(zfsvfs);
 167         } else {
 168                 /*
 169                  * Sync all ZFS filesystems.  This is what happens when you
 170                  * run sync(1M).  Unlike other filesystems, ZFS honors the
 171                  * request by waiting for all pools to commit all dirty data.
 172                  */
 173                 spa_sync_allpools();
 174         }
 175
 176         return (0);
 177 }
 178
 179 static int
 180 zfs_create_unique_device(dev_t *dev)
 181 {
 182         major_t new_major;
 183
 184         do {
 185                 ASSERT3U(zfs_minor, <=, MAXMIN32);
 186                 minor_t start = zfs_minor;
 187                 do {
 188                         mutex_enter(&zfs_dev_mtx);
 189                         if (zfs_minor >= MAXMIN32) {
 190                                 /*
 191                                  * If we're still using the real major
 192                                  * keep out of /dev/zfs and /dev/zvol minor
 193                                  * number space.  If we're using a getudev()'ed
 194                                  * major number, we can use all of its minors.
 195                                  */
 196                                 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 197                                         zfs_minor = ZFS_MIN_MINOR;
 198                                 else
 199                                         zfs_minor = 0;
 200                         } else {
 201                                 zfs_minor++;
 202                         }
 203                         *dev = makedevice(zfs_major, zfs_minor);
 204                         mutex_exit(&zfs_dev_mtx);
 205                 } while (vfs_devismounted(*dev) && zfs_minor != start);
 206                 if (zfs_minor == start) {
 207                         /*
 208                          * We are using all ~262,000 minor numbers for the
 209                          * current major number.  Create a new major number.
 210                          */
 211                         if ((new_major = getudev()) == (major_t)-1) {
 212                                 cmn_err(CE_WARN,
 213                                     "zfs_mount: Can't get unique major "
 214                                     "device number.");
 215                                 return (-1);
 216                         }
 217                         mutex_enter(&zfs_dev_mtx);
 218                         zfs_major = new_major;
 219                         zfs_minor = 0;
 220
 221                         mutex_exit(&zfs_dev_mtx);
 222                 } else {
 223                         break;
 224                 }
 225                 /* CONSTANTCONDITION */
 226         } while (1);
 227
 228         return (0);
 229 }
 230
 231 static void
 232 atime_changed_cb(void *arg, uint64_t newval)
 233 {
 234         zfsvfs_t *zfsvfs = arg;
 235
 236         if (newval == TRUE) {
 237                 zfsvfs->z_atime = TRUE;
 238                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 239                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 240         } else {
 241                 zfsvfs->z_atime = FALSE;
 242                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 243                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 244         }
 245 }
 246
 247 static void
 248 xattr_changed_cb(void *arg, uint64_t newval)
 249 {
 250         zfsvfs_t *zfsvfs = arg;
 251
 252         if (newval == TRUE) {
 253                 /* XXX locking on vfs_flag? */
 254                 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 255                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 256                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 257         } else {
 258                 /* XXX locking on vfs_flag? */
 259                 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 260                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 261                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 262         }
 263 }
 264
 265 static void
 266 blksz_changed_cb(void *arg, uint64_t newval)
 267 {
 268         zfsvfs_t *zfsvfs = arg;
 269         ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 270         ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 271         ASSERT(ISP2(newval));
 272
 273         zfsvfs->z_max_blksz = newval;
 274         zfsvfs->z_vfs->vfs_bsize = newval;
 275 }
 276
 277 static void
 278 readonly_changed_cb(void *arg, uint64_t newval)
 279 {
 280         zfsvfs_t *zfsvfs = arg;
 281
 282         if (newval) {
 283                 /* XXX locking on vfs_flag? */
 284                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 285                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 286                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 287         } else {
 288                 /* XXX locking on vfs_flag? */
 289                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 290                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 291                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 292         }
 293 }
 294
 295 static void
 296 devices_changed_cb(void *arg, uint64_t newval)
 297 {
 298         zfsvfs_t *zfsvfs = arg;
 299
 300         if (newval == FALSE) {
 301                 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
 302                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
 303                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
 304         } else {
 305                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
 306                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
 307                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
 308         }
 309 }
 310
 311 static void
 312 setuid_changed_cb(void *arg, uint64_t newval)
 313 {
 314         zfsvfs_t *zfsvfs = arg;
 315
 316         if (newval == FALSE) {
 317                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 318                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 319                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 320         } else {
 321                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 322                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 323                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 324         }
 325 }
 326
 327 static void
 328 exec_changed_cb(void *arg, uint64_t newval)
 329 {
 330         zfsvfs_t *zfsvfs = arg;
 331
 332         if (newval == FALSE) {
 333                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 334                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 335                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 336         } else {
 337                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 338                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 339                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 340         }
 341 }
 342
 343 /*
 344  * The nbmand mount option can be changed at mount time.
 345  * We can't allow it to be toggled on live file systems or incorrect
 346  * behavior may be seen from cifs clients
 347  *
 348  * This property isn't registered via dsl_prop_register(), but this callback
 349  * will be called when a file system is first mounted
 350  */
 351 static void
 352 nbmand_changed_cb(void *arg, uint64_t newval)
 353 {
 354         zfsvfs_t *zfsvfs = arg;
 355         if (newval == FALSE) {
 356                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 357                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 358         } else {
 359                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 360                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 361         }
 362 }
 363
 364 static void
 365 snapdir_changed_cb(void *arg, uint64_t newval)
 366 {
 367         zfsvfs_t *zfsvfs = arg;
 368
 369         zfsvfs->z_show_ctldir = newval;
 370 }
 371
 372 static void
 373 vscan_changed_cb(void *arg, uint64_t newval)
 374 {
 375         zfsvfs_t *zfsvfs = arg;
 376
 377         zfsvfs->z_vscan = newval;
 378 }
 379
 380 static void
 381 acl_mode_changed_cb(void *arg, uint64_t newval)
 382 {
 383         zfsvfs_t *zfsvfs = arg;
 384
 385         zfsvfs->z_acl_mode = newval;
 386 }
 387
 388 static void
 389 acl_inherit_changed_cb(void *arg, uint64_t newval)
 390 {
 391         zfsvfs_t *zfsvfs = arg;
 392
 393         zfsvfs->z_acl_inherit = newval;
 394 }
 395
 396 static int
 397 zfs_register_callbacks(vfs_t *vfsp)
 398 {
 399         struct dsl_dataset *ds = NULL;
 400         objset_t *os = NULL;
 401         zfsvfs_t *zfsvfs = NULL;
 402         uint64_t nbmand;
 403         boolean_t readonly = B_FALSE;
 404         boolean_t do_readonly = B_FALSE;
 405         boolean_t setuid = B_FALSE;
 406         boolean_t do_setuid = B_FALSE;
 407         boolean_t exec = B_FALSE;
 408         boolean_t do_exec = B_FALSE;
 409         boolean_t devices = B_FALSE;
 410         boolean_t do_devices = B_FALSE;
 411         boolean_t xattr = B_FALSE;
 412         boolean_t do_xattr = B_FALSE;
 413         boolean_t atime = B_FALSE;
 414         boolean_t do_atime = B_FALSE;
 415         int error = 0;
 416
 417         ASSERT(vfsp);
 418         zfsvfs = vfsp->vfs_data;
 419         ASSERT(zfsvfs);
 420         os = zfsvfs->z_os;
 421
 422         /*
 423          * The act of registering our callbacks will destroy any mount
 424          * options we may have.  In order to enable temporary overrides
 425          * of mount options, we stash away the current values and
 426          * restore them after we register the callbacks.
 427          */
 428         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 429             !spa_writeable(dmu_objset_spa(os))) {
 430                 readonly = B_TRUE;
 431                 do_readonly = B_TRUE;
 432         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 433                 readonly = B_FALSE;
 434                 do_readonly = B_TRUE;
 435         }
 436         if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 437                 devices = B_FALSE;
 438                 setuid = B_FALSE;
 439                 do_devices = B_TRUE;
 440                 do_setuid = B_TRUE;
 441         } else {
 442                 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
 443                         devices = B_FALSE;
 444                         do_devices = B_TRUE;
 445                 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
 446                         devices = B_TRUE;
 447                         do_devices = B_TRUE;
 448                 }
 449
 450                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 451                         setuid = B_FALSE;
 452                         do_setuid = B_TRUE;
 453                 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 454                         setuid = B_TRUE;
 455                         do_setuid = B_TRUE;
 456                 }
 457         }
 458         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 459                 exec = B_FALSE;
 460                 do_exec = B_TRUE;
 461         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 462                 exec = B_TRUE;
 463                 do_exec = B_TRUE;
 464         }
 465         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 466                 xattr = B_FALSE;
 467                 do_xattr = B_TRUE;
 468         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 469                 xattr = B_TRUE;
 470                 do_xattr = B_TRUE;
 471         }
 472         if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 473                 atime = B_FALSE;
 474                 do_atime = B_TRUE;
 475         } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 476                 atime = B_TRUE;
 477                 do_atime = B_TRUE;
 478         }
 479
 480         /*
 481          * nbmand is a special property.  It can only be changed at
 482          * mount time.
 483          *
 484          * This is weird, but it is documented to only be changeable
 485          * at mount time.
 486          */
 487         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 488                 nbmand = B_FALSE;
 489         } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 490                 nbmand = B_TRUE;
 491         } else {
 492                 char osname[ZFS_MAX_DATASET_NAME_LEN];
 493
 494                 dmu_objset_name(os, osname);
 495                 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 496                     NULL)) {
 497                         return (error);
 498                 }
 499         }
 500
 501         /*
 502          * Register property callbacks.
 503          *
 504          * It would probably be fine to just check for i/o error from
 505          * the first prop_register(), but I guess I like to go
 506          * overboard...
 507          */
 508         ds = dmu_objset_ds(os);
 509         dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 510         error = dsl_prop_register(ds,
 511             zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 512         error = error ? error : dsl_prop_register(ds,
 513             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 514         error = error ? error : dsl_prop_register(ds,
 515             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 516         error = error ? error : dsl_prop_register(ds,
 517             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 518         error = error ? error : dsl_prop_register(ds,
 519             zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 520         error = error ? error : dsl_prop_register(ds,
 521             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 522         error = error ? error : dsl_prop_register(ds,
 523             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 524         error = error ? error : dsl_prop_register(ds,
 525             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 526         error = error ? error : dsl_prop_register(ds,
 527             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 528         error = error ? error : dsl_prop_register(ds,
 529             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 530             zfsvfs);
 531         error = error ? error : dsl_prop_register(ds,
 532             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 533         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 534         if (error)
 535                 goto unregister;
 536
 537         /*
 538          * Invoke our callbacks to restore temporary mount options.
 539          */
 540         if (do_readonly)
 541                 readonly_changed_cb(zfsvfs, readonly);
 542         if (do_setuid)
 543                 setuid_changed_cb(zfsvfs, setuid);
 544         if (do_exec)
 545                 exec_changed_cb(zfsvfs, exec);
 546         if (do_devices)
 547                 devices_changed_cb(zfsvfs, devices);
 548         if (do_xattr)
 549                 xattr_changed_cb(zfsvfs, xattr);
 550         if (do_atime)
 551                 atime_changed_cb(zfsvfs, atime);
 552
 553         nbmand_changed_cb(zfsvfs, nbmand);
 554
 555         return (0);
 556
 557 unregister:
 558         dsl_prop_unregister_all(ds, zfsvfs);
 559         return (error);
 560 }
 561
 562 static int
 563 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
 564     uint64_t *userp, uint64_t *groupp)
 565 {
 566         /*
 567          * Is it a valid type of object to track?
 568          */
 569         if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 570                 return (SET_ERROR(ENOENT));
 571
 572         /*
 573          * If we have a NULL data pointer
 574          * then assume the id's aren't changing and
 575          * return EEXIST to the dmu to let it know to
 576          * use the same ids
 577          */
 578         if (data == NULL)
 579                 return (SET_ERROR(EEXIST));
 580
 581         if (bonustype == DMU_OT_ZNODE) {
 582                 znode_phys_t *znp = data;
 583                 *userp = znp->zp_uid;
 584                 *groupp = znp->zp_gid;
 585         } else {
 586                 int hdrsize;
 587                 sa_hdr_phys_t *sap = data;
 588                 sa_hdr_phys_t sa = *sap;
 589                 boolean_t swap = B_FALSE;
 590
 591                 ASSERT(bonustype == DMU_OT_SA);
 592
 593                 if (sa.sa_magic == 0) {
 594                         /*
 595                          * This should only happen for newly created
 596                          * files that haven't had the znode data filled
 597                          * in yet.
 598                          */
 599                         *userp = 0;
 600                         *groupp = 0;
 601                         return (0);
 602                 }
 603                 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
 604                         sa.sa_magic = SA_MAGIC;
 605                         sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 606                         swap = B_TRUE;
 607                 } else {
 608                         VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
 609                 }
 610
 611                 hdrsize = sa_hdrsize(&sa);
 612                 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 613                 *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 614                     SA_UID_OFFSET));
 615                 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 616                     SA_GID_OFFSET));
 617                 if (swap) {
 618                         *userp = BSWAP_64(*userp);
 619                         *groupp = BSWAP_64(*groupp);
 620                 }
 621         }
 622         return (0);
 623 }
 624
 625 static void
 626 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
 627     char *domainbuf, int buflen, uid_t *ridp)
 628 {
 629         uint64_t fuid;
 630         const char *domain;
 631
 632         fuid = zfs_strtonum(fuidstr, NULL);
 633
 634         domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 635         if (domain)
 636                 (void) strlcpy(domainbuf, domain, buflen);
 637         else
 638                 domainbuf[0] = '\0';
 639         *ridp = FUID_RID(fuid);
 640 }
 641
 642 static uint64_t
 643 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 644 {
 645         switch (type) {
 646         case ZFS_PROP_USERUSED:
 647                 return (DMU_USERUSED_OBJECT);
 648         case ZFS_PROP_GROUPUSED:
 649                 return (DMU_GROUPUSED_OBJECT);
 650         case ZFS_PROP_USERQUOTA:
 651                 return (zfsvfs->z_userquota_obj);
 652         case ZFS_PROP_GROUPQUOTA:
 653                 return (zfsvfs->z_groupquota_obj);
 654         }
 655         return (0);
 656 }
 657
 658 int
 659 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 660     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 661 {
 662         int error;
 663         zap_cursor_t zc;
 664         zap_attribute_t za;
 665         zfs_useracct_t *buf = vbuf;
 666         uint64_t obj;
 667
 668         if (!dmu_objset_userspace_present(zfsvfs->z_os))
 669                 return (SET_ERROR(ENOTSUP));
 670
 671         obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 672         if (obj == 0) {
 673                 *bufsizep = 0;
 674                 return (0);
 675         }
 676
 677         for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 678             (error = zap_cursor_retrieve(&zc, &za)) == 0;
 679             zap_cursor_advance(&zc)) {
 680                 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 681                     *bufsizep)
 682                         break;
 683
 684                 fuidstr_to_sid(zfsvfs, za.za_name,
 685                     buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 686
 687                 buf->zu_space = za.za_first_integer;
 688                 buf++;
 689         }
 690         if (error == ENOENT)
 691                 error = 0;
 692
 693         ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 694         *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 695         *cookiep = zap_cursor_serialize(&zc);
 696         zap_cursor_fini(&zc);
 697         return (error);
 698 }
 699
 700 /*
 701  * buf must be big enough (eg, 32 bytes)
 702  */
 703 static int
 704 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
 705     char *buf, boolean_t addok)
 706 {
 707         uint64_t fuid;
 708         int domainid = 0;
 709
 710         if (domain && domain[0]) {
 711                 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 712                 if (domainid == -1)
 713                         return (SET_ERROR(ENOENT));
 714         }
 715         fuid = FUID_ENCODE(domainid, rid);
 716         (void) sprintf(buf, "%llx", (longlong_t)fuid);
 717         return (0);
 718 }
 719
 720 int
 721 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 722     const char *domain, uint64_t rid, uint64_t *valp)
 723 {
 724         char buf[32];
 725         int err;
 726         uint64_t obj;
 727
 728         *valp = 0;
 729
 730         if (!dmu_objset_userspace_present(zfsvfs->z_os))
 731                 return (SET_ERROR(ENOTSUP));
 732
 733         obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 734         if (obj == 0)
 735                 return (0);
 736
 737         err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 738         if (err)
 739                 return (err);
 740
 741         err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 742         if (err == ENOENT)
 743                 err = 0;
 744         return (err);
 745 }
 746
 747 int
 748 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
 749     const char *domain, uint64_t rid, uint64_t quota)
 750 {
 751         char buf[32];
 752         int err;
 753         dmu_tx_t *tx;
 754         uint64_t *objp;
 755         boolean_t fuid_dirtied;
 756
 757         if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 758                 return (SET_ERROR(EINVAL));
 759
 760         if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 761                 return (SET_ERROR(ENOTSUP));
 762
 763         objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 764             &zfsvfs->z_groupquota_obj;
 765
 766         err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 767         if (err)
 768                 return (err);
 769         fuid_dirtied = zfsvfs->z_fuid_dirty;
 770
 771         tx = dmu_tx_create(zfsvfs->z_os);
 772         dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 773         if (*objp == 0) {
 774                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 775                     zfs_userquota_prop_prefixes[type]);
 776         }
 777         if (fuid_dirtied)
 778                 zfs_fuid_txhold(zfsvfs, tx);
 779         err = dmu_tx_assign(tx, TXG_WAIT);
 780         if (err) {
 781                 dmu_tx_abort(tx);
 782                 return (err);
 783         }
 784
 785         mutex_enter(&zfsvfs->z_lock);
 786         if (*objp == 0) {
 787                 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 788                     DMU_OT_NONE, 0, tx);
 789                 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 790                     zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 791         }
 792         mutex_exit(&zfsvfs->z_lock);
 793
 794         if (quota == 0) {
 795                 err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 796                 if (err == ENOENT)
 797                         err = 0;
 798         } else {
 799                 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 800         }
 801         ASSERT(err == 0);
 802         if (fuid_dirtied)
 803                 zfs_fuid_sync(zfsvfs, tx);
 804         dmu_tx_commit(tx);
 805         return (err);
 806 }
 807
 808 boolean_t
 809 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 810 {
 811         char buf[32];
 812         uint64_t used, quota, usedobj, quotaobj;
 813         int err;
 814
 815         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 816         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 817
 818         if (quotaobj == 0 || zfsvfs->z_replay)
 819                 return (B_FALSE);
 820
 821         (void) sprintf(buf, "%llx", (longlong_t)fuid);
 822         err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 823         if (err != 0)
 824                 return (B_FALSE);
 825
 826         err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 827         if (err != 0)
 828                 return (B_FALSE);
 829         return (used >= quota);
 830 }
 831
 832 boolean_t
 833 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 834 {
 835         uint64_t fuid;
 836         uint64_t quotaobj;
 837
 838         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 839
 840         fuid = isgroup ? zp->z_gid : zp->z_uid;
 841
 842         if (quotaobj == 0 || zfsvfs->z_replay)
 843                 return (B_FALSE);
 844
 845         return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 846 }
 847
 848 /*
 849  * Associate this zfsvfs with the given objset, which must be owned.
 850  * This will cache a bunch of on-disk state from the objset in the
 851  * zfsvfs.
 852  */
 853 static int
 854 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 855 {
 856         int error;
 857         uint64_t val;
 858
 859         zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 860         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 861         zfsvfs->z_os = os;
 862
 863         error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 864         if (error != 0)
 865                 return (error);
 866         if (zfsvfs->z_version >
 867             zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 868                 (void) printf("Can't mount a version %lld file system "
 869                     "on a version %lld pool\n. Pool must be upgraded to mount "
 870                     "this file system.", (u_longlong_t)zfsvfs->z_version,
 871                     (u_longlong_t)spa_version(dmu_objset_spa(os)));
 872                 return (SET_ERROR(ENOTSUP));
 873         }
 874         error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 875         if (error != 0)
 876                 return (error);
 877         zfsvfs->z_norm = (int)val;
 878
 879         error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 880         if (error != 0)
 881                 return (error);
 882         zfsvfs->z_utf8 = (val != 0);
 883
 884         error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 885         if (error != 0)
 886                 return (error);
 887         zfsvfs->z_case = (uint_t)val;
 888
 889         /*
 890          * Fold case on file systems that are always or sometimes case
 891          * insensitive.
 892          */
 893         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 894             zfsvfs->z_case == ZFS_CASE_MIXED)
 895                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 896
 897         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 898         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 899
 900         uint64_t sa_obj = 0;
 901         if (zfsvfs->z_use_sa) {
 902                 /* should either have both of these objects or none */
 903                 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 904                     &sa_obj);
 905                 if (error != 0)
 906                         return (error);
 907         }
 908
 909         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 910             &zfsvfs->z_attr_table);
 911         if (error != 0)
 912                 return (error);
 913
 914         if (zfsvfs->z_version >= ZPL_VERSION_SA)
 915                 sa_register_update_callback(os, zfs_sa_upgrade);
 916
 917         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 918             &zfsvfs->z_root);
 919         if (error != 0)
 920                 return (error);
 921         ASSERT(zfsvfs->z_root != 0);
 922
 923         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 924             &zfsvfs->z_unlinkedobj);
 925         if (error != 0)
 926                 return (error);
 927
 928         error = zap_lookup(os, MASTER_NODE_OBJ,
 929             zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 930             8, 1, &zfsvfs->z_userquota_obj);
 931         if (error == ENOENT)
 932                 zfsvfs->z_userquota_obj = 0;
 933         else if (error != 0)
 934                 return (error);
 935
 936         error = zap_lookup(os, MASTER_NODE_OBJ,
 937             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 938             8, 1, &zfsvfs->z_groupquota_obj);
 939         if (error == ENOENT)
 940                 zfsvfs->z_groupquota_obj = 0;
 941         else if (error != 0)
 942                 return (error);
 943
 944         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 945             &zfsvfs->z_fuid_obj);
 946         if (error == ENOENT)
 947                 zfsvfs->z_fuid_obj = 0;
 948         else if (error != 0)
 949                 return (error);
 950
 951         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 952             &zfsvfs->z_shares_dir);
 953         if (error == ENOENT)
 954                 zfsvfs->z_shares_dir = 0;
 955         else if (error != 0)
 956                 return (error);
 957
 958         return (0);
 959 }
 960
 961 int
 962 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 963 {
 964         objset_t *os;
 965         zfsvfs_t *zfsvfs;
 966         int error;
 967
 968         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 969
 970         /*
 971          * We claim to always be readonly so we can open snapshots;
 972          * other ZPL code will prevent us from writing to snapshots.
 973          */
 974
 975         error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 976         if (error != 0) {
 977                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 978                 return (error);
 979         }
 980
 981         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 982         if (error != 0) {
 983                 dmu_objset_disown(os, zfsvfs);
 984         }
 985         return (error);
 986 }
 987
 988
 989 int
 990 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 991 {
 992         int error;
 993
 994         zfsvfs->z_vfs = NULL;
 995         zfsvfs->z_parent = zfsvfs;
 996
 997         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 998         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 999         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1000             offsetof(znode_t, z_link_node));
1001         rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1002         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1003         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1004         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1005                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1006
1007         error = zfsvfs_init(zfsvfs, os);
1008         if (error != 0) {
1009                 *zfvp = NULL;
1010                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1011                 return (error);
1012         }
1013
1014         *zfvp = zfsvfs;
1015         return (0);
1016 }
1017
1018 static int
1019 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1020 {
1021         int error;
1022
1023         error = zfs_register_callbacks(zfsvfs->z_vfs);
1024         if (error)
1025                 return (error);
1026
1027         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1028
1029         /*
1030          * If we are not mounting (ie: online recv), then we don't
1031          * have to worry about replaying the log as we blocked all
1032          * operations out since we closed the ZIL.
1033          */
1034         if (mounting) {
1035                 boolean_t readonly;
1036
1037                 /*
1038                  * During replay we remove the read only flag to
1039                  * allow replays to succeed.
1040                  */
1041                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1042                 if (readonly != 0)
1043                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1044                 else
1045                         zfs_unlinked_drain(zfsvfs);
1046
1047                 /*
1048                  * Parse and replay the intent log.
1049                  *
1050                  * Because of ziltest, this must be done after
1051                  * zfs_unlinked_drain().  (Further note: ziltest
1052                  * doesn't use readonly mounts, where
1053                  * zfs_unlinked_drain() isn't called.)  This is because
1054                  * ziltest causes spa_sync() to think it's committed,
1055                  * but actually it is not, so the intent log contains
1056                  * many txg's worth of changes.
1057                  *
1058                  * In particular, if object N is in the unlinked set in
1059                  * the last txg to actually sync, then it could be
1060                  * actually freed in a later txg and then reallocated
1061                  * in a yet later txg.  This would write a "create
1062                  * object N" record to the intent log.  Normally, this
1063                  * would be fine because the spa_sync() would have
1064                  * written out the fact that object N is free, before
1065                  * we could write the "create object N" intent log
1066                  * record.
1067                  *
1068                  * But when we are in ziltest mode, we advance the "open
1069                  * txg" without actually spa_sync()-ing the changes to
1070                  * disk.  So we would see that object N is still
1071                  * allocated and in the unlinked set, and there is an
1072                  * intent log record saying to allocate it.
1073                  */
1074                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1075                         if (zil_replay_disable) {
1076                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1077                         } else {
1078                                 zfsvfs->z_replay = B_TRUE;
1079                                 zil_replay(zfsvfs->z_os, zfsvfs,
1080                                     zfs_replay_vector);
1081                                 zfsvfs->z_replay = B_FALSE;
1082                         }
1083                 }
1084                 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1085         }
1086
1087         /*
1088          * Set the objset user_ptr to track its zfsvfs.
1089          */
1090         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1091         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1092         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1093
1094         return (0);
1095 }
1096
1097 void
1098 zfsvfs_free(zfsvfs_t *zfsvfs)
1099 {
1100         int i;
1101         extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1102
1103         /*
1104          * This is a barrier to prevent the filesystem from going away in
1105          * zfs_znode_move() until we can safely ensure that the filesystem is
1106          * not unmounted. We consider the filesystem valid before the barrier
1107          * and invalid after the barrier.
1108          */
1109         rw_enter(&zfsvfs_lock, RW_READER);
1110         rw_exit(&zfsvfs_lock);
1111
1112         zfs_fuid_destroy(zfsvfs);
1113
1114         mutex_destroy(&zfsvfs->z_znodes_lock);
1115         mutex_destroy(&zfsvfs->z_lock);
1116         list_destroy(&zfsvfs->z_all_znodes);
1117         rrm_destroy(&zfsvfs->z_teardown_lock);
1118         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1119         rw_destroy(&zfsvfs->z_fuid_lock);
1120         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1121                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1122         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1123 }
1124
1125 static void
1126 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1127 {
1128         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1129         if (zfsvfs->z_vfs) {
1130                 if (zfsvfs->z_use_fuids) {
1131                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1132                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1133                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1134                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1135                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1136                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1137                 } else {
1138                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1139                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1140                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1141                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1142                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1143                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1144                 }
1145         }
1146         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1147 }
1148
1149 static int
1150 zfs_domount(vfs_t *vfsp, char *osname)
1151 {
1152         dev_t mount_dev;
1153         uint64_t recordsize, fsid_guid;
1154         int error = 0;
1155         zfsvfs_t *zfsvfs;
1156
1157         ASSERT(vfsp);
1158         ASSERT(osname);
1159
1160         error = zfsvfs_create(osname, &zfsvfs);
1161         if (error)
1162                 return (error);
1163         zfsvfs->z_vfs = vfsp;
1164
1165         /* Initialize the generic filesystem structure. */
1166         vfsp->vfs_bcount = 0;
1167         vfsp->vfs_data = NULL;
1168
1169         if (zfs_create_unique_device(&mount_dev) == -1) {
1170                 error = SET_ERROR(ENODEV);
1171                 goto out;
1172         }
1173         ASSERT(vfs_devismounted(mount_dev) == 0);
1174
1175         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1176             NULL))
1177                 goto out;
1178
1179         vfsp->vfs_dev = mount_dev;
1180         vfsp->vfs_fstype = zfsfstype;
1181         vfsp->vfs_bsize = recordsize;
1182         vfsp->vfs_flag |= VFS_NOTRUNC;
1183         vfsp->vfs_data = zfsvfs;
1184
1185         /*
1186          * The fsid is 64 bits, composed of an 8-bit fs type, which
1187          * separates our fsid from any other filesystem types, and a
1188          * 56-bit objset unique ID.  The objset unique ID is unique to
1189          * all objsets open on this system, provided by unique_create().
1190          * The 8-bit fs type must be put in the low bits of fsid[1]
1191          * because that's where other Solaris filesystems put it.
1192          */
1193         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1194         ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1195         vfsp->vfs_fsid.val[0] = fsid_guid;
1196         vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1197             zfsfstype & 0xFF;
1198
1199         /*
1200          * Set features for file system.
1201          */
1202         zfs_set_fuid_feature(zfsvfs);
1203         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1204                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1205                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1206                 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1207         } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1208                 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1209                 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1210         }
1211         vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1212
1213         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1214                 uint64_t pval;
1215
1216                 atime_changed_cb(zfsvfs, B_FALSE);
1217                 readonly_changed_cb(zfsvfs, B_TRUE);
1218                 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1219                         goto out;
1220                 xattr_changed_cb(zfsvfs, pval);
1221                 zfsvfs->z_issnap = B_TRUE;
1222                 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1223
1224                 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1225                 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1226                 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1227         } else {
1228                 error = zfsvfs_setup(zfsvfs, B_TRUE);
1229         }
1230
1231         if (!zfsvfs->z_issnap)
1232                 zfsctl_create(zfsvfs);
1233 out:
1234         if (error) {
1235                 dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1236                 zfsvfs_free(zfsvfs);
1237         } else {
1238                 atomic_inc_32(&zfs_active_fs_count);
1239         }
1240
1241         return (error);
1242 }
1243
1244 void
1245 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1246 {
1247         objset_t *os = zfsvfs->z_os;
1248
1249         if (!dmu_objset_is_snapshot(os))
1250                 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1251 }
1252
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * On success 0 is returned and the value is stored in *objnum (an
 * empty string yields 0).  EINVAL is returned if a non-digit character
 * is encountered, and EOVERFLOW if the value does not fit in 64 bits;
 * *objnum is left untouched in both error cases.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
        const uint64_t u64_max = ~(uint64_t)0;
        uint64_t num = 0;

        for (; *str != '\0'; str++) {
                uint64_t digit;

                if (*str < '0' || *str > '9')
                        return (SET_ERROR(EINVAL));
                digit = (uint64_t)(*str - '0');

                /*
                 * num * 10 + digit would wrap silently; reject the
                 * string instead of producing a bogus number.
                 */
                if (num > (u64_max - digit) / 10)
                        return (SET_ERROR(EOVERFLOW));

                num = num * 10 + digit;
        }

        *objnum = num;
        return (0);
}
1271
1272 /*
1273  * The boot path passed from the boot loader is in the form of
1274  * "rootpool-name/root-filesystem-object-number'. Convert this
1275  * string to a dataset name: "rootpool-name/root-filesystem-name".
1276  */
1277 static int
1278 zfs_parse_bootfs(char *bpath, char *outpath)
1279 {
1280         char *slashp;
1281         uint64_t objnum;
1282         int error;
1283
1284         if (*bpath == 0 || *bpath == '/')
1285                 return (SET_ERROR(EINVAL));
1286
1287         (void) strcpy(outpath, bpath);
1288
1289         slashp = strchr(bpath, '/');
1290
1291         /* if no '/', just return the pool name */
1292         if (slashp == NULL) {
1293                 return (0);
1294         }
1295
1296         /* if not a number, just return the root dataset name */
1297         if (str_to_uint64(slashp+1, &objnum)) {
1298                 return (0);
1299         }
1300
1301         *slashp = '\0';
1302         error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
1303         *slashp = '/';
1304
1305         return (error);
1306 }
1307
1308 static int
1309 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1310 {
1311         int error = 0;
1312         static int zfsrootdone = 0;
1313         zfsvfs_t *zfsvfs = NULL;
1314         znode_t *zp = NULL;
1315         vnode_t *vp = NULL;
1316         char *zfs_bootfs;
1317         char *zfs_devid;
1318
1319         ASSERT(vfsp);
1320
1321         /*
1322          * The filesystem that we mount as root is defined in the
1323          * boot property "zfs-bootfs" with a format of
1324          * "poolname/root-dataset-objnum".
1325          */
1326         if (why == ROOT_INIT) {
1327                 if (zfsrootdone++)
1328                         return (SET_ERROR(EBUSY));
1329                 /*
1330                  * the process of doing a spa_load will require the
1331                  * clock to be set before we could (for example) do
1332                  * something better by looking at the timestamp on
1333                  * an uberblock, so just set it to -1.
1334                  */
1335                 clkset(-1);
1336
1337                 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1338                         cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1339                             "bootfs name");
1340                         return (SET_ERROR(EINVAL));
1341                 }
1342                 zfs_devid = spa_get_bootprop("diskdevid");
1343                 error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1344                 if (zfs_devid)
1345                         spa_free_bootprop(zfs_devid);
1346                 if (error) {
1347                         spa_free_bootprop(zfs_bootfs);
1348                         cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1349                             error);
1350                         return (error);
1351                 }
1352                 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1353                         spa_free_bootprop(zfs_bootfs);
1354                         cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1355                             error);
1356                         return (error);
1357                 }
1358
1359                 spa_free_bootprop(zfs_bootfs);
1360
1361                 if (error = vfs_lock(vfsp))
1362                         return (error);
1363
1364                 if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1365                         cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1366                         goto out;
1367                 }
1368
1369                 zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1370                 ASSERT(zfsvfs);
1371                 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1372                         cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1373                         goto out;
1374                 }
1375
1376                 vp = ZTOV(zp);
1377                 mutex_enter(&vp->v_lock);
1378                 vp->v_flag |= VROOT;
1379                 mutex_exit(&vp->v_lock);
1380                 rootvp = vp;
1381
1382                 /*
1383                  * Leave rootvp held.  The root file system is never unmounted.
1384                  */
1385
1386                 vfs_add(NULL, vfsp,
1387                     (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1388 out:
1389                 vfs_unlock(vfsp);
1390                 return (error);
1391         } else if (why == ROOT_REMOUNT) {
1392                 readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1393                 vfsp->vfs_flag |= VFS_REMOUNT;
1394
1395                 /* refresh mount options */
1396                 zfs_unregister_callbacks(vfsp->vfs_data);
1397                 return (zfs_register_callbacks(vfsp));
1398
1399         } else if (why == ROOT_UNMOUNT) {
1400                 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1401                 (void) zfs_sync(vfsp, 0, 0);
1402                 return (0);
1403         }
1404
1405         /*
1406          * if "why" is equal to anything else other than ROOT_INIT,
1407          * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1408          */
1409         return (SET_ERROR(ENOTSUP));
1410 }
1411
1412 /*ARGSUSED*/
1413 static int
1414 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1415 {
1416         char            *osname;
1417         pathname_t      spn;
1418         int             error = 0;
1419         uio_seg_t       fromspace = (uap->flags & MS_SYSSPACE) ?
1420             UIO_SYSSPACE : UIO_USERSPACE;
1421         int             canwrite;
1422
1423         if (mvp->v_type != VDIR)
1424                 return (SET_ERROR(ENOTDIR));
1425
1426         mutex_enter(&mvp->v_lock);
1427         if ((uap->flags & MS_REMOUNT) == 0 &&
1428             (uap->flags & MS_OVERLAY) == 0 &&
1429             (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1430                 mutex_exit(&mvp->v_lock);
1431                 return (SET_ERROR(EBUSY));
1432         }
1433         mutex_exit(&mvp->v_lock);
1434
1435         /*
1436          * ZFS does not support passing unparsed data in via MS_DATA.
1437          * Users should use the MS_OPTIONSTR interface; this means
1438          * that all option parsing is already done and the options struct
1439          * can be interrogated.
1440          */
1441         if ((uap->flags & MS_DATA) && uap->datalen > 0)
1442                 return (SET_ERROR(EINVAL));
1443
1444         /*
1445          * Get the objset name (the "special" mount argument).
1446          */
1447         if (error = pn_get(uap->spec, fromspace, &spn))
1448                 return (error);
1449
1450         osname = spn.pn_path;
1451
1452         /*
1453          * Check for mount privilege?
1454          *
1455          * If we don't have privilege then see if
1456          * we have local permission to allow it
1457          */
1458         error = secpolicy_fs_mount(cr, mvp, vfsp);
1459         if (error) {
1460                 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1461                         vattr_t         vattr;
1462
1463                         /*
1464                          * Make sure user is the owner of the mount point
1465                          * or has sufficient privileges.
1466                          */
1467
1468                         vattr.va_mask = AT_UID;
1469
1470                         if (fop_getattr(mvp, &vattr, 0, cr, NULL)) {
1471                                 goto out;
1472                         }
1473
1474                         if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1475                             fop_access(mvp, VWRITE, 0, cr, NULL) != 0) {
1476                                 goto out;
1477                         }
1478                         secpolicy_fs_mount_clearopts(cr, vfsp);
1479                 } else {
1480                         goto out;
1481                 }
1482         }
1483
1484         /*
1485          * Refuse to mount a filesystem if we are in a local zone and the
1486          * dataset is not visible.
1487          */
1488         if (!INGLOBALZONE(curproc) &&
1489             (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1490                 error = SET_ERROR(EPERM);
1491                 goto out;
1492         }
1493
1494         /*
1495          * When doing a remount, we simply refresh our temporary properties
1496          * according to those options set in the current VFS options.
1497          */
1498         if (uap->flags & MS_REMOUNT) {
1499                 /* refresh mount options */
1500                 zfs_unregister_callbacks(vfsp->vfs_data);
1501                 error = zfs_register_callbacks(vfsp);
1502                 goto out;
1503         }
1504
1505         error = zfs_domount(vfsp, osname);
1506
1507         /*
1508          * Add an extra VFS_HOLD on our parent vfs so that it can't
1509          * disappear due to a forced unmount.
1510          */
1511         if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1512                 VFS_HOLD(mvp->v_vfsp);
1513
1514 out:
1515         pn_free(&spn);
1516         return (error);
1517 }
1518
1519 static int
1520 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1521 {
1522         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1523         dev32_t d32;
1524         uint64_t refdbytes, availbytes, usedobjs, availobjs;
1525
1526         ZFS_ENTER(zfsvfs);
1527
1528         dmu_objset_space(zfsvfs->z_os,
1529             &refdbytes, &availbytes, &usedobjs, &availobjs);
1530
1531         /*
1532          * The underlying storage pool actually uses multiple block sizes.
1533          * We report the fragsize as the smallest block size we support,
1534          * and we report our blocksize as the filesystem's maximum blocksize.
1535          */
1536         statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1537         statp->f_bsize = zfsvfs->z_max_blksz;
1538
1539         /*
1540          * The following report "total" blocks of various kinds in the
1541          * file system, but reported in terms of f_frsize - the
1542          * "fragment" size.
1543          */
1544
1545         statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1546         statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1547         statp->f_bavail = statp->f_bfree; /* no root reservation */
1548
1549         /*
1550          * statvfs() should really be called statufs(), because it assumes
1551          * static metadata.  ZFS doesn't preallocate files, so the best
1552          * we can do is report the max that could possibly fit in f_files,
1553          * and that minus the number actually used in f_ffree.
1554          * For f_ffree, report the smaller of the number of object available
1555          * and the number of blocks (each object will take at least a block).
1556          */
1557         statp->f_ffree = MIN(availobjs, statp->f_bfree);
1558         statp->f_favail = statp->f_ffree;       /* no "root reservation" */
1559         statp->f_files = statp->f_ffree + usedobjs;
1560
1561         (void) cmpldev(&d32, vfsp->vfs_dev);
1562         statp->f_fsid = d32;
1563
1564         /*
1565          * We're a zfs filesystem.
1566          */
1567         (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1568
1569         statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1570
1571         statp->f_namemax = MAXNAMELEN - 1;
1572
1573         /*
1574          * We have all of 32 characters to stuff a string here.
1575          * Is there anything useful we could/should provide?
1576          */
1577         bzero(statp->f_fstr, sizeof (statp->f_fstr));
1578
1579         ZFS_EXIT(zfsvfs);
1580         return (0);
1581 }
1582
1583 static int
1584 zfs_root(vfs_t *vfsp, vnode_t **vpp)
1585 {
1586         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1587         znode_t *rootzp;
1588         int error;
1589
1590         ZFS_ENTER(zfsvfs);
1591
1592         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1593         if (error == 0)
1594                 *vpp = ZTOV(rootzp);
1595
1596         ZFS_EXIT(zfsvfs);
1597         return (error);
1598 }
1599
1600 /*
1601  * Teardown the zfsvfs::z_os.
1602  *
1603  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1604  * and 'z_teardown_inactive_lock' held.
1605  */
1606 static int
1607 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1608 {
1609         znode_t *zp;
1610
1611         rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1612
1613         if (!unmounting) {
1614                 /*
1615                  * We purge the parent filesystem's vfsp as the parent
1616                  * filesystem and all of its snapshots have their vnode's
1617                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
1618                  * 'z_parent' is self referential for non-snapshots.
1619                  */
1620                 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1621         }
1622
1623         /*
1624          * Close the zil. NB: Can't close the zil while zfs_inactive
1625          * threads are blocked as zil_close can call zfs_inactive.
1626          */
1627         if (zfsvfs->z_log) {
1628                 zil_close(zfsvfs->z_log);
1629                 zfsvfs->z_log = NULL;
1630         }
1631
1632         rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1633
1634         /*
1635          * If we are not unmounting (ie: online recv) and someone already
1636          * unmounted this file system while we were doing the switcheroo,
1637          * or a reopen of z_os failed then just bail out now.
1638          */
1639         if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1640                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1641                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1642                 return (SET_ERROR(EIO));
1643         }
1644
1645         /*
1646          * At this point there are no vops active, and any new vops will
1647          * fail with EIO since we have z_teardown_lock for writer (only
1648          * relavent for forced unmount).
1649          *
1650          * Release all holds on dbufs.
1651          */
1652         mutex_enter(&zfsvfs->z_znodes_lock);
1653         for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1654             zp = list_next(&zfsvfs->z_all_znodes, zp))
1655                 if (zp->z_sa_hdl) {
1656                         ASSERT(ZTOV(zp)->v_count > 0);
1657                         zfs_znode_dmu_fini(zp);
1658                 }
1659         mutex_exit(&zfsvfs->z_znodes_lock);
1660
1661         /*
1662          * If we are unmounting, set the unmounted flag and let new vops
1663          * unblock.  zfs_inactive will have the unmounted behavior, and all
1664          * other vops will fail with EIO.
1665          */
1666         if (unmounting) {
1667                 zfsvfs->z_unmounted = B_TRUE;
1668                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1669                 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1670         }
1671
1672         /*
1673          * z_os will be NULL if there was an error in attempting to reopen
1674          * zfsvfs, so just return as the properties had already been
1675          * unregistered and cached data had been evicted before.
1676          */
1677         if (zfsvfs->z_os == NULL)
1678                 return (0);
1679
1680         /*
1681          * Unregister properties.
1682          */
1683         zfs_unregister_callbacks(zfsvfs);
1684
1685         /*
1686          * Evict cached data
1687          */
1688         if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
1689             !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1690                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1691         dmu_objset_evict_dbufs(zfsvfs->z_os);
1692
1693         return (0);
1694 }
1695
1696 /*ARGSUSED*/
1697 static int
1698 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1699 {
1700         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1701         objset_t *os;
1702         int ret;
1703
1704         ret = secpolicy_fs_unmount(cr, vfsp);
1705         if (ret) {
1706                 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1707                     ZFS_DELEG_PERM_MOUNT, cr))
1708                         return (ret);
1709         }
1710
1711         /*
1712          * We purge the parent filesystem's vfsp as the parent filesystem
1713          * and all of its snapshots have their vnode's v_vfsp set to the
1714          * parent's filesystem's vfsp.  Note, 'z_parent' is self
1715          * referential for non-snapshots.
1716          */
1717         (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1718
1719         /*
1720          * Unmount any snapshots mounted under .zfs before unmounting the
1721          * dataset itself.
1722          */
1723         if (zfsvfs->z_ctldir != NULL &&
1724             (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1725                 return (ret);
1726         }
1727
1728         if (!(fflag & MS_FORCE)) {
1729                 /*
1730                  * Check the number of active vnodes in the file system.
1731                  * Our count is maintained in the vfs structure, but the
1732                  * number is off by 1 to indicate a hold on the vfs
1733                  * structure itself.
1734                  *
1735                  * The '.zfs' directory maintains a reference of its
1736                  * own, and any active references underneath are
1737                  * reflected in the vnode count.
1738                  */
1739                 if (zfsvfs->z_ctldir == NULL) {
1740                         if (vfsp->vfs_count > 1)
1741                                 return (SET_ERROR(EBUSY));
1742                 } else {
1743                         if (vfsp->vfs_count > 2 ||
1744                             zfsvfs->z_ctldir->v_count > 1)
1745                                 return (SET_ERROR(EBUSY));
1746                 }
1747         }
1748
1749         vfsp->vfs_flag |= VFS_UNMOUNTED;
1750
1751         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1752         os = zfsvfs->z_os;
1753
1754         /*
1755          * z_os will be NULL if there was an error in
1756          * attempting to reopen zfsvfs.
1757          */
1758         if (os != NULL) {
1759                 /*
1760                  * Unset the objset user_ptr.
1761                  */
1762                 mutex_enter(&os->os_user_ptr_lock);
1763                 dmu_objset_set_user(os, NULL);
1764                 mutex_exit(&os->os_user_ptr_lock);
1765
1766                 /*
1767                  * Finally release the objset
1768                  */
1769                 dmu_objset_disown(os, zfsvfs);
1770         }
1771
1772         /*
1773          * We can now safely destroy the '.zfs' directory node.
1774          */
1775         if (zfsvfs->z_ctldir != NULL)
1776                 zfsctl_destroy(zfsvfs);
1777
1778         return (0);
1779 }
1780
1781 static int
1782 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1783 {
1784         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1785         znode_t         *zp;
1786         uint64_t        object = 0;
1787         uint64_t        fid_gen = 0;
1788         uint64_t        gen_mask;
1789         uint64_t        zp_gen;
1790         int             i, err;
1791
1792         *vpp = NULL;
1793
1794         ZFS_ENTER(zfsvfs);
1795
1796         if (fidp->fid_len == LONG_FID_LEN) {
1797                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1798                 uint64_t        objsetid = 0;
1799                 uint64_t        setgen = 0;
1800
1801                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1802                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1803
1804                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1805                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1806
1807                 ZFS_EXIT(zfsvfs);
1808
1809                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1810                 if (err)
1811                         return (SET_ERROR(EINVAL));
1812                 ZFS_ENTER(zfsvfs);
1813         }
1814
1815         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1816                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
1817
1818                 for (i = 0; i < sizeof (zfid->zf_object); i++)
1819                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1820
1821                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1822                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1823         } else {
1824                 ZFS_EXIT(zfsvfs);
1825                 return (SET_ERROR(EINVAL));
1826         }
1827
1828         /* A zero fid_gen means we are in the .zfs control directories */
1829         if (fid_gen == 0 &&
1830             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1831                 *vpp = zfsvfs->z_ctldir;
1832                 ASSERT(*vpp != NULL);
1833                 if (object == ZFSCTL_INO_SNAPDIR) {
1834                         VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1835                             0, NULL, NULL, NULL, NULL, NULL) == 0);
1836                 } else {
1837                         VN_HOLD(*vpp);
1838                 }
1839                 ZFS_EXIT(zfsvfs);
1840                 return (0);
1841         }
1842
1843         gen_mask = -1ULL >> (64 - 8 * i);
1844
1845         dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1846         if (err = zfs_zget(zfsvfs, object, &zp)) {
1847                 ZFS_EXIT(zfsvfs);
1848                 return (err);
1849         }
1850         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1851             sizeof (uint64_t));
1852         zp_gen = zp_gen & gen_mask;
1853         if (zp_gen == 0)
1854                 zp_gen = 1;
1855         if (zp->z_unlinked || zp_gen != fid_gen) {
1856                 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1857                 VN_RELE(ZTOV(zp));
1858                 ZFS_EXIT(zfsvfs);
1859                 return (SET_ERROR(EINVAL));
1860         }
1861
1862         *vpp = ZTOV(zp);
1863         ZFS_EXIT(zfsvfs);
1864         return (0);
1865 }
1866
1867 /*
1868  * Block out VOPs and close zfsvfs_t::z_os
1869  *
1870  * Note, if successful, then we return with the 'z_teardown_lock' and
1871  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1872  * dataset and objset intact so that they can be atomically handed off during
1873  * a subsequent rollback or recv operation and the resume thereafter.
1874  */
1875 int
1876 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1877 {
1878         int error;
1879
1880         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1881                 return (error);
1882
1883         return (0);
1884 }
1885
1886 /*
1887  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1888  * is an invariant across any of the operations that can be performed while the
1889  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1890  * are the same: the relevant objset and associated dataset are owned by
1891  * zfsvfs, held, and long held on entry.
1892  */
1893 int
1894 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1895 {
1896         int err;
1897         znode_t *zp;
1898
1899         ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
1900         ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1901
1902         /*
1903          * We already own this, so just update the objset_t, as the one we
1904          * had before may have been evicted.
1905          */
1906         objset_t *os;
1907         VERIFY3P(ds->ds_owner, ==, zfsvfs);
1908         VERIFY(dsl_dataset_long_held(ds));
1909         VERIFY0(dmu_objset_from_ds(ds, &os));
1910
1911         err = zfsvfs_init(zfsvfs, os);
1912         if (err != 0)
1913                 goto bail;
1914
1915         VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1916
1917         zfs_set_fuid_feature(zfsvfs);
1918
1919         /*
1920          * Attempt to re-establish all the active znodes with
1921          * their dbufs.  If a zfs_rezget() fails, then we'll let
1922          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1923          * when they try to use their znode.
1924          */
1925         mutex_enter(&zfsvfs->z_znodes_lock);
1926         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1927             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1928                 (void) zfs_rezget(zp);
1929         }
1930         mutex_exit(&zfsvfs->z_znodes_lock);
1931
1932 bail:
1933         /* release the VOPs */
1934         rw_exit(&zfsvfs->z_teardown_inactive_lock);
1935         rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1936
1937         if (err) {
1938                 /*
1939                  * Since we couldn't setup the sa framework, try to force
1940                  * unmount this file system.
1941                  */
1942                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1943                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
1944         }
1945         return (err);
1946 }
1947
1948 static void
1949 zfs_freevfs(vfs_t *vfsp)
1950 {
1951         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1952
1953         /*
1954          * If this is a snapshot, we have an extra VFS_HOLD on our parent
1955          * from zfs_mount().  Release it here.  If we came through
1956          * zfs_mountroot() instead, we didn't grab an extra hold, so
1957          * skip the VFS_RELE for rootvfs.
1958          */
1959         if (zfsvfs->z_issnap && (vfsp != rootvfs))
1960                 VFS_RELE(zfsvfs->z_parent->z_vfs);
1961
1962         zfsvfs_free(zfsvfs);
1963
1964         atomic_dec_32(&zfs_active_fs_count);
1965 }
1966
1967 /*
1968  * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
1969  * so we can't safely do any non-idempotent initialization here.
1970  * Leave that to zfs_init() and zfs_fini(), which are called
1971  * from the module's _init() and _fini() entry points.
1972  */
1973 /*ARGSUSED*/
1974 static int
1975 zfs_vfsinit(int fstype, char *name)
1976 {
1977         int error;
1978
1979         zfsfstype = fstype;
1980
1981         /*
1982          * Setup vfsops and vnodeops tables.
1983          */
1984         error = vfs_setfsops(fstype, &zfs_vfsops);
1985         if (error != 0) {
1986                 cmn_err(CE_WARN, "zfs: bad fstype");
1987         }
1988
1989         error = zfs_create_op_tables();
1990         if (error) {
1991                 zfs_remove_op_tables();
1992                 cmn_err(CE_WARN, "zfs: bad vnode ops template");
1993                 (void) vfs_freevfsops_by_type(zfsfstype);
1994                 return (error);
1995         }
1996
1997         mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1998
1999         /*
2000          * Unique major number for all zfs mounts.
2001          * If we run out of 32-bit minors, we'll getudev() another major.
2002          */
2003         zfs_major = ddi_name_to_major(ZFS_DRIVER);
2004         zfs_minor = ZFS_MIN_MINOR;
2005
2006         return (0);
2007 }
2008
2009 void
2010 zfs_init(void)
2011 {
2012         /*
2013          * Initialize .zfs directory structures
2014          */
2015         zfsctl_init();
2016
2017         /*
2018          * Initialize znode cache, vnode ops, etc...
2019          */
2020         zfs_znode_init();
2021
2022         dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2023 }
2024
/*
 * Module-level teardown: finalize the '.zfs' directory support and the
 * znode layer.
 */
void
zfs_fini(void)
{
        /* Finalize the '.zfs' directory support first ... */
        zfsctl_fini();

        /* ... then the znode layer. */
        zfs_znode_fini();
}
2031
2032 int
2033 zfs_busy(void)
2034 {
2035         return (zfs_active_fs_count != 0);
2036 }
2037
2038 int
2039 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2040 {
2041         int error;
2042         objset_t *os = zfsvfs->z_os;
2043         dmu_tx_t *tx;
2044
2045         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2046                 return (SET_ERROR(EINVAL));
2047
2048         if (newvers < zfsvfs->z_version)
2049                 return (SET_ERROR(EINVAL));
2050
2051         if (zfs_spa_version_map(newvers) >
2052             spa_version(dmu_objset_spa(zfsvfs->z_os)))
2053                 return (SET_ERROR(ENOTSUP));
2054
2055         tx = dmu_tx_create(os);
2056         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2057         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2058                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2059                     ZFS_SA_ATTRS);
2060                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2061         }
2062         error = dmu_tx_assign(tx, TXG_WAIT);
2063         if (error) {
2064                 dmu_tx_abort(tx);
2065                 return (error);
2066         }
2067
2068         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2069             8, 1, &newvers, tx);
2070
2071         if (error) {
2072                 dmu_tx_commit(tx);
2073                 return (error);
2074         }
2075
2076         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2077                 uint64_t sa_obj;
2078
2079                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2080                     SPA_VERSION_SA);
2081                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2082                     DMU_OT_NONE, 0, tx);
2083
2084                 error = zap_add(os, MASTER_NODE_OBJ,
2085                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2086                 ASSERT0(error);
2087
2088                 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2089                 sa_register_update_callback(os, zfs_sa_upgrade);
2090         }
2091
2092         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2093             "from %llu to %llu", zfsvfs->z_version, newvers);
2094
2095         dmu_tx_commit(tx);
2096
2097         zfsvfs->z_version = newvers;
2098
2099         zfs_set_fuid_feature(zfsvfs);
2100
2101         return (0);
2102 }
2103
2104 /*
2105  * Read a property stored within the master node.
2106  */
2107 int
2108 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2109 {
2110         const char *pname;
2111         int error = ENOENT;
2112
2113         /*
2114          * Look up the file system's value for the property.  For the
2115          * version property, we look up a slightly different string.
2116          */
2117         if (prop == ZFS_PROP_VERSION)
2118                 pname = ZPL_VERSION_STR;
2119         else
2120                 pname = zfs_prop_to_name(prop);
2121
2122         if (os != NULL) {
2123                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2124                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2125         }
2126
2127         if (error == ENOENT) {
2128                 /* No value set, use the default value */
2129                 switch (prop) {
2130                 case ZFS_PROP_VERSION:
2131                         *value = ZPL_VERSION;
2132                         break;
2133                 case ZFS_PROP_NORMALIZE:
2134                 case ZFS_PROP_UTF8ONLY:
2135                         *value = 0;
2136                         break;
2137                 case ZFS_PROP_CASE:
2138                         *value = ZFS_CASE_SENSITIVE;
2139                         break;
2140                 default:
2141                         return (error);
2142                 }
2143                 error = 0;
2144         }
2145         return (error);
2146 }
2147
2148 /*
2149  * Return true if the coresponding vfs's unmounted flag is set.
2150  * Otherwise return false.
2151  * If this function returns true we know VFS unmount has been initiated.
2152  */
2153 boolean_t
2154 zfs_get_vfs_flag_unmounted(objset_t *os)
2155 {
2156         zfsvfs_t *zfvp;
2157         boolean_t unmounted = B_FALSE;
2158
2159         ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2160
2161         mutex_enter(&os->os_user_ptr_lock);
2162         zfvp = dmu_objset_get_user(os);
2163         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2164             (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))
2165                 unmounted = B_TRUE;
2166         mutex_exit(&os->os_user_ptr_lock);
2167
2168         return (unmounted);
2169 }
2170
2171 static vfsdef_t vfw = {
2172         VFSDEF_VERSION,
2173         MNTTYPE_ZFS,
2174         zfs_vfsinit,
2175         VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
2176             VSW_XID|VSW_ZMOUNT,
2177         &zfs_mntopts
2178 };
2179
2180 struct modlfs zfs_modlfs = {
2181         &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
2182 };