FreeBSD: Fix a pair of bugs in zfs_fhtovp()
[zfs.git] / module / os / freebsd / zfs / zfs_vfsops.c
blob4cb7f63b5230ff01ce0c379d647c1b58eb9f6426
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
30 /* Portions Copyright 2010 Robert Milkowski */
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <ufs/ufs/quota.h>
67 #include <sys/zfs_quota.h>
69 #include "zfs_comutil.h"
71 #ifndef MNTK_VMSETSIZE_BUG
72 #define MNTK_VMSETSIZE_BUG 0
73 #endif
74 #ifndef MNTK_NOMSYNC
75 #define MNTK_NOMSYNC 8
76 #endif
78 struct mtx zfs_debug_mtx;
79 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
83 int zfs_super_owner;
84 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
85 "File system owners can perform privileged operation on file systems");
87 int zfs_debug_level;
88 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
89 "Debug level");
91 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
92 static int zfs_version_acl = ZFS_ACL_VERSION;
93 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
94 "ZFS_ACL_VERSION");
95 static int zfs_version_spa = SPA_VERSION;
96 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
97 "SPA_VERSION");
98 static int zfs_version_zpl = ZPL_VERSION;
99 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
100 "ZPL_VERSION");
102 #if __FreeBSD_version >= 1400018
103 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
104 bool *mp_busy);
105 #else
106 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
107 #endif
108 static int zfs_mount(vfs_t *vfsp);
109 static int zfs_umount(vfs_t *vfsp, int fflag);
110 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
111 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
112 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
113 static int zfs_sync(vfs_t *vfsp, int waitfor);
114 #if __FreeBSD_version >= 1300098
115 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
116 struct ucred **credanonp, int *numsecflavors, int *secflavors);
117 #else
118 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
119 struct ucred **credanonp, int *numsecflavors, int **secflavors);
120 #endif
121 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
122 static void zfs_freevfs(vfs_t *vfsp);
124 struct vfsops zfs_vfsops = {
125 .vfs_mount = zfs_mount,
126 .vfs_unmount = zfs_umount,
127 #if __FreeBSD_version >= 1300049
128 .vfs_root = vfs_cache_root,
129 .vfs_cachedroot = zfs_root,
130 #else
131 .vfs_root = zfs_root,
132 #endif
133 .vfs_statfs = zfs_statfs,
134 .vfs_vget = zfs_vget,
135 .vfs_sync = zfs_sync,
136 .vfs_checkexp = zfs_checkexp,
137 .vfs_fhtovp = zfs_fhtovp,
138 .vfs_quotactl = zfs_quotactl,
141 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
144 * We need to keep a count of active fs's.
145 * This is necessary to prevent our module
146 * from being unloaded after a umount -f
148 static uint32_t zfs_active_fs_count = 0;
151 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
152 char *setpoint)
154 int error;
155 zfsvfs_t *zfvp;
156 vfs_t *vfsp;
157 objset_t *os;
158 uint64_t tmp = *val;
160 error = dmu_objset_from_ds(ds, &os);
161 if (error != 0)
162 return (error);
164 error = getzfsvfs_impl(os, &zfvp);
165 if (error != 0)
166 return (error);
167 if (zfvp == NULL)
168 return (ENOENT);
169 vfsp = zfvp->z_vfs;
170 switch (zfs_prop) {
171 case ZFS_PROP_ATIME:
172 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
173 tmp = 0;
174 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
175 tmp = 1;
176 break;
177 case ZFS_PROP_DEVICES:
178 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
179 tmp = 0;
180 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
181 tmp = 1;
182 break;
183 case ZFS_PROP_EXEC:
184 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
185 tmp = 0;
186 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
187 tmp = 1;
188 break;
189 case ZFS_PROP_SETUID:
190 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
191 tmp = 0;
192 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
193 tmp = 1;
194 break;
195 case ZFS_PROP_READONLY:
196 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
197 tmp = 0;
198 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
199 tmp = 1;
200 break;
201 case ZFS_PROP_XATTR:
202 if (zfvp->z_flags & ZSB_XATTR)
203 tmp = zfvp->z_xattr;
204 break;
205 case ZFS_PROP_NBMAND:
206 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
207 tmp = 0;
208 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
209 tmp = 1;
210 break;
211 default:
212 vfs_unbusy(vfsp);
213 return (ENOENT);
216 vfs_unbusy(vfsp);
217 if (tmp != *val) {
218 (void) strcpy(setpoint, "temporary");
219 *val = tmp;
221 return (0);
224 static int
225 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
227 int error = 0;
228 char buf[32];
229 uint64_t usedobj, quotaobj;
230 uint64_t quota, used = 0;
231 timespec_t now;
233 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
234 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
236 if (quotaobj == 0 || zfsvfs->z_replay) {
237 error = ENOENT;
238 goto done;
240 (void) sprintf(buf, "%llx", (longlong_t)id);
241 if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
242 buf, sizeof (quota), 1, &quota)) != 0) {
243 dprintf("%s(%d): quotaobj lookup failed\n",
244 __FUNCTION__, __LINE__);
245 goto done;
248 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
249 * So we set them to be the same.
251 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
252 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
253 if (error && error != ENOENT) {
254 dprintf("%s(%d): usedobj failed; %d\n",
255 __FUNCTION__, __LINE__, error);
256 goto done;
258 dqp->dqb_curblocks = btodb(used);
259 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
260 vfs_timestamp(&now);
262 * Setting this to 0 causes FreeBSD quota(8) to print
263 * the number of days since the epoch, which isn't
264 * particularly useful.
266 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
267 done:
268 return (error);
271 static int
272 #if __FreeBSD_version >= 1400018
273 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
274 #else
275 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
276 #endif
278 zfsvfs_t *zfsvfs = vfsp->vfs_data;
279 struct thread *td;
280 int cmd, type, error = 0;
281 int bitsize;
282 zfs_userquota_prop_t quota_type;
283 struct dqblk64 dqblk = { 0 };
285 td = curthread;
286 cmd = cmds >> SUBCMDSHIFT;
287 type = cmds & SUBCMDMASK;
289 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
290 return (error);
291 if (id == -1) {
292 switch (type) {
293 case USRQUOTA:
294 id = td->td_ucred->cr_ruid;
295 break;
296 case GRPQUOTA:
297 id = td->td_ucred->cr_rgid;
298 break;
299 default:
300 error = EINVAL;
301 #if __FreeBSD_version < 1400018
302 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
303 vfs_unbusy(vfsp);
304 #endif
305 goto done;
309 * Map BSD type to:
310 * ZFS_PROP_USERUSED,
311 * ZFS_PROP_USERQUOTA,
312 * ZFS_PROP_GROUPUSED,
313 * ZFS_PROP_GROUPQUOTA
315 switch (cmd) {
316 case Q_SETQUOTA:
317 case Q_SETQUOTA32:
318 if (type == USRQUOTA)
319 quota_type = ZFS_PROP_USERQUOTA;
320 else if (type == GRPQUOTA)
321 quota_type = ZFS_PROP_GROUPQUOTA;
322 else
323 error = EINVAL;
324 break;
325 case Q_GETQUOTA:
326 case Q_GETQUOTA32:
327 if (type == USRQUOTA)
328 quota_type = ZFS_PROP_USERUSED;
329 else if (type == GRPQUOTA)
330 quota_type = ZFS_PROP_GROUPUSED;
331 else
332 error = EINVAL;
333 break;
337 * Depending on the cmd, we may need to get
338 * the ruid and domain (see fuidstr_to_sid?),
339 * the fuid (how?), or other information.
340 * Create fuid using zfs_fuid_create(zfsvfs, id,
341 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
342 * I think I can use just the id?
344 * Look at zfs_id_overquota() to look up a quota.
345 * zap_lookup(something, quotaobj, fuidstring,
346 * sizeof (long long), 1, &quota)
348 * See zfs_set_userquota() to set a quota.
350 if ((uint32_t)type >= MAXQUOTAS) {
351 error = EINVAL;
352 goto done;
355 switch (cmd) {
356 case Q_GETQUOTASIZE:
357 bitsize = 64;
358 error = copyout(&bitsize, arg, sizeof (int));
359 break;
360 case Q_QUOTAON:
361 // As far as I can tell, you can't turn quotas on or off on zfs
362 error = 0;
363 #if __FreeBSD_version < 1400018
364 vfs_unbusy(vfsp);
365 #endif
366 break;
367 case Q_QUOTAOFF:
368 error = ENOTSUP;
369 #if __FreeBSD_version < 1400018
370 vfs_unbusy(vfsp);
371 #endif
372 break;
373 case Q_SETQUOTA:
374 error = copyin(arg, &dqblk, sizeof (dqblk));
375 if (error == 0)
376 error = zfs_set_userquota(zfsvfs, quota_type,
377 "", id, dbtob(dqblk.dqb_bhardlimit));
378 break;
379 case Q_GETQUOTA:
380 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
381 if (error == 0)
382 error = copyout(&dqblk, arg, sizeof (dqblk));
383 break;
384 default:
385 error = EINVAL;
386 break;
388 done:
389 zfs_exit(zfsvfs, FTAG);
390 return (error);
394 boolean_t
395 zfs_is_readonly(zfsvfs_t *zfsvfs)
397 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
400 static int
401 zfs_sync(vfs_t *vfsp, int waitfor)
405 * Data integrity is job one. We don't want a compromised kernel
406 * writing to the storage pool, so we never sync during panic.
408 if (panicstr)
409 return (0);
412 * Ignore the system syncher. ZFS already commits async data
413 * at zfs_txg_timeout intervals.
415 if (waitfor == MNT_LAZY)
416 return (0);
418 if (vfsp != NULL) {
420 * Sync a specific filesystem.
422 zfsvfs_t *zfsvfs = vfsp->vfs_data;
423 dsl_pool_t *dp;
424 int error;
426 error = vfs_stdsync(vfsp, waitfor);
427 if (error != 0)
428 return (error);
430 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
431 return (error);
432 dp = dmu_objset_pool(zfsvfs->z_os);
435 * If the system is shutting down, then skip any
436 * filesystems which may exist on a suspended pool.
438 if (rebooting && spa_suspended(dp->dp_spa)) {
439 zfs_exit(zfsvfs, FTAG);
440 return (0);
443 if (zfsvfs->z_log != NULL)
444 zil_commit(zfsvfs->z_log, 0);
446 zfs_exit(zfsvfs, FTAG);
447 } else {
449 * Sync all ZFS filesystems. This is what happens when you
450 * run sync(8). Unlike other filesystems, ZFS honors the
451 * request by waiting for all pools to commit all dirty data.
453 spa_sync_allpools();
456 return (0);
459 static void
460 atime_changed_cb(void *arg, uint64_t newval)
462 zfsvfs_t *zfsvfs = arg;
464 if (newval == TRUE) {
465 zfsvfs->z_atime = TRUE;
466 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
467 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
468 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
469 } else {
470 zfsvfs->z_atime = FALSE;
471 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
472 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
473 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
477 static void
478 xattr_changed_cb(void *arg, uint64_t newval)
480 zfsvfs_t *zfsvfs = arg;
482 if (newval == ZFS_XATTR_OFF) {
483 zfsvfs->z_flags &= ~ZSB_XATTR;
484 } else {
485 zfsvfs->z_flags |= ZSB_XATTR;
487 if (newval == ZFS_XATTR_SA)
488 zfsvfs->z_xattr_sa = B_TRUE;
489 else
490 zfsvfs->z_xattr_sa = B_FALSE;
494 static void
495 blksz_changed_cb(void *arg, uint64_t newval)
497 zfsvfs_t *zfsvfs = arg;
498 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
499 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
500 ASSERT(ISP2(newval));
502 zfsvfs->z_max_blksz = newval;
503 zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
506 static void
507 readonly_changed_cb(void *arg, uint64_t newval)
509 zfsvfs_t *zfsvfs = arg;
511 if (newval) {
512 /* XXX locking on vfs_flag? */
513 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
514 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
515 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
516 } else {
517 /* XXX locking on vfs_flag? */
518 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
519 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
520 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
524 static void
525 setuid_changed_cb(void *arg, uint64_t newval)
527 zfsvfs_t *zfsvfs = arg;
529 if (newval == FALSE) {
530 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
531 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
532 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
533 } else {
534 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
535 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
536 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
540 static void
541 exec_changed_cb(void *arg, uint64_t newval)
543 zfsvfs_t *zfsvfs = arg;
545 if (newval == FALSE) {
546 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
547 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
548 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
549 } else {
550 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
551 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
552 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
557 * The nbmand mount option can be changed at mount time.
558 * We can't allow it to be toggled on live file systems or incorrect
559 * behavior may be seen from cifs clients
561 * This property isn't registered via dsl_prop_register(), but this callback
562 * will be called when a file system is first mounted
564 static void
565 nbmand_changed_cb(void *arg, uint64_t newval)
567 zfsvfs_t *zfsvfs = arg;
568 if (newval == FALSE) {
569 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
570 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
571 } else {
572 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
573 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
577 static void
578 snapdir_changed_cb(void *arg, uint64_t newval)
580 zfsvfs_t *zfsvfs = arg;
582 zfsvfs->z_show_ctldir = newval;
585 static void
586 acl_mode_changed_cb(void *arg, uint64_t newval)
588 zfsvfs_t *zfsvfs = arg;
590 zfsvfs->z_acl_mode = newval;
593 static void
594 acl_inherit_changed_cb(void *arg, uint64_t newval)
596 zfsvfs_t *zfsvfs = arg;
598 zfsvfs->z_acl_inherit = newval;
601 static void
602 acl_type_changed_cb(void *arg, uint64_t newval)
604 zfsvfs_t *zfsvfs = arg;
606 zfsvfs->z_acl_type = newval;
609 static int
610 zfs_register_callbacks(vfs_t *vfsp)
612 struct dsl_dataset *ds = NULL;
613 objset_t *os = NULL;
614 zfsvfs_t *zfsvfs = NULL;
615 uint64_t nbmand;
616 boolean_t readonly = B_FALSE;
617 boolean_t do_readonly = B_FALSE;
618 boolean_t setuid = B_FALSE;
619 boolean_t do_setuid = B_FALSE;
620 boolean_t exec = B_FALSE;
621 boolean_t do_exec = B_FALSE;
622 boolean_t xattr = B_FALSE;
623 boolean_t atime = B_FALSE;
624 boolean_t do_atime = B_FALSE;
625 boolean_t do_xattr = B_FALSE;
626 int error = 0;
628 ASSERT3P(vfsp, !=, NULL);
629 zfsvfs = vfsp->vfs_data;
630 ASSERT3P(zfsvfs, !=, NULL);
631 os = zfsvfs->z_os;
634 * This function can be called for a snapshot when we update snapshot's
635 * mount point, which isn't really supported.
637 if (dmu_objset_is_snapshot(os))
638 return (EOPNOTSUPP);
641 * The act of registering our callbacks will destroy any mount
642 * options we may have. In order to enable temporary overrides
643 * of mount options, we stash away the current values and
644 * restore them after we register the callbacks.
646 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
647 !spa_writeable(dmu_objset_spa(os))) {
648 readonly = B_TRUE;
649 do_readonly = B_TRUE;
650 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
651 readonly = B_FALSE;
652 do_readonly = B_TRUE;
654 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
655 setuid = B_FALSE;
656 do_setuid = B_TRUE;
657 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
658 setuid = B_TRUE;
659 do_setuid = B_TRUE;
661 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
662 exec = B_FALSE;
663 do_exec = B_TRUE;
664 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
665 exec = B_TRUE;
666 do_exec = B_TRUE;
668 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
669 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
670 do_xattr = B_TRUE;
671 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
672 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
673 do_xattr = B_TRUE;
674 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
675 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
676 do_xattr = B_TRUE;
677 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
678 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
679 do_xattr = B_TRUE;
681 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
682 atime = B_FALSE;
683 do_atime = B_TRUE;
684 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
685 atime = B_TRUE;
686 do_atime = B_TRUE;
690 * We need to enter pool configuration here, so that we can use
691 * dsl_prop_get_int_ds() to handle the special nbmand property below.
692 * dsl_prop_get_integer() can not be used, because it has to acquire
693 * spa_namespace_lock and we can not do that because we already hold
694 * z_teardown_lock. The problem is that spa_write_cachefile() is called
695 * with spa_namespace_lock held and the function calls ZFS vnode
696 * operations to write the cache file and thus z_teardown_lock is
697 * acquired after spa_namespace_lock.
699 ds = dmu_objset_ds(os);
700 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
703 * nbmand is a special property. It can only be changed at
704 * mount time.
706 * This is weird, but it is documented to only be changeable
707 * at mount time.
709 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
710 nbmand = B_FALSE;
711 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
712 nbmand = B_TRUE;
713 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
714 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
715 return (error);
719 * Register property callbacks.
721 * It would probably be fine to just check for i/o error from
722 * the first prop_register(), but I guess I like to go
723 * overboard...
725 error = dsl_prop_register(ds,
726 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
727 error = error ? error : dsl_prop_register(ds,
728 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
729 error = error ? error : dsl_prop_register(ds,
730 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
731 error = error ? error : dsl_prop_register(ds,
732 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
733 error = error ? error : dsl_prop_register(ds,
734 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
735 error = error ? error : dsl_prop_register(ds,
736 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
737 error = error ? error : dsl_prop_register(ds,
738 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
739 error = error ? error : dsl_prop_register(ds,
740 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
741 error = error ? error : dsl_prop_register(ds,
742 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
743 error = error ? error : dsl_prop_register(ds,
744 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
745 zfsvfs);
746 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
747 if (error)
748 goto unregister;
751 * Invoke our callbacks to restore temporary mount options.
753 if (do_readonly)
754 readonly_changed_cb(zfsvfs, readonly);
755 if (do_setuid)
756 setuid_changed_cb(zfsvfs, setuid);
757 if (do_exec)
758 exec_changed_cb(zfsvfs, exec);
759 if (do_xattr)
760 xattr_changed_cb(zfsvfs, xattr);
761 if (do_atime)
762 atime_changed_cb(zfsvfs, atime);
764 nbmand_changed_cb(zfsvfs, nbmand);
766 return (0);
768 unregister:
769 dsl_prop_unregister_all(ds, zfsvfs);
770 return (error);
774 * Associate this zfsvfs with the given objset, which must be owned.
775 * This will cache a bunch of on-disk state from the objset in the
776 * zfsvfs.
778 static int
779 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
781 int error;
782 uint64_t val;
784 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
785 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
786 zfsvfs->z_os = os;
788 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
789 if (error != 0)
790 return (error);
791 if (zfsvfs->z_version >
792 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
793 (void) printf("Can't mount a version %lld file system "
794 "on a version %lld pool\n. Pool must be upgraded to mount "
795 "this file system.", (u_longlong_t)zfsvfs->z_version,
796 (u_longlong_t)spa_version(dmu_objset_spa(os)));
797 return (SET_ERROR(ENOTSUP));
799 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
800 if (error != 0)
801 return (error);
802 zfsvfs->z_norm = (int)val;
804 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
805 if (error != 0)
806 return (error);
807 zfsvfs->z_utf8 = (val != 0);
809 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
810 if (error != 0)
811 return (error);
812 zfsvfs->z_case = (uint_t)val;
814 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
815 if (error != 0)
816 return (error);
817 zfsvfs->z_acl_type = (uint_t)val;
820 * Fold case on file systems that are always or sometimes case
821 * insensitive.
823 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
824 zfsvfs->z_case == ZFS_CASE_MIXED)
825 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
827 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
828 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
830 uint64_t sa_obj = 0;
831 if (zfsvfs->z_use_sa) {
832 /* should either have both of these objects or none */
833 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
834 &sa_obj);
835 if (error != 0)
836 return (error);
838 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
839 if (error == 0 && val == ZFS_XATTR_SA)
840 zfsvfs->z_xattr_sa = B_TRUE;
843 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
844 &zfsvfs->z_attr_table);
845 if (error != 0)
846 return (error);
848 if (zfsvfs->z_version >= ZPL_VERSION_SA)
849 sa_register_update_callback(os, zfs_sa_upgrade);
851 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
852 &zfsvfs->z_root);
853 if (error != 0)
854 return (error);
855 ASSERT3U(zfsvfs->z_root, !=, 0);
857 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
858 &zfsvfs->z_unlinkedobj);
859 if (error != 0)
860 return (error);
862 error = zap_lookup(os, MASTER_NODE_OBJ,
863 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
864 8, 1, &zfsvfs->z_userquota_obj);
865 if (error == ENOENT)
866 zfsvfs->z_userquota_obj = 0;
867 else if (error != 0)
868 return (error);
870 error = zap_lookup(os, MASTER_NODE_OBJ,
871 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
872 8, 1, &zfsvfs->z_groupquota_obj);
873 if (error == ENOENT)
874 zfsvfs->z_groupquota_obj = 0;
875 else if (error != 0)
876 return (error);
878 error = zap_lookup(os, MASTER_NODE_OBJ,
879 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
880 8, 1, &zfsvfs->z_projectquota_obj);
881 if (error == ENOENT)
882 zfsvfs->z_projectquota_obj = 0;
883 else if (error != 0)
884 return (error);
886 error = zap_lookup(os, MASTER_NODE_OBJ,
887 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
888 8, 1, &zfsvfs->z_userobjquota_obj);
889 if (error == ENOENT)
890 zfsvfs->z_userobjquota_obj = 0;
891 else if (error != 0)
892 return (error);
894 error = zap_lookup(os, MASTER_NODE_OBJ,
895 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
896 8, 1, &zfsvfs->z_groupobjquota_obj);
897 if (error == ENOENT)
898 zfsvfs->z_groupobjquota_obj = 0;
899 else if (error != 0)
900 return (error);
902 error = zap_lookup(os, MASTER_NODE_OBJ,
903 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
904 8, 1, &zfsvfs->z_projectobjquota_obj);
905 if (error == ENOENT)
906 zfsvfs->z_projectobjquota_obj = 0;
907 else if (error != 0)
908 return (error);
910 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
911 &zfsvfs->z_fuid_obj);
912 if (error == ENOENT)
913 zfsvfs->z_fuid_obj = 0;
914 else if (error != 0)
915 return (error);
917 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
918 &zfsvfs->z_shares_dir);
919 if (error == ENOENT)
920 zfsvfs->z_shares_dir = 0;
921 else if (error != 0)
922 return (error);
925 * Only use the name cache if we are looking for a
926 * name on a file system that does not require normalization
927 * or case folding. We can also look there if we happen to be
928 * on a non-normalizing, mixed sensitivity file system IF we
929 * are looking for the exact name (which is always the case on
930 * FreeBSD).
932 zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
933 ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
934 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
936 return (0);
939 taskq_t *zfsvfs_taskq;
941 static void
942 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
945 zfs_unlinked_drain((zfsvfs_t *)context);
949 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
951 objset_t *os;
952 zfsvfs_t *zfsvfs;
953 int error;
954 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
957 * XXX: Fix struct statfs so this isn't necessary!
959 * The 'osname' is used as the filesystem's special node, which means
960 * it must fit in statfs.f_mntfromname, or else it can't be
961 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
962 * 'zfs unmount' to think it's not mounted when it is.
964 if (strlen(osname) >= MNAMELEN)
965 return (SET_ERROR(ENAMETOOLONG));
967 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
969 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
970 &os);
971 if (error != 0) {
972 kmem_free(zfsvfs, sizeof (zfsvfs_t));
973 return (error);
976 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
978 return (error);
983 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
985 int error;
987 zfsvfs->z_vfs = NULL;
988 zfsvfs->z_parent = zfsvfs;
990 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
991 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
992 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
993 offsetof(znode_t, z_link_node));
994 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
995 zfsvfs_task_unlinked_drain, zfsvfs);
996 ZFS_TEARDOWN_INIT(zfsvfs);
997 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
998 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
999 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1000 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1002 error = zfsvfs_init(zfsvfs, os);
1003 if (error != 0) {
1004 dmu_objset_disown(os, B_TRUE, zfsvfs);
1005 *zfvp = NULL;
1006 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1007 return (error);
1010 *zfvp = zfsvfs;
1011 return (0);
1014 static int
1015 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1017 int error;
1020 * Check for a bad on-disk format version now since we
1021 * lied about owning the dataset readonly before.
1023 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
1024 dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
1025 return (SET_ERROR(EROFS));
1027 error = zfs_register_callbacks(zfsvfs->z_vfs);
1028 if (error)
1029 return (error);
1032 * If we are not mounting (ie: online recv), then we don't
1033 * have to worry about replaying the log as we blocked all
1034 * operations out since we closed the ZIL.
1036 if (mounting) {
1037 boolean_t readonly;
1039 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
1040 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
1041 if (error)
1042 return (error);
1043 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1044 &zfsvfs->z_kstat.dk_zil_sums);
1047 * During replay we remove the read only flag to
1048 * allow replays to succeed.
1050 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1051 if (readonly != 0) {
1052 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1053 } else {
1054 dsl_dir_t *dd;
1055 zap_stats_t zs;
1057 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
1058 &zs) == 0) {
1059 dataset_kstats_update_nunlinks_kstat(
1060 &zfsvfs->z_kstat, zs.zs_num_entries);
1061 dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
1062 "num_entries in unlinked set: %llu",
1063 (u_longlong_t)zs.zs_num_entries);
1066 zfs_unlinked_drain(zfsvfs);
1067 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1068 dd->dd_activity_cancelled = B_FALSE;
1072 * Parse and replay the intent log.
1074 * Because of ziltest, this must be done after
1075 * zfs_unlinked_drain(). (Further note: ziltest
1076 * doesn't use readonly mounts, where
1077 * zfs_unlinked_drain() isn't called.) This is because
1078 * ziltest causes spa_sync() to think it's committed,
1079 * but actually it is not, so the intent log contains
1080 * many txg's worth of changes.
1082 * In particular, if object N is in the unlinked set in
1083 * the last txg to actually sync, then it could be
1084 * actually freed in a later txg and then reallocated
1085 * in a yet later txg. This would write a "create
1086 * object N" record to the intent log. Normally, this
1087 * would be fine because the spa_sync() would have
1088 * written out the fact that object N is free, before
1089 * we could write the "create object N" intent log
1090 * record.
1092 * But when we are in ziltest mode, we advance the "open
1093 * txg" without actually spa_sync()-ing the changes to
1094 * disk. So we would see that object N is still
1095 * allocated and in the unlinked set, and there is an
1096 * intent log record saying to allocate it.
1098 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1099 if (zil_replay_disable) {
1100 zil_destroy(zfsvfs->z_log, B_FALSE);
1101 } else {
1102 boolean_t use_nc = zfsvfs->z_use_namecache;
1103 zfsvfs->z_use_namecache = B_FALSE;
1104 zfsvfs->z_replay = B_TRUE;
1105 zil_replay(zfsvfs->z_os, zfsvfs,
1106 zfs_replay_vector);
1107 zfsvfs->z_replay = B_FALSE;
1108 zfsvfs->z_use_namecache = use_nc;
1112 /* restore readonly bit */
1113 if (readonly != 0)
1114 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1115 } else {
1116 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
1117 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1118 &zfsvfs->z_kstat.dk_zil_sums);
1122 * Set the objset user_ptr to track its zfsvfs.
1124 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1125 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1126 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1128 return (0);
1131 void
1132 zfsvfs_free(zfsvfs_t *zfsvfs)
1134 int i;
1136 zfs_fuid_destroy(zfsvfs);
1138 mutex_destroy(&zfsvfs->z_znodes_lock);
1139 mutex_destroy(&zfsvfs->z_lock);
1140 ASSERT3U(zfsvfs->z_nr_znodes, ==, 0);
1141 list_destroy(&zfsvfs->z_all_znodes);
1142 ZFS_TEARDOWN_DESTROY(zfsvfs);
1143 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1144 rw_destroy(&zfsvfs->z_fuid_lock);
1145 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1146 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1147 dataset_kstats_destroy(&zfsvfs->z_kstat);
1148 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1151 static void
1152 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1154 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1155 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1158 static int
1159 zfs_domount(vfs_t *vfsp, char *osname)
1161 uint64_t recordsize, fsid_guid;
1162 int error = 0;
1163 zfsvfs_t *zfsvfs;
1165 ASSERT3P(vfsp, !=, NULL);
1166 ASSERT3P(osname, !=, NULL);
1168 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
1169 if (error)
1170 return (error);
1171 zfsvfs->z_vfs = vfsp;
1173 if ((error = dsl_prop_get_integer(osname,
1174 "recordsize", &recordsize, NULL)))
1175 goto out;
1176 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1177 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1179 vfsp->vfs_data = zfsvfs;
1180 vfsp->mnt_flag |= MNT_LOCAL;
1181 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1182 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1183 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1185 * This can cause a loss of coherence between ARC and page cache
1186 * on ZoF - unclear if the problem is in FreeBSD or ZoF
1188 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
1189 vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
1190 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
1192 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1193 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
1194 #endif
1196 * The fsid is 64 bits, composed of an 8-bit fs type, which
1197 * separates our fsid from any other filesystem types, and a
1198 * 56-bit objset unique ID. The objset unique ID is unique to
1199 * all objsets open on this system, provided by unique_create().
1200 * The 8-bit fs type must be put in the low bits of fsid[1]
1201 * because that's where other Solaris filesystems put it.
1203 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1204 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
1205 vfsp->vfs_fsid.val[0] = fsid_guid;
1206 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
1207 (vfsp->mnt_vfc->vfc_typenum & 0xFF);
1210 * Set features for file system.
1212 zfs_set_fuid_feature(zfsvfs);
1214 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1215 uint64_t pval;
1217 atime_changed_cb(zfsvfs, B_FALSE);
1218 readonly_changed_cb(zfsvfs, B_TRUE);
1219 if ((error = dsl_prop_get_integer(osname,
1220 "xattr", &pval, NULL)))
1221 goto out;
1222 xattr_changed_cb(zfsvfs, pval);
1223 if ((error = dsl_prop_get_integer(osname,
1224 "acltype", &pval, NULL)))
1225 goto out;
1226 acl_type_changed_cb(zfsvfs, pval);
1227 zfsvfs->z_issnap = B_TRUE;
1228 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1230 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1231 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1232 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1233 } else {
1234 if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
1235 goto out;
1238 vfs_mountedfrom(vfsp, osname);
1240 if (!zfsvfs->z_issnap)
1241 zfsctl_create(zfsvfs);
1242 out:
1243 if (error) {
1244 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1245 zfsvfs_free(zfsvfs);
1246 } else {
1247 atomic_inc_32(&zfs_active_fs_count);
1250 return (error);
1253 static void
1254 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1256 objset_t *os = zfsvfs->z_os;
1258 if (!dmu_objset_is_snapshot(os))
1259 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
/*
 * Copy the pool component of a dataset name into 'poolname'.
 *
 * The pool component is everything before the first '/', or the whole of
 * 'osname' when it contains no '/'.
 *
 * osname:   NUL-terminated dataset name.
 * poolname: destination buffer; must hold at least MAXNAMELEN bytes.
 *
 * Returns 0 on success, or ENAMETOOLONG (leaving 'poolname' untouched) when
 * the pool component would not fit in MAXNAMELEN bytes including the NUL.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	/* strchr() on a const string: keep the result const as well. */
	const char *slash = strchr(osname, '/');
	size_t len;

	len = (slash != NULL) ? (size_t)(slash - osname) : strlen(osname);
	if (len >= MAXNAMELEN)
		return (ENAMETOOLONG);

	/* Single bounded copy for both the "pool" and "pool/..." cases. */
	memcpy(poolname, osname, len);
	poolname[len] = '\0';
	return (0);
}
/*
 * Detect and strip the leading '!' option marker from a dataset name.
 *
 * *checkpointrewind is set to true when 'name' starts with '!', in which
 * case the marker is removed in place; otherwise it is set to false and
 * 'name' is left alone.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	if (name[0] != '!') {
		*checkpointrewind = false;
		return;
	}

	*checkpointrewind = true;
	/*
	 * strlen(name) bytes starting at name + 1 cover the rest of the
	 * string plus its terminating NUL.
	 */
	memmove(name, name + 1, strlen(name));
}
1292 static int
1293 zfs_mount(vfs_t *vfsp)
1295 kthread_t *td = curthread;
1296 vnode_t *mvp = vfsp->mnt_vnodecovered;
1297 cred_t *cr = td->td_ucred;
1298 char *osname;
1299 int error = 0;
1300 int canwrite;
1301 bool checkpointrewind;
1303 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1304 return (SET_ERROR(EINVAL));
1307 * If full-owner-access is enabled and delegated administration is
1308 * turned on, we must set nosuid.
1310 if (zfs_super_owner &&
1311 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1312 secpolicy_fs_mount_clearopts(cr, vfsp);
1315 fetch_osname_options(osname, &checkpointrewind);
1318 * Check for mount privilege?
1320 * If we don't have privilege then see if
1321 * we have local permission to allow it
1323 error = secpolicy_fs_mount(cr, mvp, vfsp);
1324 if (error) {
1325 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1326 goto out;
1328 if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1329 vattr_t vattr;
1332 * Make sure user is the owner of the mount point
1333 * or has sufficient privileges.
1336 vattr.va_mask = AT_UID;
1338 vn_lock(mvp, LK_SHARED | LK_RETRY);
1339 if (VOP_GETATTR(mvp, &vattr, cr)) {
1340 VOP_UNLOCK1(mvp);
1341 goto out;
1344 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1345 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1346 VOP_UNLOCK1(mvp);
1347 goto out;
1349 VOP_UNLOCK1(mvp);
1352 secpolicy_fs_mount_clearopts(cr, vfsp);
1356 * Refuse to mount a filesystem if we are in a local zone and the
1357 * dataset is not visible.
1359 if (!INGLOBALZONE(curproc) &&
1360 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1361 error = SET_ERROR(EPERM);
1362 goto out;
1365 vfsp->vfs_flag |= MNT_NFS4ACLS;
1368 * When doing a remount, we simply refresh our temporary properties
1369 * according to those options set in the current VFS options.
1371 if (vfsp->vfs_flag & MS_REMOUNT) {
1372 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1375 * Refresh mount options with z_teardown_lock blocking I/O while
1376 * the filesystem is in an inconsistent state.
1377 * The lock also serializes this code with filesystem
1378 * manipulations between entry to zfs_suspend_fs() and return
1379 * from zfs_resume_fs().
1381 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1382 zfs_unregister_callbacks(zfsvfs);
1383 error = zfs_register_callbacks(vfsp);
1384 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1385 goto out;
1388 /* Initial root mount: try hard to import the requested root pool. */
1389 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1390 (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1391 char pname[MAXNAMELEN];
1393 error = getpoolname(osname, pname);
1394 if (error == 0)
1395 error = spa_import_rootpool(pname, checkpointrewind);
1396 if (error)
1397 goto out;
1399 DROP_GIANT();
1400 error = zfs_domount(vfsp, osname);
1401 PICKUP_GIANT();
1403 out:
1404 return (error);
1407 static int
1408 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1410 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1411 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1412 int error;
1414 statp->f_version = STATFS_VERSION;
1416 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1417 return (error);
1419 dmu_objset_space(zfsvfs->z_os,
1420 &refdbytes, &availbytes, &usedobjs, &availobjs);
1423 * The underlying storage pool actually uses multiple block sizes.
1424 * We report the fragsize as the smallest block size we support,
1425 * and we report our blocksize as the filesystem's maximum blocksize.
1427 statp->f_bsize = SPA_MINBLOCKSIZE;
1428 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1431 * The following report "total" blocks of various kinds in the
1432 * file system, but reported in terms of f_frsize - the
1433 * "fragment" size.
1436 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1437 statp->f_bfree = availbytes / statp->f_bsize;
1438 statp->f_bavail = statp->f_bfree; /* no root reservation */
1441 * statvfs() should really be called statufs(), because it assumes
1442 * static metadata. ZFS doesn't preallocate files, so the best
1443 * we can do is report the max that could possibly fit in f_files,
1444 * and that minus the number actually used in f_ffree.
1445 * For f_ffree, report the smaller of the number of object available
1446 * and the number of blocks (each object will take at least a block).
1448 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1449 statp->f_files = statp->f_ffree + usedobjs;
1452 * We're a zfs filesystem.
1454 strlcpy(statp->f_fstypename, "zfs",
1455 sizeof (statp->f_fstypename));
1457 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1458 sizeof (statp->f_mntfromname));
1459 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1460 sizeof (statp->f_mntonname));
1462 statp->f_namemax = MAXNAMELEN - 1;
1464 zfs_exit(zfsvfs, FTAG);
1465 return (0);
1468 static int
1469 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1471 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1472 znode_t *rootzp;
1473 int error;
1475 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1476 return (error);
1478 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1479 if (error == 0)
1480 *vpp = ZTOV(rootzp);
1482 zfs_exit(zfsvfs, FTAG);
1484 if (error == 0) {
1485 error = vn_lock(*vpp, flags);
1486 if (error != 0) {
1487 VN_RELE(*vpp);
1488 *vpp = NULL;
1491 return (error);
1495 * Teardown the zfsvfs::z_os.
1497 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1498 * and 'z_teardown_inactive_lock' held.
1500 static int
1501 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1503 znode_t *zp;
1504 dsl_dir_t *dd;
1507 * If someone has not already unmounted this file system,
1508 * drain the zrele_taskq to ensure all active references to the
1509 * zfsvfs_t have been handled only then can it be safely destroyed.
1511 if (zfsvfs->z_os) {
1513 * If we're unmounting we have to wait for the list to
1514 * drain completely.
1516 * If we're not unmounting there's no guarantee the list
1517 * will drain completely, but zreles run from the taskq
1518 * may add the parents of dir-based xattrs to the taskq
1519 * so we want to wait for these.
1521 * We can safely read z_nr_znodes without locking because the
1522 * VFS has already blocked operations which add to the
1523 * z_all_znodes list and thus increment z_nr_znodes.
1525 int round = 0;
1526 while (zfsvfs->z_nr_znodes > 0) {
1527 taskq_wait_outstanding(dsl_pool_zrele_taskq(
1528 dmu_objset_pool(zfsvfs->z_os)), 0);
1529 if (++round > 1 && !unmounting)
1530 break;
1533 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1535 if (!unmounting) {
1537 * We purge the parent filesystem's vfsp as the parent
1538 * filesystem and all of its snapshots have their vnode's
1539 * v_vfsp set to the parent's filesystem's vfsp. Note,
1540 * 'z_parent' is self referential for non-snapshots.
1542 #ifdef FREEBSD_NAMECACHE
1543 #if __FreeBSD_version >= 1300117
1544 cache_purgevfs(zfsvfs->z_parent->z_vfs);
1545 #else
1546 cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
1547 #endif
1548 #endif
1552 * Close the zil. NB: Can't close the zil while zfs_inactive
1553 * threads are blocked as zil_close can call zfs_inactive.
1555 if (zfsvfs->z_log) {
1556 zil_close(zfsvfs->z_log);
1557 zfsvfs->z_log = NULL;
1560 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
1563 * If we are not unmounting (ie: online recv) and someone already
1564 * unmounted this file system while we were doing the switcheroo,
1565 * or a reopen of z_os failed then just bail out now.
1567 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1568 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1569 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1570 return (SET_ERROR(EIO));
1574 * At this point there are no vops active, and any new vops will
1575 * fail with EIO since we have z_teardown_lock for writer (only
1576 * relevant for forced unmount).
1578 * Release all holds on dbufs.
1580 mutex_enter(&zfsvfs->z_znodes_lock);
1581 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1582 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1583 if (zp->z_sa_hdl != NULL) {
1584 zfs_znode_dmu_fini(zp);
1587 mutex_exit(&zfsvfs->z_znodes_lock);
1590 * If we are unmounting, set the unmounted flag and let new vops
1591 * unblock. zfs_inactive will have the unmounted behavior, and all
1592 * other vops will fail with EIO.
1594 if (unmounting) {
1595 zfsvfs->z_unmounted = B_TRUE;
1596 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1597 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1601 * z_os will be NULL if there was an error in attempting to reopen
1602 * zfsvfs, so just return as the properties had already been
1603 * unregistered and cached data had been evicted before.
1605 if (zfsvfs->z_os == NULL)
1606 return (0);
1609 * Unregister properties.
1611 zfs_unregister_callbacks(zfsvfs);
1614 * Evict cached data
1616 if (!zfs_is_readonly(zfsvfs))
1617 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1618 dmu_objset_evict_dbufs(zfsvfs->z_os);
1619 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1620 dsl_dir_cancel_waiters(dd);
1622 return (0);
1625 static int
1626 zfs_umount(vfs_t *vfsp, int fflag)
1628 kthread_t *td = curthread;
1629 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1630 objset_t *os;
1631 cred_t *cr = td->td_ucred;
1632 int ret;
1634 ret = secpolicy_fs_unmount(cr, vfsp);
1635 if (ret) {
1636 if (dsl_deleg_access((char *)vfsp->vfs_resource,
1637 ZFS_DELEG_PERM_MOUNT, cr))
1638 return (ret);
1642 * Unmount any snapshots mounted under .zfs before unmounting the
1643 * dataset itself.
1645 if (zfsvfs->z_ctldir != NULL) {
1646 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1647 return (ret);
1650 if (fflag & MS_FORCE) {
1652 * Mark file system as unmounted before calling
1653 * vflush(FORCECLOSE). This way we ensure no future vnops
1654 * will be called and risk operating on DOOMED vnodes.
1656 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1657 zfsvfs->z_unmounted = B_TRUE;
1658 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1662 * Flush all the files.
1664 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1665 if (ret != 0)
1666 return (ret);
1667 while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
1668 &zfsvfs->z_unlinked_drain_task, NULL) != 0)
1669 taskqueue_drain(zfsvfs_taskq->tq_queue,
1670 &zfsvfs->z_unlinked_drain_task);
1672 VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
1673 os = zfsvfs->z_os;
1676 * z_os will be NULL if there was an error in
1677 * attempting to reopen zfsvfs.
1679 if (os != NULL) {
1681 * Unset the objset user_ptr.
1683 mutex_enter(&os->os_user_ptr_lock);
1684 dmu_objset_set_user(os, NULL);
1685 mutex_exit(&os->os_user_ptr_lock);
1688 * Finally release the objset
1690 dmu_objset_disown(os, B_TRUE, zfsvfs);
1694 * We can now safely destroy the '.zfs' directory node.
1696 if (zfsvfs->z_ctldir != NULL)
1697 zfsctl_destroy(zfsvfs);
1698 zfs_freevfs(vfsp);
1700 return (0);
1703 static int
1704 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1706 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1707 znode_t *zp;
1708 int err;
1711 * zfs_zget() can't operate on virtual entries like .zfs/ or
1712 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1713 * This will make NFS to switch to LOOKUP instead of using VGET.
1715 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1716 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1717 return (EOPNOTSUPP);
1719 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1720 return (err);
1721 err = zfs_zget(zfsvfs, ino, &zp);
1722 if (err == 0 && zp->z_unlinked) {
1723 vrele(ZTOV(zp));
1724 err = EINVAL;
1726 if (err == 0)
1727 *vpp = ZTOV(zp);
1728 zfs_exit(zfsvfs, FTAG);
1729 if (err == 0) {
1730 err = vn_lock(*vpp, flags);
1731 if (err != 0)
1732 vrele(*vpp);
1734 if (err != 0)
1735 *vpp = NULL;
1736 return (err);
1739 static int
1740 #if __FreeBSD_version >= 1300098
1741 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1742 struct ucred **credanonp, int *numsecflavors, int *secflavors)
1743 #else
1744 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1745 struct ucred **credanonp, int *numsecflavors, int **secflavors)
1746 #endif
1748 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1751 * If this is regular file system vfsp is the same as
1752 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1753 * zfsvfs->z_parent->z_vfs represents parent file system
1754 * which we have to use here, because only this file system
1755 * has mnt_export configured.
1757 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1758 credanonp, numsecflavors, secflavors));
1761 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1762 "struct fid bigger than SHORT_FID_LEN");
1763 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1764 "struct fid bigger than LONG_FID_LEN");
1766 static int
1767 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1769 struct componentname cn;
1770 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1771 znode_t *zp;
1772 vnode_t *dvp;
1773 uint64_t object = 0;
1774 uint64_t fid_gen = 0;
1775 uint64_t setgen = 0;
1776 uint64_t gen_mask;
1777 uint64_t zp_gen;
1778 int i, err;
1780 *vpp = NULL;
1782 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1783 return (err);
1786 * On FreeBSD we can get snapshot's mount point or its parent file
1787 * system mount point depending if snapshot is already mounted or not.
1789 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1790 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1791 uint64_t objsetid = 0;
1793 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1794 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1796 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1797 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1799 zfs_exit(zfsvfs, FTAG);
1801 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1802 if (err)
1803 return (SET_ERROR(EINVAL));
1804 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1805 return (err);
1808 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1809 zfid_short_t *zfid = (zfid_short_t *)fidp;
1811 for (i = 0; i < sizeof (zfid->zf_object); i++)
1812 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1814 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1815 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1816 } else {
1817 zfs_exit(zfsvfs, FTAG);
1818 return (SET_ERROR(EINVAL));
1821 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1822 zfs_exit(zfsvfs, FTAG);
1823 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1824 (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1825 return (SET_ERROR(EINVAL));
1829 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1830 * directory tree. If the object == zfsvfs->z_shares_dir, then
1831 * we are in the .zfs/shares directory tree.
1833 if ((fid_gen == 0 &&
1834 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1835 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1836 zfs_exit(zfsvfs, FTAG);
1837 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1838 if (object == ZFSCTL_INO_SNAPDIR) {
1839 cn.cn_nameptr = "snapshot";
1840 cn.cn_namelen = strlen(cn.cn_nameptr);
1841 cn.cn_nameiop = LOOKUP;
1842 cn.cn_flags = ISLASTCN | LOCKLEAF;
1843 cn.cn_lkflags = flags;
1844 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1845 vput(dvp);
1846 } else if (object == zfsvfs->z_shares_dir) {
1848 * XXX This branch must not be taken,
1849 * if it is, then the lookup below will
1850 * explode.
1852 cn.cn_nameptr = "shares";
1853 cn.cn_namelen = strlen(cn.cn_nameptr);
1854 cn.cn_nameiop = LOOKUP;
1855 cn.cn_flags = ISLASTCN;
1856 cn.cn_lkflags = flags;
1857 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1858 vput(dvp);
1859 } else {
1860 *vpp = dvp;
1862 return (err);
1865 gen_mask = -1ULL >> (64 - 8 * i);
1867 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1868 (u_longlong_t)fid_gen,
1869 (u_longlong_t)gen_mask);
1870 if ((err = zfs_zget(zfsvfs, object, &zp))) {
1871 zfs_exit(zfsvfs, FTAG);
1872 return (err);
1874 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1875 sizeof (uint64_t));
1876 zp_gen = zp_gen & gen_mask;
1877 if (zp_gen == 0)
1878 zp_gen = 1;
1879 if (zp->z_unlinked || zp_gen != fid_gen) {
1880 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1881 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1882 vrele(ZTOV(zp));
1883 zfs_exit(zfsvfs, FTAG);
1884 return (SET_ERROR(EINVAL));
1887 *vpp = ZTOV(zp);
1888 zfs_exit(zfsvfs, FTAG);
1889 err = vn_lock(*vpp, flags);
1890 if (err == 0)
1891 vnode_create_vobject(*vpp, zp->z_size, curthread);
1892 else
1893 *vpp = NULL;
1894 return (err);
1898 * Block out VOPs and close zfsvfs_t::z_os
1900 * Note, if successful, then we return with the 'z_teardown_lock' and
1901 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1902 * dataset and objset intact so that they can be atomically handed off during
1903 * a subsequent rollback or recv operation and the resume thereafter.
1906 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1908 int error;
1910 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1911 return (error);
1913 return (0);
1917 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
1918 * is an invariant across any of the operations that can be performed while the
1919 * filesystem was suspended. Whether it succeeded or failed, the preconditions
1920 * are the same: the relevant objset and associated dataset are owned by
1921 * zfsvfs, held, and long held on entry.
1924 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1926 int err;
1927 znode_t *zp;
1929 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
1930 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
1933 * We already own this, so just update the objset_t, as the one we
1934 * had before may have been evicted.
1936 objset_t *os;
1937 VERIFY3P(ds->ds_owner, ==, zfsvfs);
1938 VERIFY(dsl_dataset_long_held(ds));
1939 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1940 dsl_pool_config_enter(dp, FTAG);
1941 VERIFY0(dmu_objset_from_ds(ds, &os));
1942 dsl_pool_config_exit(dp, FTAG);
1944 err = zfsvfs_init(zfsvfs, os);
1945 if (err != 0)
1946 goto bail;
1948 ds->ds_dir->dd_activity_cancelled = B_FALSE;
1949 VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
1951 zfs_set_fuid_feature(zfsvfs);
1954 * Attempt to re-establish all the active znodes with
1955 * their dbufs. If a zfs_rezget() fails, then we'll let
1956 * any potential callers discover that via zfs_enter_verify_zp
1957 * when they try to use their znode.
1959 mutex_enter(&zfsvfs->z_znodes_lock);
1960 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1961 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1962 (void) zfs_rezget(zp);
1964 mutex_exit(&zfsvfs->z_znodes_lock);
1966 bail:
1967 /* release the VOPs */
1968 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1969 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1971 if (err) {
1973 * Since we couldn't setup the sa framework, try to force
1974 * unmount this file system.
1976 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
1977 vfs_ref(zfsvfs->z_vfs);
1978 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1981 return (err);
1984 static void
1985 zfs_freevfs(vfs_t *vfsp)
1987 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1989 zfsvfs_free(zfsvfs);
1991 atomic_dec_32(&zfs_active_fs_count);
1994 #ifdef __i386__
1995 static int desiredvnodes_backup;
1996 #include <sys/vmmeter.h>
1999 #include <vm/vm_page.h>
2000 #include <vm/vm_object.h>
2001 #include <vm/vm_kern.h>
2002 #include <vm/vm_map.h>
2003 #endif
/*
 * On i386 only: save the current desiredvnodes and, if it still equals the
 * computed default, lower it to three quarters of that value.  A no-op on
 * every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int computed_default;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate the default the same way it is done in
	 * vntblinit().  If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	computed_default = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (desiredvnodes == computed_default)
		desiredvnodes = (3 * computed_default) / 4;
#endif
}
/*
 * On i386 only: restore desiredvnodes to the value saved by
 * zfs_vnodes_adjust().  A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2035 void
2036 zfs_init(void)
2039 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2042 * Initialize .zfs directory structures
2044 zfsctl_init();
2047 * Initialize znode cache, vnode ops, etc...
2049 zfs_znode_init();
2052 * Reduce number of vnodes. Originally number of vnodes is calculated
2053 * with UFS inode in mind. We reduce it here, because it's too big for
2054 * ZFS/i386.
2056 zfs_vnodes_adjust();
2058 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
2060 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2063 void
2064 zfs_fini(void)
2066 taskq_destroy(zfsvfs_taskq);
2067 zfsctl_fini();
2068 zfs_znode_fini();
2069 zfs_vnodes_adjust_back();
2073 zfs_busy(void)
2075 return (zfs_active_fs_count != 0);
2079 * Release VOPs and unmount a suspended filesystem.
2082 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2084 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2085 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2088 * We already own this, so just hold and rele it to update the
2089 * objset_t, as the one we had before may have been evicted.
2091 objset_t *os;
2092 VERIFY3P(ds->ds_owner, ==, zfsvfs);
2093 VERIFY(dsl_dataset_long_held(ds));
2094 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2095 dsl_pool_config_enter(dp, FTAG);
2096 VERIFY0(dmu_objset_from_ds(ds, &os));
2097 dsl_pool_config_exit(dp, FTAG);
2098 zfsvfs->z_os = os;
2100 /* release the VOPs */
2101 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2102 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2105 * Try to force unmount this file system.
2107 (void) zfs_umount(zfsvfs->z_vfs, 0);
2108 zfsvfs->z_unmounted = B_TRUE;
2109 return (0);
2113 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2115 int error;
2116 objset_t *os = zfsvfs->z_os;
2117 dmu_tx_t *tx;
2119 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2120 return (SET_ERROR(EINVAL));
2122 if (newvers < zfsvfs->z_version)
2123 return (SET_ERROR(EINVAL));
2125 if (zfs_spa_version_map(newvers) >
2126 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2127 return (SET_ERROR(ENOTSUP));
2129 tx = dmu_tx_create(os);
2130 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2131 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2132 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2133 ZFS_SA_ATTRS);
2134 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2136 error = dmu_tx_assign(tx, TXG_WAIT);
2137 if (error) {
2138 dmu_tx_abort(tx);
2139 return (error);
2142 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2143 8, 1, &newvers, tx);
2145 if (error) {
2146 dmu_tx_commit(tx);
2147 return (error);
2150 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2151 uint64_t sa_obj;
2153 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2154 SPA_VERSION_SA);
2155 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2156 DMU_OT_NONE, 0, tx);
2158 error = zap_add(os, MASTER_NODE_OBJ,
2159 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2160 ASSERT0(error);
2162 VERIFY0(sa_set_sa_object(os, sa_obj));
2163 sa_register_update_callback(os, zfs_sa_upgrade);
2166 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2167 "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2168 (uintmax_t)newvers);
2169 dmu_tx_commit(tx);
2171 zfsvfs->z_version = newvers;
2172 os->os_version = newvers;
2174 zfs_set_fuid_feature(zfsvfs);
2176 return (0);
2180 * Read a property stored within the master node.
2183 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2185 uint64_t *cached_copy = NULL;
2188 * Figure out where in the objset_t the cached copy would live, if it
2189 * is available for the requested property.
2191 if (os != NULL) {
2192 switch (prop) {
2193 case ZFS_PROP_VERSION:
2194 cached_copy = &os->os_version;
2195 break;
2196 case ZFS_PROP_NORMALIZE:
2197 cached_copy = &os->os_normalization;
2198 break;
2199 case ZFS_PROP_UTF8ONLY:
2200 cached_copy = &os->os_utf8only;
2201 break;
2202 case ZFS_PROP_CASE:
2203 cached_copy = &os->os_casesensitivity;
2204 break;
2205 default:
2206 break;
2209 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2210 *value = *cached_copy;
2211 return (0);
2215 * If the property wasn't cached, look up the file system's value for
2216 * the property. For the version property, we look up a slightly
2217 * different string.
2219 const char *pname;
2220 int error = ENOENT;
2221 if (prop == ZFS_PROP_VERSION) {
2222 pname = ZPL_VERSION_STR;
2223 } else {
2224 pname = zfs_prop_to_name(prop);
2227 if (os != NULL) {
2228 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2229 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2232 if (error == ENOENT) {
2233 /* No value set, use the default value */
2234 switch (prop) {
2235 case ZFS_PROP_VERSION:
2236 *value = ZPL_VERSION;
2237 break;
2238 case ZFS_PROP_NORMALIZE:
2239 case ZFS_PROP_UTF8ONLY:
2240 *value = 0;
2241 break;
2242 case ZFS_PROP_CASE:
2243 *value = ZFS_CASE_SENSITIVE;
2244 break;
2245 case ZFS_PROP_ACLTYPE:
2246 *value = ZFS_ACLTYPE_NFSV4;
2247 break;
2248 default:
2249 return (error);
2251 error = 0;
2255 * If one of the methods for getting the property value above worked,
2256 * copy it into the objset_t's cache.
2258 if (error == 0 && cached_copy != NULL) {
2259 *cached_copy = *value;
2262 return (error);
2266 * Return true if the corresponding vfs's unmounted flag is set.
2267 * Otherwise return false.
2268 * If this function returns true we know VFS unmount has been initiated.
2270 boolean_t
2271 zfs_get_vfs_flag_unmounted(objset_t *os)
2273 zfsvfs_t *zfvp;
2274 boolean_t unmounted = B_FALSE;
2276 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2278 mutex_enter(&os->os_user_ptr_lock);
2279 zfvp = dmu_objset_get_user(os);
2280 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2281 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2282 unmounted = B_TRUE;
2283 mutex_exit(&os->os_user_ptr_lock);
2285 return (unmounted);
#ifdef _KERNEL
/*
 * After a rename from 'oldname' to 'newname', rewrite the f_mntfromname of
 * every mount in the mount list that names the renamed dataset itself or
 * something below it ('/' child or '@' snapshot).
 *
 * The mount list is walked with mountlist_mtx held.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	struct mount *mp;
	size_t oldlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *fromname = mp->mnt_stat.f_mntfromname;

		if (strcmp(fromname, oldname) == 0) {
			/* Exact match: the renamed dataset itself. */
			(void) strlcpy(fromname, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
		} else if (strncmp(fromname, oldname, oldlen) == 0 &&
		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
			/*
			 * Prefix match: build the new name in a scratch
			 * buffer first, since the tail being appended lives
			 * in the destination string.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, fromname + oldlen);
			(void) strlcpy(fromname, renamed,
			    sizeof (mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif