/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 */

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
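
/*
 * Allocate a new inode for the VFS.  The znode and its embedded inode
 * are allocated together by zfs_inode_alloc(), and the inode version
 * counter is initialized before the inode is handed back.
 */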
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}
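
/*
 * Free an inode allocated by zpl_inode_alloc().  By the time the VFS
 * invokes this callback the inode must no longer be referenced.
 */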
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache.  If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
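
/*
 * Release the filesystem at umount.  ->put_super() returns void, so an
 * error from zfs_umount() can only be asserted on, not propagated.
 */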
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}
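
/*
 * Write back dirty filesystem state.  When 'wait' is set the VFS
 * requires the sync to complete before this callback returns.
 */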
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}
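
/*
 * Report filesystem statistics.  The heavy lifting is done by
 * zfs_statvfs(); the remainder adapts the results for 32-bit callers.
 */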
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
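
/*
 * Handle 'mount -o remount' by reapplying the mount options.  Only the
 * option string is passed down; the dataset name is left unchanged.
 */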
static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}
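
/*
 * Emit the dataset name used as the device name in /proc/self/mounts.
 */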
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;

	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}
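
/*
 * Emit the per-mount options (xattr, ACL type, case sensitivity)
 * reported in /proc/self/mounts.
 */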
static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}
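
/*
 * Populate a newly allocated super block by mounting the dataset
 * described by the zfs_mnt_t.  Invoked from zpl_mount_impl() for a
 * super block freshly created by sget().
 */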
static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}
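
/*
 * Test callback passed to sget() to locate an existing super block for
 * the objset being mounted.  It runs under a spinlock, so the result
 * is rechecked in zpl_mount_impl() below.
 */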
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match.  Matching would imply a multimount of a dataset.  It
	 * is possible that during a multimount, there is a simultaneous
	 * operation that changes the z_os, e.g., rollback, where the match
	 * will be missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}
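
/*
 * Find or create the super block for the requested dataset.  A long
 * hold is kept on the objset across sget() so it cannot disappear, and
 * any existing zfsvfs is rechecked under zpl_enter() to catch a stale
 * z_os.
 */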
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it's under a spinlock
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount and the zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the ro check for snapshots, since a snapshot is
		 * always ro regardless of whether the ro flag was passed
		 * to mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}
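
/*
 * ->mount() entry point: wrap the mount arguments in a zfs_mnt_t and
 * return the root dentry of the resulting super block.
 */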
static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}
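
/*
 * Tear down the super block.  zfs_preumount() handles any ZFS-specific
 * cleanup which must precede the generic kill_anon_super().
 */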
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
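
/*
 * Prune up to 'nr_to_scan' cached dentries and inodes from the given
 * super block.  This is invoked asynchronously in response to memory
 * pressure, so the super block may already be midway through umount;
 * see the s_active check below.
 */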
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * deactivate_locked_super() calls shrinker_free() and only then the
	 * sops->kill_sb() callback, resulting in a use-after-free on umount
	 * when zpl_prune_sb() tries to reach the shrinker functions of an
	 * in-umount dataset.  Increment s_active if it is not zero, but
	 * don't prune if it is - a umount could be underway.
	 */
	if (atomic_inc_not_zero(&sb->s_active)) {
		(void) -zfs_prune(sb, nr_to_scan, &objects);
		atomic_dec(&sb->s_active);
	}
}
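
/*
 * Super block operations table wiring the zpl_* callbacks into the VFS.
 */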
const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
};
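
/*
 * Filesystem type registered with the kernel.  Idmapped mount support
 * is only advertised when the kernel provides the idmapped mount API.
 */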
struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};