/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 */

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
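
/*
 * Allocate a new inode for the VFS.  The znode and its embedded inode
 * are allocated together by zfs_inode_alloc(), and the inode version
 * counter is initialized before the inode is handed back.
 */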
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}
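
/*
 * Free an inode allocated by zpl_inode_alloc().  By the time the VFS
 * invokes this callback the inode must no longer be referenced.
 */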
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache.  If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
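
/*
 * Release the filesystem at umount.  ->put_super() returns void, so an
 * error from zfs_umount() can only be asserted on, not propagated.
 */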
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}
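
/*
 * Write back dirty filesystem state.  When 'wait' is set the VFS
 * requires the sync to complete before this callback returns.
 */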
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}
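
/*
 * Report filesystem statistics.  The heavy lifting is done by
 * zfs_statvfs(); the remainder adapts the results for 32-bit callers.
 */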
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
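
/*
 * Handle 'mount -o remount' by reapplying the mount options.  Only the
 * option string is passed down; the dataset name is left unchanged.
 */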
static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}
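
/*
 * Emit the dataset name used as the device name in /proc/self/mounts.
 */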
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;

	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}
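
/*
 * Emit the per-mount options (xattr, ACL type, case sensitivity)
 * reported in /proc/self/mounts.
 */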
static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}
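
/*
 * Populate a newly allocated super block by mounting the dataset
 * described by the zfs_mnt_t.  Invoked from zpl_mount_impl() for a
 * super block freshly created by sget().
 */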
static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}
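
/*
 * Test callback passed to sget() to locate an existing super block for
 * the objset being mounted.  It runs under a spinlock, so the result
 * is rechecked in zpl_mount_impl() below.
 */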
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match.  Matching would imply a multimount of a dataset.  It
	 * is possible that during a multimount, there is a simultaneous
	 * operation that changes the z_os, e.g., rollback, where the match
	 * will be missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}
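
/*
 * Find or create the super block for the requested dataset.  A long
 * hold is kept on the objset across sget() so it cannot disappear, and
 * any existing zfsvfs is rechecked under zpl_enter() to catch a stale
 * z_os.
 */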
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it's under a spinlock
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount and the zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the ro check for snapshots, since a snapshot is
		 * always ro regardless of whether the ro flag was passed
		 * to mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}
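
/*
 * ->mount() entry point: wrap the mount arguments in a zfs_mnt_t and
 * return the root dentry of the resulting super block.
 */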
static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}
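
/*
 * Tear down the super block.  zfs_preumount() handles any ZFS-specific
 * cleanup which must precede the generic kill_anon_super().
 */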
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
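
/*
 * Prune up to 'nr_to_scan' cached dentries and inodes from the given
 * super block.  This is invoked asynchronously in response to memory
 * pressure, so the super block may already be midway through umount;
 * see the s_active check below.
 */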
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * deactivate_locked_super() calls shrinker_free() and only then the
	 * sops->kill_sb() callback, resulting in a use-after-free on umount
	 * when zpl_prune_sb() tries to reach the shrinker functions of an
	 * in-umount dataset.  Increment s_active if it is not zero, but
	 * don't prune if it is - a umount could be underway.
	 */
	if (atomic_inc_not_zero(&sb->s_active)) {
		(void) -zfs_prune(sb, nr_to_scan, &objects);
		atomic_dec(&sb->s_active);
	}
}
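
/*
 * Super block operations table wiring the zpl_* callbacks into the VFS.
 */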
const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
};
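
/*
 * Filesystem type registered with the kernel.  Idmapped mount support
 * is only advertised when the kernel provides the idmapped mount API.
 */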
struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};