module/os/linux/zfs/zfs_znode_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/time.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/mntent.h>
  33 #include <sys/u8_textprep.h>
  34 #include <sys/dsl_dataset.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vnode.h>
  37 #include <sys/file.h>
  38 #include <sys/kmem.h>
  39 #include <sys/errno.h>
  40 #include <sys/atomic.h>
  41 #include <sys/zfs_dir.h>
  42 #include <sys/zfs_acl.h>
  43 #include <sys/zfs_ioctl.h>
  44 #include <sys/zfs_rlock.h>
  45 #include <sys/zfs_fuid.h>
  46 #include <sys/zfs_vnops.h>
  47 #include <sys/zfs_ctldir.h>
  48 #include <sys/dnode.h>
  49 #include <sys/fs/zfs.h>
  50 #include <sys/zpl.h>
  51 #include <sys/dmu.h>
  52 #include <sys/dmu_objset.h>
  53 #include <sys/dmu_tx.h>
  54 #include <sys/zfs_refcount.h>
  55 #include <sys/stat.h>
  56 #include <sys/zap.h>
  57 #include <sys/zfs_znode.h>
  58 #include <sys/sa.h>
  59 #include <sys/zfs_sa.h>
  60 #include <sys/zfs_stat.h>
  61 #include <linux/mm_compat.h>
  62
  63 #include "zfs_prop.h"
  64 #include "zfs_comutil.h"
  65
  66 static kmem_cache_t *znode_cache = NULL;
  67 static kmem_cache_t *znode_hold_cache = NULL;
  68 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
  69
  70 /*
  71  * This is used by the test suite so that it can delay znodes from being
  72  * freed in order to inspect the unlinked set.
  73  */
  74 static int zfs_unlink_suspend_progress = 0;
  75
  76 /*
  77  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
  78  * z_rangelock. It will modify the offset and length of the lock to reflect
  79  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
  80  * called with the rangelock_t's rl_lock held, which avoids races.
  81  */
  82 static void
  83 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
  84 {
  85         znode_t *zp = arg;
  86
  87         /*
  88          * If in append mode, convert to writer and lock starting at the
  89          * current end of file.
  90          */
  91         if (new->lr_type == RL_APPEND) {
  92                 new->lr_offset = zp->z_size;
  93                 new->lr_type = RL_WRITER;
  94         }
  95
  96         /*
  97          * If we need to grow the block size then lock the whole file range.
  98          */
  99         uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
 100         if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
 101             zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
 102                 new->lr_offset = 0;
 103                 new->lr_length = UINT64_MAX;
 104         }
 105 }
 106
 107 static int
 108 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 109 {
 110         (void) arg, (void) kmflags;
 111         znode_t *zp = buf;
 112
 113         inode_init_once(ZTOI(zp));
 114         list_link_init(&zp->z_link_node);
 115
 116         mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 117         rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 118         rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
 119         mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 120         rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 121
 122         zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
 123
 124         zp->z_dirlocks = NULL;
 125         zp->z_acl_cached = NULL;
 126         zp->z_xattr_cached = NULL;
 127         zp->z_xattr_parent = 0;
 128         zp->z_sync_writes_cnt = 0;
 129         zp->z_async_writes_cnt = 0;
 130
 131         return (0);
 132 }
 133
 134 static void
 135 zfs_znode_cache_destructor(void *buf, void *arg)
 136 {
 137         (void) arg;
 138         znode_t *zp = buf;
 139
 140         ASSERT(!list_link_active(&zp->z_link_node));
 141         mutex_destroy(&zp->z_lock);
 142         rw_destroy(&zp->z_parent_lock);
 143         rw_destroy(&zp->z_name_lock);
 144         mutex_destroy(&zp->z_acl_lock);
 145         rw_destroy(&zp->z_xattr_lock);
 146         zfs_rangelock_fini(&zp->z_rangelock);
 147
 148         ASSERT3P(zp->z_dirlocks, ==, NULL);
 149         ASSERT3P(zp->z_acl_cached, ==, NULL);
 150         ASSERT3P(zp->z_xattr_cached, ==, NULL);
 151
 152         ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
 153         ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 154 }
 155
 156 static int
 157 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
 158 {
 159         (void) arg, (void) kmflags;
 160         znode_hold_t *zh = buf;
 161
 162         mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
 163         zh->zh_refcount = 0;
 164
 165         return (0);
 166 }
 167
 168 static void
 169 zfs_znode_hold_cache_destructor(void *buf, void *arg)
 170 {
 171         (void) arg;
 172         znode_hold_t *zh = buf;
 173
 174         mutex_destroy(&zh->zh_lock);
 175 }
 176
 177 void
 178 zfs_znode_init(void)
 179 {
 180         /*
 181          * Initialize zcache.  The KMC_SLAB hint is used in order that it be
 182          * backed by kmalloc() when on the Linux slab in order that any
 183          * wait_on_bit() operations on the related inode operate properly.
 184          */
 185         ASSERT(znode_cache == NULL);
 186         znode_cache = kmem_cache_create("zfs_znode_cache",
 187             sizeof (znode_t), 0, zfs_znode_cache_constructor,
 188             zfs_znode_cache_destructor, NULL, NULL, NULL,
 189             KMC_SLAB | KMC_RECLAIMABLE);
 190
 191         ASSERT(znode_hold_cache == NULL);
 192         znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
 193             sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
 194             zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
 195 }
 196
 197 void
 198 zfs_znode_fini(void)
 199 {
 200         /*
 201          * Cleanup zcache
 202          */
 203         if (znode_cache)
 204                 kmem_cache_destroy(znode_cache);
 205         znode_cache = NULL;
 206
 207         if (znode_hold_cache)
 208                 kmem_cache_destroy(znode_hold_cache);
 209         znode_hold_cache = NULL;
 210 }
 211
/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed.  This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the z_hold_locks.
 *    This ensures evict(), which can be called from direct memory reclaim, will
 *    never block waiting on a z_hold_locks entry which just happens to have
 *    hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and never
 *    shared.  This minimizes lock contention without creating a large number
 *    of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short-lived this cost is minimal.
 */
 240 int
 241 zfs_znode_hold_compare(const void *a, const void *b)
 242 {
 243         const znode_hold_t *zh_a = (const znode_hold_t *)a;
 244         const znode_hold_t *zh_b = (const znode_hold_t *)b;
 245
 246         return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
 247 }
 248
 249 static boolean_t __maybe_unused
 250 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
 251 {
 252         znode_hold_t *zh, search;
 253         int i = ZFS_OBJ_HASH(zfsvfs, obj);
 254         boolean_t held;
 255
 256         search.zh_obj = obj;
 257
 258         mutex_enter(&zfsvfs->z_hold_locks[i]);
 259         zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
 260         held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
 261         mutex_exit(&zfsvfs->z_hold_locks[i]);
 262
 263         return (held);
 264 }
 265
 266 znode_hold_t *
 267 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
 268 {
 269         znode_hold_t *zh, *zh_new, search;
 270         int i = ZFS_OBJ_HASH(zfsvfs, obj);
 271         boolean_t found = B_FALSE;
 272
 273         zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
 274         search.zh_obj = obj;
 275
 276         mutex_enter(&zfsvfs->z_hold_locks[i]);
 277         zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
 278         if (likely(zh == NULL)) {
 279                 zh = zh_new;
 280                 zh->zh_obj = obj;
 281                 avl_add(&zfsvfs->z_hold_trees[i], zh);
 282         } else {
 283                 ASSERT3U(zh->zh_obj, ==, obj);
 284                 found = B_TRUE;
 285         }
 286         zh->zh_refcount++;
 287         ASSERT3S(zh->zh_refcount, >, 0);
 288         mutex_exit(&zfsvfs->z_hold_locks[i]);
 289
 290         if (found == B_TRUE)
 291                 kmem_cache_free(znode_hold_cache, zh_new);
 292
 293         ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
 294         mutex_enter(&zh->zh_lock);
 295
 296         return (zh);
 297 }
 298
 299 void
 300 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
 301 {
 302         int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
 303         boolean_t remove = B_FALSE;
 304
 305         ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
 306         mutex_exit(&zh->zh_lock);
 307
 308         mutex_enter(&zfsvfs->z_hold_locks[i]);
 309         ASSERT3S(zh->zh_refcount, >, 0);
 310         if (--zh->zh_refcount == 0) {
 311                 avl_remove(&zfsvfs->z_hold_trees[i], zh);
 312                 remove = B_TRUE;
 313         }
 314         mutex_exit(&zfsvfs->z_hold_locks[i]);
 315
 316         if (remove == B_TRUE)
 317                 kmem_cache_free(znode_hold_cache, zh);
 318 }
 319
 320 dev_t
 321 zfs_cmpldev(uint64_t dev)
 322 {
 323         return (dev);
 324 }
 325
 326 static void
 327 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
 328     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 329 {
 330         ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
 331
 332         mutex_enter(&zp->z_lock);
 333
 334         ASSERT(zp->z_sa_hdl == NULL);
 335         ASSERT(zp->z_acl_cached == NULL);
 336         if (sa_hdl == NULL) {
 337                 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 338                     SA_HDL_SHARED, &zp->z_sa_hdl));
 339         } else {
 340                 zp->z_sa_hdl = sa_hdl;
 341                 sa_set_userp(sa_hdl, zp);
 342         }
 343
 344         zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 345
 346         mutex_exit(&zp->z_lock);
 347 }
 348
/*
 * Detach and destroy the SA handle of a znode.
 *
 * The caller is expected to hold either the znode hold for zp->z_id or
 * z_teardown_inactive_lock as writer; this is asserted.  On return
 * zp->z_sa_hdl is NULL.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	/* Destroy first, then clear the now stale pointer. */
	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}
 358
 359 /*
 360  * Called by new_inode() to allocate a new inode.
 361  */
 362 int
 363 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 364 {
 365         znode_t *zp;
 366
 367         zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 368         *ip = ZTOI(zp);
 369
 370         return (0);
 371 }
 372
 373 /*
 374  * Called in multiple places when an inode should be destroyed.
 375  */
 376 void
 377 zfs_inode_destroy(struct inode *ip)
 378 {
 379         znode_t *zp = ITOZ(ip);
 380         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 381
 382         mutex_enter(&zfsvfs->z_znodes_lock);
 383         if (list_link_active(&zp->z_link_node)) {
 384                 list_remove(&zfsvfs->z_all_znodes, zp);
 385         }
 386         mutex_exit(&zfsvfs->z_znodes_lock);
 387
 388         if (zp->z_acl_cached) {
 389                 zfs_acl_free(zp->z_acl_cached);
 390                 zp->z_acl_cached = NULL;
 391         }
 392
 393         if (zp->z_xattr_cached) {
 394                 nvlist_free(zp->z_xattr_cached);
 395                 zp->z_xattr_cached = NULL;
 396         }
 397
 398         kmem_cache_free(znode_cache, zp);
 399 }
 400
 401 static void
 402 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
 403 {
 404         uint64_t rdev = 0;
 405
 406         switch (ip->i_mode & S_IFMT) {
 407         case S_IFREG:
 408                 ip->i_op = &zpl_inode_operations;
 409                 ip->i_fop = &zpl_file_operations;
 410                 ip->i_mapping->a_ops = &zpl_address_space_operations;
 411                 break;
 412
 413         case S_IFDIR:
 414                 ip->i_op = &zpl_dir_inode_operations;
 415                 ip->i_fop = &zpl_dir_file_operations;
 416                 ITOZ(ip)->z_zn_prefetch = B_TRUE;
 417                 break;
 418
 419         case S_IFLNK:
 420                 ip->i_op = &zpl_symlink_inode_operations;
 421                 break;
 422
 423         /*
 424          * rdev is only stored in a SA only for device files.
 425          */
 426         case S_IFCHR:
 427         case S_IFBLK:
 428                 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
 429                     sizeof (rdev));
 430                 zfs_fallthrough;
 431         case S_IFIFO:
 432         case S_IFSOCK:
 433                 init_special_inode(ip, ip->i_mode, rdev);
 434                 ip->i_op = &zpl_special_inode_operations;
 435                 break;
 436
 437         default:
 438                 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
 439                     (u_longlong_t)ip->i_ino, ip->i_mode);
 440
 441                 /* Assume the inode is a file and attempt to continue */
 442                 ip->i_mode = S_IFREG | 0644;
 443                 ip->i_op = &zpl_inode_operations;
 444                 ip->i_fop = &zpl_file_operations;
 445                 ip->i_mapping->a_ops = &zpl_address_space_operations;
 446                 break;
 447         }
 448 }
 449
 450 static void
 451 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
 452 {
 453         /*
 454          * Linux and Solaris have different sets of file attributes, so we
 455          * restrict this conversion to the intersection of the two.
 456          */
 457         unsigned int flags = 0;
 458         if (zp->z_pflags & ZFS_IMMUTABLE)
 459                 flags |= S_IMMUTABLE;
 460         if (zp->z_pflags & ZFS_APPENDONLY)
 461                 flags |= S_APPEND;
 462
 463         inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
 464 }
 465
 466 /*
 467  * Update the embedded inode given the znode.
 468  */
 469 void
 470 zfs_znode_update_vfs(znode_t *zp)
 471 {
 472         struct inode    *ip;
 473         uint32_t        blksize;
 474         u_longlong_t    i_blocks;
 475
 476         ASSERT(zp != NULL);
 477         ip = ZTOI(zp);
 478
 479         /* Skip .zfs control nodes which do not exist on disk. */
 480         if (zfsctl_is_node(ip))
 481                 return;
 482
 483         dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
 484
 485         spin_lock(&ip->i_lock);
 486         ip->i_mode = zp->z_mode;
 487         ip->i_blocks = i_blocks;
 488         i_size_write(ip, zp->z_size);
 489         spin_unlock(&ip->i_lock);
 490 }
 491
 492
/*
 * Construct a znode+inode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 *	IN:	zfsvfs	 - file system the object belongs to
 *		db	 - bonus buffer of the object; db->db_object
 *			   becomes the znode id and inode number
 *		blksz	 - value stored in zp->z_blksz
 *		obj_type - bonus type, passed on to zfs_znode_sa_init()
 *		hdl	 - existing SA handle to adopt, or NULL to have
 *			   one created from db
 *
 *	RETURN:	the new znode, or NULL if no inode could be allocated or
 *		the object's attributes could not be read (including a
 *		generation of zero).
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t *zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	/*
	 * The znode comes from the object cache; everything the cache
	 * constructor does not set up is reset here.
	 */
	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/* Fetch all attributes needed to fill in the inode in one lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	/*
	 * Fail if the bulk lookup fails, if the generation is zero, or if
	 * the object is flagged as carrying a project ID (and project
	 * quotas are enabled) but that ID cannot be read.  A handle is
	 * destroyed here only when it was created on our behalf
	 * (hdl == NULL); a caller supplied handle is left alone.
	 */
	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	/* projid keeps ZFS_DEFAULT_PROJID unless it was looked up above. */
	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	/* The creation time is kept in the znode, not in the inode. */
	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed.  This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	/* Pairs with insert_inode_locked() above; skipped when unhashed. */
	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	/* Drop the reference obtained from new_inode(). */
	iput(ip);
	return (NULL);
}
 621
 622 /*
 623  * Safely mark an inode dirty.  Inodes which are part of a read-only
 624  * file system or snapshot may not be dirtied.
 625  */
 626 void
 627 zfs_mark_inode_dirty(struct inode *ip)
 628 {
 629         zfsvfs_t *zfsvfs = ITOZSB(ip);
 630
 631         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 632                 return;
 633
 634         mark_inode_dirty(ip);
 635 }
 636
/*
 * Filler objects handed to SA_ADD_BULK_ATTR() by zfs_mknode() when an
 * object is laid out in the old DMU_OT_ZNODE format: they supply the
 * SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL attributes respectively.
 * Being uninitialized statics they start out zero-filled.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
 640 /*
 641  * Create a new DMU object to hold a zfs znode.
 642  *
 643  *      IN:     dzp     - parent directory for new znode
 644  *              vap     - file attributes for new znode
 645  *              tx      - dmu transaction id for zap operations
 646  *              cr      - credentials of caller
 647  *              flag    - flags:
 648  *                        IS_ROOT_NODE  - new object will be root
 649  *                        IS_TMPFILE    - new object is of O_TMPFILE
 650  *                        IS_XATTR      - new object is an attribute
 651  *              acl_ids - ACL related attributes
 652  *
 653  *      OUT:    zpp     - allocated znode (set to dzp if IS_ROOT_NODE)
 654  *
 655  */
 656 void
 657 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 658     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 659 {
 660         uint64_t        crtime[2], atime[2], mtime[2], ctime[2];
 661         uint64_t        mode, size, links, parent, pflags;
 662         uint64_t        projid = ZFS_DEFAULT_PROJID;
 663         uint64_t        rdev = 0;
 664         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 665         dmu_buf_t       *db;
 666         inode_timespec_t now;
 667         uint64_t        gen, obj;
 668         int             bonuslen;
 669         int             dnodesize;
 670         sa_handle_t     *sa_hdl;
 671         dmu_object_type_t obj_type;
 672         sa_bulk_attr_t  *sa_attrs;
 673         int             cnt = 0;
 674         zfs_acl_locator_cb_t locate = { 0 };
 675         znode_hold_t    *zh;
 676
 677         if (zfsvfs->z_replay) {
 678                 obj = vap->va_nodeid;
 679                 now = vap->va_ctime;            /* see zfs_replay_create() */
 680                 gen = vap->va_nblocks;          /* ditto */
 681                 dnodesize = vap->va_fsid;       /* ditto */
 682         } else {
 683                 obj = 0;
 684                 gethrestime(&now);
 685                 gen = dmu_tx_get_txg(tx);
 686                 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
 687         }
 688
 689         if (dnodesize == 0)
 690                 dnodesize = DNODE_MIN_SIZE;
 691
 692         obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
 693
 694         bonuslen = (obj_type == DMU_OT_SA) ?
 695             DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
 696
 697         /*
 698          * Create a new DMU object.
 699          */
 700         /*
 701          * There's currently no mechanism for pre-reading the blocks that will
 702          * be needed to allocate a new object, so we accept the small chance
 703          * that there will be an i/o error and we will fail one of the
 704          * assertions below.
 705          */
 706         if (S_ISDIR(vap->va_mode)) {
 707                 if (zfsvfs->z_replay) {
 708                         VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
 709                             zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 710                             obj_type, bonuslen, dnodesize, tx));
 711                 } else {
 712                         obj = zap_create_norm_dnsize(zfsvfs->z_os,
 713                             zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 714                             obj_type, bonuslen, dnodesize, tx);
 715                 }
 716         } else {
 717                 if (zfsvfs->z_replay) {
 718                         VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
 719                             DMU_OT_PLAIN_FILE_CONTENTS, 0,
 720                             obj_type, bonuslen, dnodesize, tx));
 721                 } else {
 722                         obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
 723                             DMU_OT_PLAIN_FILE_CONTENTS, 0,
 724                             obj_type, bonuslen, dnodesize, tx);
 725                 }
 726         }
 727
 728         zh = zfs_znode_hold_enter(zfsvfs, obj);
 729         VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 730
 731         /*
 732          * If this is the root, fix up the half-initialized parent pointer
 733          * to reference the just-allocated physical data area.
 734          */
 735         if (flag & IS_ROOT_NODE) {
 736                 dzp->z_id = obj;
 737         }
 738
 739         /*
 740          * If parent is an xattr, so am I.
 741          */
 742         if (dzp->z_pflags & ZFS_XATTR) {
 743                 flag |= IS_XATTR;
 744         }
 745
 746         if (zfsvfs->z_use_fuids)
 747                 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 748         else
 749                 pflags = 0;
 750
 751         if (S_ISDIR(vap->va_mode)) {
 752                 size = 2;               /* contents ("." and "..") */
 753                 links = 2;
 754         } else {
 755                 size = 0;
 756                 links = (flag & IS_TMPFILE) ? 0 : 1;
 757         }
 758
 759         if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
 760                 rdev = vap->va_rdev;
 761
 762         parent = dzp->z_id;
 763         mode = acl_ids->z_mode;
 764         if (flag & IS_XATTR)
 765                 pflags |= ZFS_XATTR;
 766
 767         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
 768                 /*
 769                  * With ZFS_PROJID flag, we can easily know whether there is
 770                  * project ID stored on disk or not. See zfs_space_delta_cb().
 771                  */
 772                 if (obj_type != DMU_OT_ZNODE &&
 773                     dmu_objset_projectquota_enabled(zfsvfs->z_os))
 774                         pflags |= ZFS_PROJID;
 775
 776                 /*
 777                  * Inherit project ID from parent if required.
 778                  */
 779                 projid = zfs_inherit_projid(dzp);
 780                 if (dzp->z_pflags & ZFS_PROJINHERIT)
 781                         pflags |= ZFS_PROJINHERIT;
 782         }
 783
 784         /*
 785          * No execs denied will be determined when zfs_mode_compute() is called.
 786          */
 787         pflags |= acl_ids->z_aclp->z_hints &
 788             (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
 789             ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 790
 791         ZFS_TIME_ENCODE(&now, crtime);
 792         ZFS_TIME_ENCODE(&now, ctime);
 793
 794         if (vap->va_mask & ATTR_ATIME) {
 795                 ZFS_TIME_ENCODE(&vap->va_atime, atime);
 796         } else {
 797                 ZFS_TIME_ENCODE(&now, atime);
 798         }
 799
 800         if (vap->va_mask & ATTR_MTIME) {
 801                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 802         } else {
 803                 ZFS_TIME_ENCODE(&now, mtime);
 804         }
 805
 806         /* Now add in all of the "SA" attributes */
 807         VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 808             &sa_hdl));
 809
 810         /*
 811          * Setup the array of attributes to be replaced/set on the new file
 812          *
 813          * order for  DMU_OT_ZNODE is critical since it needs to be constructed
 814          * in the old znode_phys_t format.  Don't change this ordering
 815          */
 816         sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
 817
 818         if (obj_type == DMU_OT_ZNODE) {
 819                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 820                     NULL, &atime, 16);
 821                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 822                     NULL, &mtime, 16);
 823                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 824                     NULL, &ctime, 16);
 825                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 826                     NULL, &crtime, 16);
 827                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 828                     NULL, &gen, 8);
 829                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 830                     NULL, &mode, 8);
 831                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 832                     NULL, &size, 8);
 833                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 834                     NULL, &parent, 8);
 835         } else {
 836                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 837                     NULL, &mode, 8);
 838                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 839                     NULL, &size, 8);
 840                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 841                     NULL, &gen, 8);
 842                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
 843                     NULL, &acl_ids->z_fuid, 8);
 844                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
 845                     NULL, &acl_ids->z_fgid, 8);
 846                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 847                     NULL, &parent, 8);
 848                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 849                     NULL, &pflags, 8);
 850                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 851                     NULL, &atime, 16);
 852                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 853                     NULL, &mtime, 16);
 854                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 855                     NULL, &ctime, 16);
 856                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 857                     NULL, &crtime, 16);
 858         }
 859
 860         SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 861
 862         if (obj_type == DMU_OT_ZNODE) {
 863                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
 864                     &empty_xattr, 8);
 865         } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
 866             pflags & ZFS_PROJID) {
 867                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
 868                     NULL, &projid, 8);
 869         }
 870         if (obj_type == DMU_OT_ZNODE ||
 871             (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
 872                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
 873                     NULL, &rdev, 8);
 874         }
 875         if (obj_type == DMU_OT_ZNODE) {
 876                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 877                     NULL, &pflags, 8);
 878                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 879                     &acl_ids->z_fuid, 8);
 880                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 881                     &acl_ids->z_fgid, 8);
 882                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
 883                     sizeof (uint64_t) * 4);
 884                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
 885                     &acl_phys, sizeof (zfs_acl_phys_t));
 886         } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
 887                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
 888                     &acl_ids->z_aclp->z_acl_count, 8);
 889                 locate.cb_aclp = acl_ids->z_aclp;
 890                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
 891                     zfs_acl_data_locator, &locate,
 892                     acl_ids->z_aclp->z_acl_bytes);
 893                 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
 894                     acl_ids->z_fuid, acl_ids->z_fgid);
 895         }
 896
 897         VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
 898
 899         if (!(flag & IS_ROOT_NODE)) {
 900                 /*
 901                  * The call to zfs_znode_alloc() may fail if memory is low
 902                  * via the call path: alloc_inode() -> inode_init_always() ->
 903                  * security_inode_alloc() -> inode_alloc_security().  Since
 904                  * the existing code is written such that zfs_mknode() can
 905                  * not fail retry until sufficient memory has been reclaimed.
 906                  */
 907                 do {
 908                         *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 909                 } while (*zpp == NULL);
 910
 911                 VERIFY(*zpp != NULL);
 912                 VERIFY(dzp != NULL);
 913         } else {
 914                 /*
 915                  * If we are creating the root node, the "parent" we
 916                  * passed in is the znode for the root.
 917                  */
 918                 *zpp = dzp;
 919
 920                 (*zpp)->z_sa_hdl = sa_hdl;
 921         }
 922
 923         (*zpp)->z_pflags = pflags;
 924         (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
 925         (*zpp)->z_dnodesize = dnodesize;
 926         (*zpp)->z_projid = projid;
 927
 928         if (obj_type == DMU_OT_ZNODE ||
 929             acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
 930                 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 931         }
 932         kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
 933         zfs_znode_hold_exit(zfsvfs, zh);
 934 }
 935
 936 /*
 937  * Update in-core attributes.  It is assumed the caller will be doing an
 938  * sa_bulk_update to push the changes out.
 939  */
 940 void
 941 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 942 {
 943         xoptattr_t *xoap;
 944         boolean_t update_inode = B_FALSE;
 945
 946         xoap = xva_getxoptattr(xvap);
 947         ASSERT(xoap);
 948
 949         if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 950                 uint64_t times[2];
 951                 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
 952                 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
 953                     &times, sizeof (times), tx);
 954                 XVA_SET_RTN(xvap, XAT_CREATETIME);
 955         }
 956         if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 957                 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
 958                     zp->z_pflags, tx);
 959                 XVA_SET_RTN(xvap, XAT_READONLY);
 960         }
 961         if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 962                 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
 963                     zp->z_pflags, tx);
 964                 XVA_SET_RTN(xvap, XAT_HIDDEN);
 965         }
 966         if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 967                 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
 968                     zp->z_pflags, tx);
 969                 XVA_SET_RTN(xvap, XAT_SYSTEM);
 970         }
 971         if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 972                 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
 973                     zp->z_pflags, tx);
 974                 XVA_SET_RTN(xvap, XAT_ARCHIVE);
 975         }
 976         if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 977                 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
 978                     zp->z_pflags, tx);
 979                 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 980
 981                 update_inode = B_TRUE;
 982         }
 983         if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 984                 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
 985                     zp->z_pflags, tx);
 986                 XVA_SET_RTN(xvap, XAT_NOUNLINK);
 987         }
 988         if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 989                 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
 990                     zp->z_pflags, tx);
 991                 XVA_SET_RTN(xvap, XAT_APPENDONLY);
 992
 993                 update_inode = B_TRUE;
 994         }
 995         if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 996                 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
 997                     zp->z_pflags, tx);
 998                 XVA_SET_RTN(xvap, XAT_NODUMP);
 999         }
1000         if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1001                 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1002                     zp->z_pflags, tx);
1003                 XVA_SET_RTN(xvap, XAT_OPAQUE);
1004         }
1005         if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1006                 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1007                     xoap->xoa_av_quarantined, zp->z_pflags, tx);
1008                 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1009         }
1010         if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1011                 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1012                     zp->z_pflags, tx);
1013                 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1014         }
1015         if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1016                 zfs_sa_set_scanstamp(zp, xvap, tx);
1017                 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1018         }
1019         if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1020                 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1021                     zp->z_pflags, tx);
1022                 XVA_SET_RTN(xvap, XAT_REPARSE);
1023         }
1024         if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1025                 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1026                     zp->z_pflags, tx);
1027                 XVA_SET_RTN(xvap, XAT_OFFLINE);
1028         }
1029         if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1030                 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1031                     zp->z_pflags, tx);
1032                 XVA_SET_RTN(xvap, XAT_SPARSE);
1033         }
1034         if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1035                 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1036                     zp->z_pflags, tx);
1037                 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1038         }
1039
1040         if (update_inode)
1041                 zfs_set_inode_flags(zp, ZTOI(zp));
1042 }
1043
1044 int
1045 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1046 {
1047         dmu_object_info_t doi;
1048         dmu_buf_t       *db;
1049         znode_t         *zp;
1050         znode_hold_t    *zh;
1051         int err;
1052         sa_handle_t     *hdl;
1053
1054         *zpp = NULL;
1055
1056 again:
1057         zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1058
1059         err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1060         if (err) {
1061                 zfs_znode_hold_exit(zfsvfs, zh);
1062                 return (err);
1063         }
1064
1065         dmu_object_info_from_db(db, &doi);
1066         if (doi.doi_bonus_type != DMU_OT_SA &&
1067             (doi.doi_bonus_type != DMU_OT_ZNODE ||
1068             (doi.doi_bonus_type == DMU_OT_ZNODE &&
1069             doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1070                 sa_buf_rele(db, NULL);
1071                 zfs_znode_hold_exit(zfsvfs, zh);
1072                 return (SET_ERROR(EINVAL));
1073         }
1074
1075         hdl = dmu_buf_get_user(db);
1076         if (hdl != NULL) {
1077                 zp = sa_get_userdata(hdl);
1078
1079
1080                 /*
1081                  * Since "SA" does immediate eviction we
1082                  * should never find a sa handle that doesn't
1083                  * know about the znode.
1084                  */
1085
1086                 ASSERT3P(zp, !=, NULL);
1087
1088                 mutex_enter(&zp->z_lock);
1089                 ASSERT3U(zp->z_id, ==, obj_num);
1090                 /*
1091                  * If zp->z_unlinked is set, the znode is already marked
1092                  * for deletion and should not be discovered. Check this
1093                  * after checking igrab() due to fsetxattr() & O_TMPFILE.
1094                  *
1095                  * If igrab() returns NULL the VFS has independently
1096                  * determined the inode should be evicted and has
1097                  * called iput_final() to start the eviction process.
1098                  * The SA handle is still valid but because the VFS
1099                  * requires that the eviction succeed we must drop
1100                  * our locks and references to allow the eviction to
1101                  * complete.  The zfs_zget() may then be retried.
1102                  *
1103                  * This unlikely case could be optimized by registering
1104                  * a sops->drop_inode() callback.  The callback would
1105                  * need to detect the active SA hold thereby informing
1106                  * the VFS that this inode should not be evicted.
1107                  */
1108                 if (igrab(ZTOI(zp)) == NULL) {
1109                         if (zp->z_unlinked)
1110                                 err = SET_ERROR(ENOENT);
1111                         else
1112                                 err = SET_ERROR(EAGAIN);
1113                 } else {
1114                         *zpp = zp;
1115                         err = 0;
1116                 }
1117
1118                 mutex_exit(&zp->z_lock);
1119                 sa_buf_rele(db, NULL);
1120                 zfs_znode_hold_exit(zfsvfs, zh);
1121
1122                 if (err == EAGAIN) {
1123                         /* inode might need this to finish evict */
1124                         cond_resched();
1125                         goto again;
1126                 }
1127                 return (err);
1128         }
1129
1130         /*
1131          * Not found create new znode/vnode but only if file exists.
1132          *
1133          * There is a small window where zfs_vget() could
1134          * find this object while a file create is still in
1135          * progress.  This is checked for in zfs_znode_alloc()
1136          *
1137          * if zfs_znode_alloc() fails it will drop the hold on the
1138          * bonus buffer.
1139          */
1140         zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1141             doi.doi_bonus_type, NULL);
1142         if (zp == NULL) {
1143                 err = SET_ERROR(ENOENT);
1144         } else {
1145                 *zpp = zp;
1146         }
1147         zfs_znode_hold_exit(zfsvfs, zh);
1148         return (err);
1149 }
1150
1151 int
1152 zfs_rezget(znode_t *zp)
1153 {
1154         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1155         dmu_object_info_t doi;
1156         dmu_buf_t *db;
1157         uint64_t obj_num = zp->z_id;
1158         uint64_t mode;
1159         uint64_t links;
1160         sa_bulk_attr_t bulk[11];
1161         int err;
1162         int count = 0;
1163         uint64_t gen;
1164         uint64_t z_uid, z_gid;
1165         uint64_t atime[2], mtime[2], ctime[2], btime[2];
1166         inode_timespec_t tmp_ts;
1167         uint64_t projid = ZFS_DEFAULT_PROJID;
1168         znode_hold_t *zh;
1169
1170         /*
1171          * skip ctldir, otherwise they will always get invalidated. This will
1172          * cause funny behaviour for the mounted snapdirs. Especially for
1173          * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1174          * anyone automount it again as long as someone is still using the
1175          * detached mount.
1176          */
1177         if (zp->z_is_ctldir)
1178                 return (0);
1179
1180         zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1181
1182         mutex_enter(&zp->z_acl_lock);
1183         if (zp->z_acl_cached) {
1184                 zfs_acl_free(zp->z_acl_cached);
1185                 zp->z_acl_cached = NULL;
1186         }
1187         mutex_exit(&zp->z_acl_lock);
1188
1189         rw_enter(&zp->z_xattr_lock, RW_WRITER);
1190         if (zp->z_xattr_cached) {
1191                 nvlist_free(zp->z_xattr_cached);
1192                 zp->z_xattr_cached = NULL;
1193         }
1194         rw_exit(&zp->z_xattr_lock);
1195
1196         ASSERT(zp->z_sa_hdl == NULL);
1197         err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1198         if (err) {
1199                 zfs_znode_hold_exit(zfsvfs, zh);
1200                 return (err);
1201         }
1202
1203         dmu_object_info_from_db(db, &doi);
1204         if (doi.doi_bonus_type != DMU_OT_SA &&
1205             (doi.doi_bonus_type != DMU_OT_ZNODE ||
1206             (doi.doi_bonus_type == DMU_OT_ZNODE &&
1207             doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1208                 sa_buf_rele(db, NULL);
1209                 zfs_znode_hold_exit(zfsvfs, zh);
1210                 return (SET_ERROR(EINVAL));
1211         }
1212
1213         zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1214
1215         /* reload cached values */
1216         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1217             &gen, sizeof (gen));
1218         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1219             &zp->z_size, sizeof (zp->z_size));
1220         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1221             &links, sizeof (links));
1222         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1223             &zp->z_pflags, sizeof (zp->z_pflags));
1224         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1225             &z_uid, sizeof (z_uid));
1226         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1227             &z_gid, sizeof (z_gid));
1228         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1229             &mode, sizeof (mode));
1230         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1231             &atime, 16);
1232         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1233             &mtime, 16);
1234         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1235             &ctime, 16);
1236         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
1237
1238         if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1239                 zfs_znode_dmu_fini(zp);
1240                 zfs_znode_hold_exit(zfsvfs, zh);
1241                 return (SET_ERROR(EIO));
1242         }
1243
1244         if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1245                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1246                     &projid, 8);
1247                 if (err != 0 && err != ENOENT) {
1248                         zfs_znode_dmu_fini(zp);
1249                         zfs_znode_hold_exit(zfsvfs, zh);
1250                         return (SET_ERROR(err));
1251                 }
1252         }
1253
1254         zp->z_projid = projid;
1255         zp->z_mode = ZTOI(zp)->i_mode = mode;
1256         zfs_uid_write(ZTOI(zp), z_uid);
1257         zfs_gid_write(ZTOI(zp), z_gid);
1258
1259         ZFS_TIME_DECODE(&tmp_ts, atime);
1260         zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
1261         ZFS_TIME_DECODE(&tmp_ts, mtime);
1262         zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
1263         ZFS_TIME_DECODE(&tmp_ts, ctime);
1264         zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
1265         ZFS_TIME_DECODE(&zp->z_btime, btime);
1266
1267         if ((uint32_t)gen != ZTOI(zp)->i_generation) {
1268                 zfs_znode_dmu_fini(zp);
1269                 zfs_znode_hold_exit(zfsvfs, zh);
1270                 return (SET_ERROR(EIO));
1271         }
1272
1273         set_nlink(ZTOI(zp), (uint32_t)links);
1274         zfs_set_inode_flags(zp, ZTOI(zp));
1275
1276         zp->z_blksz = doi.doi_data_block_size;
1277         zp->z_atime_dirty = B_FALSE;
1278         zfs_znode_update_vfs(zp);
1279
1280         /*
1281          * If the file has zero links, then it has been unlinked on the send
1282          * side and it must be in the received unlinked set.
1283          * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1284          * stale data and to prevent automatic removal of the file in
1285          * zfs_zinactive().  The file will be removed either when it is removed
1286          * on the send side and the next incremental stream is received or
1287          * when the unlinked set gets processed.
1288          */
1289         zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1290         if (zp->z_unlinked)
1291                 zfs_znode_dmu_fini(zp);
1292
1293         zfs_znode_hold_exit(zfsvfs, zh);
1294
1295         return (0);
1296 }
1297
/*
 * Free the DMU object(s) backing a znode as part of transaction tx.
 *
 *	IN:	zp	- znode whose object is to be freed.
 *		tx	- transaction passed to dmu_object_free().
 *
 * The znode's own object is always freed.  If zfs_external_acl() reports
 * a non-zero object number, that object is freed as well; this is only
 * expected for znodes that are not SA based (verified below).  Any
 * failure to free an object trips a VERIFY.
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	/* Hold the object number for the duration of the free. */
	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	/* Tear down the znode's DMU state now that the object is gone. */
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}
1316
1317 void
1318 zfs_zinactive(znode_t *zp)
1319 {
1320         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1321         uint64_t z_id = zp->z_id;
1322         znode_hold_t *zh;
1323
1324         ASSERT(zp->z_sa_hdl);
1325
1326         /*
1327          * Don't allow a zfs_zget() while were trying to release this znode.
1328          */
1329         zh = zfs_znode_hold_enter(zfsvfs, z_id);
1330
1331         mutex_enter(&zp->z_lock);
1332
1333         /*
1334          * If this was the last reference to a file with no links, remove
1335          * the file from the file system unless the file system is mounted
1336          * read-only.  That can happen, for example, if the file system was
1337          * originally read-write, the file was opened, then unlinked and
1338          * the file system was made read-only before the file was finally
1339          * closed.  The file will remain in the unlinked set.
1340          */
1341         if (zp->z_unlinked) {
1342                 ASSERT(!zfsvfs->z_issnap);
1343                 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
1344                         mutex_exit(&zp->z_lock);
1345                         zfs_znode_hold_exit(zfsvfs, zh);
1346                         zfs_rmnode(zp);
1347                         return;
1348                 }
1349         }
1350
1351         mutex_exit(&zp->z_lock);
1352         zfs_znode_dmu_fini(zp);
1353
1354         zfs_znode_hold_exit(zfsvfs, zh);
1355 }
1356
1357 /*
1358  * Determine whether the znode's atime must be updated.  The logic mostly
1359  * duplicates the Linux kernel's relatime_need_update() functionality.
1360  * This function is only called if the underlying filesystem actually has
1361  * atime updates enabled.
1362  */
1363 boolean_t
1364 zfs_relatime_need_update(const struct inode *ip)
1365 {
1366         inode_timespec_t now, tmp_atime, tmp_ts;
1367
1368         gethrestime(&now);
1369         tmp_atime = zpl_inode_get_atime(ip);
1370         /*
1371          * In relatime mode, only update the atime if the previous atime
1372          * is earlier than either the ctime or mtime or if at least a day
1373          * has passed since the last update of atime.
1374          */
1375         tmp_ts = zpl_inode_get_mtime(ip);
1376         if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
1377                 return (B_TRUE);
1378
1379         tmp_ts = zpl_inode_get_ctime(ip);
1380         if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
1381                 return (B_TRUE);
1382
1383         if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
1384                 return (B_TRUE);
1385
1386         return (B_FALSE);
1387 }
1388
1389 /*
1390  * Prepare to update znode time stamps.
1391  *
1392  *      IN:     zp      - znode requiring timestamp update
1393  *              flag    - ATTR_MTIME, ATTR_CTIME flags
1394  *
1395  *      OUT:    zp      - z_seq
1396  *              mtime   - new mtime
1397  *              ctime   - new ctime
1398  *
1399  *      Note: We don't update atime here, because we rely on Linux VFS to do
1400  *      atime updating.
1401  */
1402 void
1403 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1404     uint64_t ctime[2])
1405 {
1406         inode_timespec_t now, tmp_ts;
1407
1408         gethrestime(&now);
1409
1410         zp->z_seq++;
1411
1412         if (flag & ATTR_MTIME) {
1413                 ZFS_TIME_ENCODE(&now, mtime);
1414                 ZFS_TIME_DECODE(&tmp_ts, mtime);
1415                 zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
1416                 if (ZTOZSB(zp)->z_use_fuids) {
1417                         zp->z_pflags |= (ZFS_ARCHIVE |
1418                             ZFS_AV_MODIFIED);
1419                 }
1420         }
1421
1422         if (flag & ATTR_CTIME) {
1423                 ZFS_TIME_ENCODE(&now, ctime);
1424                 ZFS_TIME_DECODE(&tmp_ts, ctime);
1425                 zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
1426                 if (ZTOZSB(zp)->z_use_fuids)
1427                         zp->z_pflags |= ZFS_ARCHIVE;
1428         }
1429 }
1430
1431 /*
1432  * Grow the block size for a file.
1433  *
1434  *      IN:     zp      - znode of file to free data in.
1435  *              size    - requested block size
1436  *              tx      - open transaction.
1437  *
1438  * NOTE: this function assumes that the znode is write locked.
1439  */
1440 void
1441 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1442 {
1443         int             error;
1444         u_longlong_t    dummy;
1445
1446         if (size <= zp->z_blksz)
1447                 return;
1448         /*
1449          * If the file size is already greater than the current blocksize,
1450          * we will not grow.  If there is more than one block in a file,
1451          * the blocksize cannot change.
1452          */
1453         if (zp->z_blksz && zp->z_size > zp->z_blksz)
1454                 return;
1455
1456         error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1457             size, 0, tx);
1458
1459         if (error == ENOTSUP)
1460                 return;
1461         ASSERT0(error);
1462
1463         /* What blocksize did we actually get? */
1464         dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1465 }
1466
1467 /*
1468  * Increase the file length
1469  *
1470  *      IN:     zp      - znode of file to free data in.
1471  *              end     - new end-of-file
1472  *
1473  *      RETURN: 0 on success, error code on failure
1474  */
1475 static int
1476 zfs_extend(znode_t *zp, uint64_t end)
1477 {
1478         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1479         dmu_tx_t *tx;
1480         zfs_locked_range_t *lr;
1481         uint64_t newblksz;
1482         int error;
1483
1484         /*
1485          * We will change zp_size, lock the whole file.
1486          */
1487         lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1488
1489         /*
1490          * Nothing to do if file already at desired length.
1491          */
1492         if (end <= zp->z_size) {
1493                 zfs_rangelock_exit(lr);
1494                 return (0);
1495         }
1496         tx = dmu_tx_create(zfsvfs->z_os);
1497         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1498         zfs_sa_upgrade_txholds(tx, zp);
1499         if (end > zp->z_blksz &&
1500             (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1501                 /*
1502                  * We are growing the file past the current block size.
1503                  */
1504                 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1505                         /*
1506                          * File's blocksize is already larger than the
1507                          * "recordsize" property.  Only let it grow to
1508                          * the next power of 2.
1509                          */
1510                         ASSERT(!ISP2(zp->z_blksz));
1511                         newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1512                 } else {
1513                         newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1514                 }
1515                 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1516         } else {
1517                 newblksz = 0;
1518         }
1519
1520         error = dmu_tx_assign(tx, TXG_WAIT);
1521         if (error) {
1522                 dmu_tx_abort(tx);
1523                 zfs_rangelock_exit(lr);
1524                 return (error);
1525         }
1526
1527         if (newblksz)
1528                 zfs_grow_blocksize(zp, newblksz, tx);
1529
1530         zp->z_size = end;
1531
1532         VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1533             &zp->z_size, sizeof (zp->z_size), tx));
1534
1535         zfs_rangelock_exit(lr);
1536
1537         dmu_tx_commit(tx);
1538
1539         return (0);
1540 }
1541
/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.  The range must not cross a page boundary
 * (asserted below).  If the page is not present in the page cache,
 * nothing is done.
 *
 *	IN:	zp	- znode whose page cache is to be zeroed.
 *		start	- file offset of the first byte to zero.
 *		len	- number of bytes to zero.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	/* Split start into the in-page offset and the page-aligned base. */
	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	/* Only a page that is already cached is touched. */
	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		/* Map the page and clear the requested bytes. */
		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		if (!PagePrivate(pp)) {
			/*
			 * Set private bit so page migration will wait for us to
			 * finish writeback before calling migrate_folio().
			 *
			 * NOTE(review): the extra reference taken by
			 * get_page() is not dropped in this function;
			 * presumably it is released where the private bit
			 * is cleared -- confirm against the writeback path.
			 */
			SetPagePrivate(pp);
			get_page(pp);
		}
		unlock_page(pp);
		/* Drop the reference obtained from find_lock_page(). */
		put_page(pp);
	}
}
1592
1593 /*
1594  * Free space in a file.
1595  *
1596  *      IN:     zp      - znode of file to free data in.
1597  *              off     - start of section to free.
1598  *              len     - length of section to free.
1599  *
1600  *      RETURN: 0 on success, error code on failure
1601  */
1602 static int
1603 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1604 {
1605         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1606         zfs_locked_range_t *lr;
1607         int error;
1608
1609         /*
1610          * Lock the range being freed.
1611          */
1612         lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1613
1614         /*
1615          * Nothing to do if file already at desired length.
1616          */
1617         if (off >= zp->z_size) {
1618                 zfs_rangelock_exit(lr);
1619                 return (0);
1620         }
1621
1622         if (off + len > zp->z_size)
1623                 len = zp->z_size - off;
1624
1625         error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1626
1627         /*
1628          * Zero partial page cache entries.  This must be done under a
1629          * range lock in order to keep the ARC and page cache in sync.
1630          */
1631         if (zn_has_cached_data(zp, off, off + len - 1)) {
1632                 loff_t first_page, last_page, page_len;
1633                 loff_t first_page_offset, last_page_offset;
1634
1635                 /* first possible full page in hole */
1636                 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1637                 /* last page of hole */
1638                 last_page = (off + len) >> PAGE_SHIFT;
1639
1640                 /* offset of first_page */
1641                 first_page_offset = first_page << PAGE_SHIFT;
1642                 /* offset of last_page */
1643                 last_page_offset = last_page << PAGE_SHIFT;
1644
1645                 /* truncate whole pages */
1646                 if (last_page_offset > first_page_offset) {
1647                         truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1648                             first_page_offset, last_page_offset - 1);
1649                 }
1650
1651                 /* truncate sub-page ranges */
1652                 if (first_page > last_page) {
1653                         /* entire punched area within a single page */
1654                         zfs_zero_partial_page(zp, off, len);
1655                 } else {
1656                         /* beginning of punched area at the end of a page */
1657                         page_len  = first_page_offset - off;
1658                         if (page_len > 0)
1659                                 zfs_zero_partial_page(zp, off, page_len);
1660
1661                         /* end of punched area at the beginning of a page */
1662                         page_len = off + len - last_page_offset;
1663                         if (page_len > 0)
1664                                 zfs_zero_partial_page(zp, last_page_offset,
1665                                     page_len);
1666                 }
1667         }
1668         zfs_rangelock_exit(lr);
1669
1670         return (error);
1671 }
1672
1673 /*
1674  * Truncate a file
1675  *
1676  *      IN:     zp      - znode of file to free data in.
1677  *              end     - new end-of-file.
1678  *
1679  *      RETURN: 0 on success, error code on failure
1680  */
1681 static int
1682 zfs_trunc(znode_t *zp, uint64_t end)
1683 {
1684         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1685         dmu_tx_t *tx;
1686         zfs_locked_range_t *lr;
1687         int error;
1688         sa_bulk_attr_t bulk[2];
1689         int count = 0;
1690
1691         /*
1692          * We will change zp_size, lock the whole file.
1693          */
1694         lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1695
1696         /*
1697          * Nothing to do if file already at desired length.
1698          */
1699         if (end >= zp->z_size) {
1700                 zfs_rangelock_exit(lr);
1701                 return (0);
1702         }
1703
1704         error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1705             DMU_OBJECT_END);
1706         if (error) {
1707                 zfs_rangelock_exit(lr);
1708                 return (error);
1709         }
1710         tx = dmu_tx_create(zfsvfs->z_os);
1711         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1712         zfs_sa_upgrade_txholds(tx, zp);
1713         dmu_tx_mark_netfree(tx);
1714         error = dmu_tx_assign(tx, TXG_WAIT);
1715         if (error) {
1716                 dmu_tx_abort(tx);
1717                 zfs_rangelock_exit(lr);
1718                 return (error);
1719         }
1720
1721         zp->z_size = end;
1722         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1723             NULL, &zp->z_size, sizeof (zp->z_size));
1724
1725         if (end == 0) {
1726                 zp->z_pflags &= ~ZFS_SPARSE;
1727                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1728                     NULL, &zp->z_pflags, 8);
1729         }
1730         VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1731
1732         dmu_tx_commit(tx);
1733         zfs_rangelock_exit(lr);
1734
1735         return (0);
1736 }
1737
1738 /*
1739  * Free space in a file
1740  *
1741  *      IN:     zp      - znode of file to free data in.
1742  *              off     - start of range
1743  *              len     - end of range (0 => EOF)
1744  *              flag    - current file open mode flags.
1745  *              log     - TRUE if this action should be logged
1746  *
1747  *      RETURN: 0 on success, error code on failure
1748  */
1749 int
1750 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1751 {
1752         dmu_tx_t *tx;
1753         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1754         zilog_t *zilog = zfsvfs->z_log;
1755         uint64_t mode;
1756         uint64_t mtime[2], ctime[2];
1757         sa_bulk_attr_t bulk[3];
1758         int count = 0;
1759         int error;
1760
1761         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1762             sizeof (mode))) != 0)
1763                 return (error);
1764
1765         if (off > zp->z_size) {
1766                 error =  zfs_extend(zp, off+len);
1767                 if (error == 0 && log)
1768                         goto log;
1769                 goto out;
1770         }
1771
1772         if (len == 0) {
1773                 error = zfs_trunc(zp, off);
1774         } else {
1775                 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1776                     off + len > zp->z_size)
1777                         error = zfs_extend(zp, off+len);
1778         }
1779         if (error || !log)
1780                 goto out;
1781 log:
1782         tx = dmu_tx_create(zfsvfs->z_os);
1783         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1784         zfs_sa_upgrade_txholds(tx, zp);
1785         error = dmu_tx_assign(tx, TXG_WAIT);
1786         if (error) {
1787                 dmu_tx_abort(tx);
1788                 goto out;
1789         }
1790
1791         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1792         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1793         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1794             NULL, &zp->z_pflags, 8);
1795         zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1796         error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1797         ASSERT(error == 0);
1798
1799         zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1800
1801         dmu_tx_commit(tx);
1802
1803         zfs_znode_update_vfs(zp);
1804         error = 0;
1805
1806 out:
1807         /*
1808          * Truncate the page cache - for file truncate operations, use
1809          * the purpose-built API for truncations.  For punching operations,
1810          * the truncation is handled under a range lock in zfs_free_range.
1811          */
1812         if (len == 0)
1813                 truncate_setsize(ZTOI(zp), off);
1814         return (error);
1815 }
1816
1817 void
1818 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1819 {
1820         struct super_block *sb;
1821         zfsvfs_t        *zfsvfs;
1822         uint64_t        moid, obj, sa_obj, version;
1823         uint64_t        sense = ZFS_CASE_SENSITIVE;
1824         uint64_t        norm = 0;
1825         nvpair_t        *elem;
1826         int             size;
1827         int             error;
1828         int             i;
1829         znode_t         *rootzp = NULL;
1830         vattr_t         vattr;
1831         znode_t         *zp;
1832         zfs_acl_ids_t   acl_ids;
1833
1834         /*
1835          * First attempt to create master node.
1836          */
1837         /*
1838          * In an empty objset, there are no blocks to read and thus
1839          * there can be no i/o errors (which we assert below).
1840          */
1841         moid = MASTER_NODE_OBJ;
1842         error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1843             DMU_OT_NONE, 0, tx);
1844         ASSERT(error == 0);
1845
1846         /*
1847          * Set starting attributes.
1848          */
1849         version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1850         elem = NULL;
1851         while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1852                 /* For the moment we expect all zpl props to be uint64_ts */
1853                 uint64_t val;
1854                 const char *name;
1855
1856                 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1857                 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1858                 name = nvpair_name(elem);
1859                 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1860                         if (val < version)
1861                                 version = val;
1862                 } else {
1863                         error = zap_update(os, moid, name, 8, 1, &val, tx);
1864                 }
1865                 ASSERT(error == 0);
1866                 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1867                         norm = val;
1868                 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1869                         sense = val;
1870         }
1871         ASSERT(version != 0);
1872         error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1873         ASSERT(error == 0);
1874
1875         /*
1876          * Create zap object used for SA attribute registration
1877          */
1878
1879         if (version >= ZPL_VERSION_SA) {
1880                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1881                     DMU_OT_NONE, 0, tx);
1882                 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1883                 ASSERT(error == 0);
1884         } else {
1885                 sa_obj = 0;
1886         }
1887         /*
1888          * Create a delete queue.
1889          */
1890         obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1891
1892         error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1893         ASSERT(error == 0);
1894
1895         /*
1896          * Create root znode.  Create minimal znode/inode/zfsvfs/sb
1897          * to allow zfs_mknode to work.
1898          */
1899         vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1900         vattr.va_mode = S_IFDIR|0755;
1901         vattr.va_uid = crgetuid(cr);
1902         vattr.va_gid = crgetgid(cr);
1903
1904         rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1905         rootzp->z_unlinked = B_FALSE;
1906         rootzp->z_atime_dirty = B_FALSE;
1907         rootzp->z_is_sa = USE_SA(version, os);
1908         rootzp->z_pflags = 0;
1909
1910         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1911         zfsvfs->z_os = os;
1912         zfsvfs->z_parent = zfsvfs;
1913         zfsvfs->z_version = version;
1914         zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1915         zfsvfs->z_use_sa = USE_SA(version, os);
1916         zfsvfs->z_norm = norm;
1917
1918         sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1919         sb->s_fs_info = zfsvfs;
1920
1921         ZTOI(rootzp)->i_sb = sb;
1922
1923         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1924             &zfsvfs->z_attr_table);
1925
1926         ASSERT(error == 0);
1927
1928         /*
1929          * Fold case on file systems that are always or sometimes case
1930          * insensitive.
1931          */
1932         if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1933                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1934
1935         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1936         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1937             offsetof(znode_t, z_link_node));
1938
1939         size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
1940         zfsvfs->z_hold_size = size;
1941         zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1942             KM_SLEEP);
1943         zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1944         for (i = 0; i != size; i++) {
1945                 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
1946                     sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
1947                 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
1948         }
1949
1950         VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1951             cr, NULL, &acl_ids, zfs_init_idmap));
1952         zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1953         ASSERT3P(zp, ==, rootzp);
1954         error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1955         ASSERT(error == 0);
1956         zfs_acl_ids_free(&acl_ids);
1957
1958         atomic_set(&ZTOI(rootzp)->i_count, 0);
1959         sa_handle_destroy(rootzp->z_sa_hdl);
1960         kmem_cache_free(znode_cache, rootzp);
1961
1962         for (i = 0; i != size; i++) {
1963                 avl_destroy(&zfsvfs->z_hold_trees[i]);
1964                 mutex_destroy(&zfsvfs->z_hold_locks[i]);
1965         }
1966
1967         mutex_destroy(&zfsvfs->z_znodes_lock);
1968
1969         vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1970         vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
1971         kmem_free(sb, sizeof (struct super_block));
1972         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1973 }
1974
/*
 * Symbols made available to other kernel modules.
 */
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/*
 * Module parameters, both registered with permission bits 0644.
 * zfs_object_mutex_size feeds the znode hold array sizing done in
 * zfs_create_fs; zfs_unlink_suspend_progress is described as a debug
 * knob that prevents async unlinks.
 */
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");