module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
   84  *  (1) A check must be made in each zfs thread for a mounted file system.
   85  *      This is done, avoiding races, with zfs_enter(); a matching
   86  *      zfs_exit() is needed before all returns.  Any znodes must be
   87  *      checked with zfs_verify_zp().  The ops below call
   88  *      zfs_enter_verify_zp(); on failure (e.g. EIO) they return its error.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      zfs_enter(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              zfs_exit(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      zfs_exit(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177 int
 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 179 {
 180         (void) cr;
 181         znode_t *zp = ITOZ(ip);
 182         zfsvfs_t *zfsvfs = ITOZSB(ip);
 183         int error;
 184
 185         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 186                 return (error);
 187
 188         /* Honor ZFS_APPENDONLY file attribute */
 189         if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
 190             ((flag & O_APPEND) == 0)) {
 191                 zfs_exit(zfsvfs, FTAG);
 192                 return (SET_ERROR(EPERM));
 193         }
 194
 195         /* Keep a count of the synchronous opens in the znode */
 196         if (flag & O_SYNC)
 197                 atomic_inc_32(&zp->z_sync_cnt);
 198
 199         zfs_exit(zfsvfs, FTAG);
 200         return (0);
 201 }
 202
 203 int
 204 zfs_close(struct inode *ip, int flag, cred_t *cr)
 205 {
 206         (void) cr;
 207         znode_t *zp = ITOZ(ip);
 208         zfsvfs_t *zfsvfs = ITOZSB(ip);
 209         int error;
 210
 211         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 212                 return (error);
 213
 214         /* Decrement the synchronous opens in the znode */
 215         if (flag & O_SYNC)
 216                 atomic_dec_32(&zp->z_sync_cnt);
 217
 218         zfs_exit(zfsvfs, FTAG);
 219         return (0);
 220 }
 221
 222 #if defined(_KERNEL)
 223
      /*
       * Forward declaration: mappedread() below calls zfs_fillpage(), which
       * is defined elsewhere in this file.
       */
  224 static int zfs_fillpage(struct inode *ip, struct page *pp);
 225
 226 /*
 227  * When a file is memory mapped, we must keep the IO data synchronized
 228  * between the DMU cache and the memory mapped pages.  Update all mapped
 229  * pages with the contents of the coresponding dmu buffer.
 230  */
 231 void
 232 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 233 {
 234         struct address_space *mp = ZTOI(zp)->i_mapping;
 235         int64_t off = start & (PAGE_SIZE - 1);
 236
 237         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 238                 uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 239
 240                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 241                 if (pp) {
 242                         if (mapping_writably_mapped(mp))
 243                                 flush_dcache_page(pp);
 244
 245                         void *pb = kmap(pp);
 246                         int error = dmu_read(os, zp->z_id, start + off,
 247                             nbytes, pb + off, DMU_READ_PREFETCH);
 248                         kunmap(pp);
 249
 250                         if (error) {
 251                                 SetPageError(pp);
 252                                 ClearPageUptodate(pp);
 253                         } else {
 254                                 ClearPageError(pp);
 255                                 SetPageUptodate(pp);
 256
 257                                 if (mapping_writably_mapped(mp))
 258                                         flush_dcache_page(pp);
 259
 260                                 mark_page_accessed(pp);
 261                         }
 262
 263                         unlock_page(pp);
 264                         put_page(pp);
 265                 }
 266
 267                 len -= nbytes;
 268                 off = 0;
 269         }
 270 }
 271
 272 /*
 273  * When a file is memory mapped, we must keep the I/O data synchronized
 274  * between the DMU cache and the memory mapped pages.  Preferentially read
 275  * from memory mapped pages, otherwise fallback to reading through the dmu.
 276  */
 277 int
 278 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 279 {
 280         struct inode *ip = ZTOI(zp);
 281         struct address_space *mp = ip->i_mapping;
 282         int64_t start = uio->uio_loffset;
 283         int64_t off = start & (PAGE_SIZE - 1);
 284         int len = nbytes;
 285         int error = 0;
 286
 287         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 288                 uint64_t bytes = MIN(PAGE_SIZE - off, len);
 289
 290                 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 291                 if (pp) {
 292                         /*
 293                          * If filemap_fault() retries there exists a window
 294                          * where the page will be unlocked and not up to date.
 295                          * In this case we must try and fill the page.
 296                          */
 297                         if (unlikely(!PageUptodate(pp))) {
 298                                 error = zfs_fillpage(ip, pp);
 299                                 if (error) {
 300                                         unlock_page(pp);
 301                                         put_page(pp);
 302                                         return (error);
 303                                 }
 304                         }
 305
 306                         ASSERT(PageUptodate(pp) || PageDirty(pp));
 307
 308                         unlock_page(pp);
 309
 310                         void *pb = kmap(pp);
 311                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 312                         kunmap(pp);
 313
 314                         if (mapping_writably_mapped(mp))
 315                                 flush_dcache_page(pp);
 316
 317                         mark_page_accessed(pp);
 318                         put_page(pp);
 319                 } else {
 320                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 321                             uio, bytes);
 322                 }
 323
 324                 len -= bytes;
 325                 off = 0;
 326
 327                 if (error)
 328                         break;
 329         }
 330
 331         return (error);
 332 }
 333 #endif /* _KERNEL */
 334
      /*
       * Initialized to DMU_MAX_DELETEBLKCNT.  Not referenced in this part of
       * the file; presumably a limit on the number of blocks freed at once --
       * TODO confirm against its users.
       */
  335 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 336
 337 /*
 338  * Write the bytes to a file.
 339  *
 340  *      IN:     zp      - znode of file to be written to
 341  *              data    - bytes to write
 342  *              len     - number of bytes to write
 343  *              pos     - offset to start writing at
 344  *
 345  *      OUT:    resid   - remaining bytes to write
 346  *
 347  *      RETURN: 0 if success
 348  *              positive error code if failure.  EIO is returned
 349  *              for a short write when residp isn't provided.
 350  *
 351  * Timestamps:
 352  *      zp - ctime|mtime updated if byte count > 0
 353  */
 354 int
 355 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 356     loff_t pos, size_t *residp)
 357 {
 358         fstrans_cookie_t cookie;
 359         int error;
 360
 361         struct iovec iov;
 362         iov.iov_base = (void *)data;
 363         iov.iov_len = len;
 364
 365         zfs_uio_t uio;
 366         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 367
 368         cookie = spl_fstrans_mark();
 369         error = zfs_write(zp, &uio, 0, kcred);
 370         spl_fstrans_unmark(cookie);
 371
 372         if (error == 0) {
 373                 if (residp != NULL)
 374                         *residp = zfs_uio_resid(&uio);
 375                 else if (zfs_uio_resid(&uio) != 0)
 376                         error = SET_ERROR(EIO);
 377         }
 378
 379         return (error);
 380 }
 381
 382 static void
 383 zfs_rele_async_task(void *arg)
 384 {
 385         iput(arg);
 386 }
 387
 388 void
 389 zfs_zrele_async(znode_t *zp)
 390 {
 391         struct inode *ip = ZTOI(zp);
 392         objset_t *os = ITOZSB(ip)->z_os;
 393
 394         ASSERT(atomic_read(&ip->i_count) > 0);
 395         ASSERT(os != NULL);
 396
 397         /*
 398          * If decrementing the count would put us at 0, we can't do it inline
 399          * here, because that would be synchronous. Instead, dispatch an iput
 400          * to run later.
 401          *
 402          * For more information on the dangers of a synchronous iput, see the
 403          * header comment of this file.
 404          */
 405         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 406                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 407                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 408         }
 409 }
 410
 411
 412 /*
 413  * Lookup an entry in a directory, or an extended attribute directory.
 414  * If it exists, return a held inode reference for it.
 415  *
 416  *      IN:     zdp     - znode of directory to search.
 417  *              nm      - name of entry to lookup.
 418  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 419  *              cr      - credentials of caller.
 420  *              direntflags - directory lookup flags
 421  *              realpnp - returned pathname.
 422  *
 423  *      OUT:    zpp     - znode of located entry, NULL if not found.
 424  *
 425  *      RETURN: 0 on success, error code on failure.
 426  *
 427  * Timestamps:
 428  *      NA
 429  */
 430 int
 431 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 432     int *direntflags, pathname_t *realpnp)
 433 {
 434         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 435         int error = 0;
 436
 437         /*
 438          * Fast path lookup, however we must skip DNLC lookup
 439          * for case folding or normalizing lookups because the
 440          * DNLC code only stores the passed in name.  This means
 441          * creating 'a' and removing 'A' on a case insensitive
 442          * file system would work, but DNLC still thinks 'a'
 443          * exists and won't let you create it again on the next
 444          * pass through fast path.
 445          */
 446         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 447
 448                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 449                         return (SET_ERROR(ENOTDIR));
 450                 } else if (zdp->z_sa_hdl == NULL) {
 451                         return (SET_ERROR(EIO));
 452                 }
 453
 454                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 455                         error = zfs_fastaccesschk_execute(zdp, cr);
 456                         if (!error) {
 457                                 *zpp = zdp;
 458                                 zhold(*zpp);
 459                                 return (0);
 460                         }
 461                         return (error);
 462                 }
 463         }
 464
 465         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 466                 return (error);
 467
 468         *zpp = NULL;
 469
 470         if (flags & LOOKUP_XATTR) {
 471                 /*
 472                  * We don't allow recursive attributes..
 473                  * Maybe someday we will.
 474                  */
 475                 if (zdp->z_pflags & ZFS_XATTR) {
 476                         zfs_exit(zfsvfs, FTAG);
 477                         return (SET_ERROR(EINVAL));
 478                 }
 479
 480                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 481                         zfs_exit(zfsvfs, FTAG);
 482                         return (error);
 483                 }
 484
 485                 /*
 486                  * Do we have permission to get into attribute directory?
 487                  */
 488
 489                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 490                     B_TRUE, cr, zfs_init_idmap))) {
 491                         zrele(*zpp);
 492                         *zpp = NULL;
 493                 }
 494
 495                 zfs_exit(zfsvfs, FTAG);
 496                 return (error);
 497         }
 498
 499         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 500                 zfs_exit(zfsvfs, FTAG);
 501                 return (SET_ERROR(ENOTDIR));
 502         }
 503
 504         /*
 505          * Check accessibility of directory.
 506          */
 507
 508         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 509             zfs_init_idmap))) {
 510                 zfs_exit(zfsvfs, FTAG);
 511                 return (error);
 512         }
 513
 514         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 515             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 516                 zfs_exit(zfsvfs, FTAG);
 517                 return (SET_ERROR(EILSEQ));
 518         }
 519
 520         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 521         if ((error == 0) && (*zpp))
 522                 zfs_znode_update_vfs(*zpp);
 523
 524         zfs_exit(zfsvfs, FTAG);
 525         return (error);
 526 }
 527
  528 /*
  529  * Attempt to create a new entry in a directory.  If the entry
  530  * already exists, truncate the file if permissible, else return
  531  * an error.  Return the znode of the created or trunc'd file.
  532  *
  533  *      IN:     dzp     - znode of directory to put new file entry in.
  534  *              name    - name of new file entry; "" refers to dzp itself.
  535  *              vap     - attributes of new file.
  536  *              excl    - if non-zero, an existing entry fails with EEXIST.
  537  *              mode    - mode to open file with; if non-zero it is
       *                        checked with zfs_zaccess_rwx() on an existing
       *                        file.
  538  *              cr      - credentials of caller.
  539  *              flag    - file flag (FIGNORECASE, ATTR_NOACLCHECK, O_APPEND).
  540  *              vsecp   - ACL to be set
  541  *              mnt_ns  - user namespace of the mount
  542  *
  543  *      OUT:    zpp     - znode of created or trunc'd entry.  Once the
       *                        initial checks have passed it is NULL on
       *                        failure.
  544  *
  545  *      RETURN: 0 on success, error code on failure.
  546  *
  547  * Timestamps:
  548  *      dzp - ctime|mtime updated if new entry created
  549  *       zp - ctime|mtime always, atime if new
  550  */
  551 int
  552 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
  553     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
  554     zidmap_t *mnt_ns)
  555 {
  556         znode_t         *zp;
  557         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  558         zilog_t         *zilog;
  559         objset_t        *os;
  560         zfs_dirlock_t   *dl;
  561         dmu_tx_t        *tx;
  562         int             error;
  563         uid_t           uid;
  564         gid_t           gid;
  565         zfs_acl_ids_t   acl_ids;
  566         boolean_t       fuid_dirtied;
  567         boolean_t       have_acl = B_FALSE;     /* acl_ids was created */
  568         boolean_t       waited = B_FALSE;       /* dmu_tx_wait() was called */
  569         boolean_t       skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
  570
  571         /*
  572          * If we have an ephemeral id, ACL, or XVATTR then
  573          * make sure file system is at proper version
  574          */
  575
  576         gid = crgetgid(cr);
  577         uid = crgetuid(cr);
  578
  579         if (zfsvfs->z_use_fuids == B_FALSE &&
  580             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
  581                 return (SET_ERROR(EINVAL));
  582
  583         if (name == NULL)
  584                 return (SET_ERROR(EINVAL));
  585
  586         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
  587                 return (error);
  588         os = zfsvfs->z_os;
  589         zilog = zfsvfs->z_log;
  590
              /* On a utf8-only file system the name must be valid UTF-8. */
  591         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
  592             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
  593                 zfs_exit(zfsvfs, FTAG);
  594                 return (SET_ERROR(EILSEQ));
  595         }
  596
  597         if (vap->va_mask & ATTR_XVATTR) {
  598                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
  599                     crgetuid(cr), cr, vap->va_mode)) != 0) {
  600                         zfs_exit(zfsvfs, FTAG);
  601                         return (error);
  602                 }
  603         }
  604
              /*
               * Retry point: reached again via "goto top" when dmu_tx_assign()
               * returns ERESTART.  acl_ids is kept across the retry, which is
               * why every exit below frees it when have_acl is set.
               */
  605 top:
  606         *zpp = NULL;
  607         if (*name == '\0') {
  608                 /*
  609                  * Null component name refers to the directory itself.
  610                  */
  611                 zhold(dzp);
  612                 zp = dzp;
  613                 dl = NULL;
  614                 error = 0;
  615         } else {
  616                 /* possible igrab(zp) */
  617                 int zflg = 0;
  618
  619                 if (flag & FIGNORECASE)
  620                         zflg |= ZCILOOK;
  621
  622                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
  623                     NULL, NULL);
  624                 if (error) {
  625                         if (have_acl)
  626                                 zfs_acl_ids_free(&acl_ids);
                              /* A failed lock on ".." is reported as EISDIR. */
  627                         if (strcmp(name, "..") == 0)
  628                                 error = SET_ERROR(EISDIR);
  629                         zfs_exit(zfsvfs, FTAG);
  630                         return (error);
  631                 }
  632         }
  633
              /* zp == NULL: no entry of that name exists yet, so create one. */
  634         if (zp == NULL) {
  635                 uint64_t txtype;
  636                 uint64_t projid = ZFS_DEFAULT_PROJID;
  637
  638                 /*
  639                  * Create a new file object and update the directory
  640                  * to reference it.
  641                  */
  642                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
  643                     mnt_ns))) {
  644                         if (have_acl)
  645                                 zfs_acl_ids_free(&acl_ids);
  646                         goto out;
  647                 }
  648
  649                 /*
  650                  * We only support the creation of regular files in
  651                  * extended attribute directories.
  652                  */
  653
  654                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
  655                         if (have_acl)
  656                                 zfs_acl_ids_free(&acl_ids);
  657                         error = SET_ERROR(EINVAL);
  658                         goto out;
  659                 }
  660
                      /* Build acl_ids only once; a retry reuses the earlier result. */
  661                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
  662                     cr, vsecp, &acl_ids, mnt_ns)) != 0)
  663                         goto out;
  664                 have_acl = B_TRUE;
  665
  666                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
  667                         projid = zfs_inherit_projid(dzp);
  668                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
  669                         zfs_acl_ids_free(&acl_ids);
  670                         error = SET_ERROR(EDQUOT);
  671                         goto out;
  672                 }
  673
  674                 tx = dmu_tx_create(os);
  675
  676                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
  677                     ZFS_SA_BASE_ATTR_SIZE);
  678
  679                 fuid_dirtied = zfsvfs->z_fuid_dirty;
  680                 if (fuid_dirtied)
  681                         zfs_fuid_txhold(zfsvfs, tx);
  682                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
  683                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
  684                 if (!zfsvfs->z_use_sa &&
  685                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
  686                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
  687                             0, acl_ids.z_aclp->z_acl_bytes);
  688                 }
  689
                      /*
                       * The dirent lock is held here, so assign with TXG_NOWAIT
                       * (rule 4 above); TXG_NOTHROTTLE is added once we have
                       * already waited.
                       */
  690                 error = dmu_tx_assign(tx,
  691                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  692                 if (error) {
  693                         zfs_dirent_unlock(dl);
  694                         if (error == ERESTART) {
                                      /* Lock dropped: wait, then start over. */
  695                                 waited = B_TRUE;
  696                                 dmu_tx_wait(tx);
  697                                 dmu_tx_abort(tx);
  698                                 goto top;
  699                         }
  700                         zfs_acl_ids_free(&acl_ids);
  701                         dmu_tx_abort(tx);
  702                         zfs_exit(zfsvfs, FTAG);
  703                         return (error);
  704                 }
  705                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
  706
  707                 error = zfs_link_create(dl, zp, tx, ZNEW);
  708                 if (error != 0) {
  709                         /*
  710                          * Since, we failed to add the directory entry for it,
  711                          * delete the newly created dnode.
  712                          */
  713                         zfs_znode_delete(zp, tx);
  714                         remove_inode_hash(ZTOI(zp));
  715                         zfs_acl_ids_free(&acl_ids);
  716                         dmu_tx_commit(tx);
  717                         goto out;
  718                 }
  719
  720                 if (fuid_dirtied)
  721                         zfs_fuid_sync(zfsvfs, tx);
  722
                      /* Log the create before the tx commits (rule 5 above). */
  723                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
  724                 if (flag & FIGNORECASE)
  725                         txtype |= TX_CI;
  726                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
  727                     vsecp, acl_ids.z_fuidp, vap);
  728                 zfs_acl_ids_free(&acl_ids);
  729                 dmu_tx_commit(tx);
  730         } else {
  731                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
  732
                      /* acl_ids may be left over from an earlier pass; unused here. */
  733                 if (have_acl)
  734                         zfs_acl_ids_free(&acl_ids);
  735
  736                 /*
  737                  * A directory entry already exists for this name.
  738                  */
  739                 /*
  740                  * Can't truncate an existing file if in exclusive mode.
  741                  */
  742                 if (excl) {
  743                         error = SET_ERROR(EEXIST);
  744                         goto out;
  745                 }
  746                 /*
  747                  * Can't open a directory for writing.
  748                  */
  749                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
  750                         error = SET_ERROR(EISDIR);
  751                         goto out;
  752                 }
  753                 /*
  754                  * Verify requested access to file.
  755                  */
  756                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
  757                     mnt_ns))) {
  758                         goto out;
  759                 }
  760
                      /* Bump the directory's z_seq while holding its z_lock. */
  761                 mutex_enter(&dzp->z_lock);
  762                 dzp->z_seq++;
  763                 mutex_exit(&dzp->z_lock);
  764
  765                 /*
  766                  * Truncate regular files if requested.
  767                  */
  768                 if (S_ISREG(ZTOI(zp)->i_mode) &&
  769                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
  770                         /* we can't hold any locks when calling zfs_freesp() */
  771                         if (dl) {
  772                                 zfs_dirent_unlock(dl);
  773                                 dl = NULL;
  774                         }
  775                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
  776                 }
  777         }
              /*
               * Common exit: drop the dirent lock if it is still held, then
               * either release zp (on error) or hand it to the caller.
               */
  778 out:
  779
  780         if (dl)
  781                 zfs_dirent_unlock(dl);
  782
  783         if (error) {
  784                 if (zp)
  785                         zrele(zp);
  786         } else {
  787                 zfs_znode_update_vfs(dzp);
  788                 zfs_znode_update_vfs(zp);
  789                 *zpp = zp;
  790         }
  791
              /* When the objset's os_sync is ZFS_SYNC_ALWAYS, commit the ZIL now. */
  792         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
  793                 zil_commit(zilog, 0);
  794
  795         zfs_exit(zfsvfs, FTAG);
  796         return (error);
  797 }
 798
     /*
      * Create an unnamed temporary file under the directory dip.  The new
      * znode is created with IS_TMPFILE, marked unlinked and added to the
      * unlinked set; this function creates no directory entry for it.
      *
      *      IN:     dip     - inode of the parent directory.
      *              vap     - attributes of the new file.
      *              excl    - unused.
      *              mode    - unused.
      *              cr      - credentials of caller.
      *              flag    - unused.
      *              vsecp   - ACL to be set, or NULL.
      *              mnt_ns  - handed to the access check and ACL id creation.
      *
      *      OUT:    ipp     - inode of the created file.  Cleared to NULL at
      *                        the start of each creation attempt; not written
      *                        by the validation failures that return earlier.
      *
      *      RETURN: 0 if success
      *              error code if failure
      */
 799 int
 800 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 801     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
 802     zidmap_t *mnt_ns)
 803 {
             /* excl, mode and flag are accepted but never used below. */
 804         (void) excl, (void) mode, (void) flag;
 805         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 806         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 807         objset_t        *os;
 808         dmu_tx_t        *tx;
 809         int             error;
 810         uid_t           uid;
 811         gid_t           gid;
 812         zfs_acl_ids_t   acl_ids;
 813         uint64_t        projid = ZFS_DEFAULT_PROJID;
 814         boolean_t       fuid_dirtied;
 815         boolean_t       have_acl = B_FALSE;
 816         boolean_t       waited = B_FALSE;
 817
 818         /*
 819          * If we have an ephemeral id, ACL, or XVATTR then
 820          * make sure file system is at proper version
 821          */
 822
 823         gid = crgetgid(cr);
 824         uid = crgetuid(cr);
 825
 826         if (zfsvfs->z_use_fuids == B_FALSE &&
 827             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 828                 return (SET_ERROR(EINVAL));
 829
 830         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 831                 return (error);
 832         os = zfsvfs->z_os;
 833
             /*
              * When extended attributes were requested, fail early if
              * secpolicy_xvattr() rejects them.
              */
 834         if (vap->va_mask & ATTR_XVATTR) {
 835                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 836                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 837                         zfs_exit(zfsvfs, FTAG);
 838                         return (error);
 839                 }
 840         }
 841
             /*
              * Retry point: we come back here after dmu_tx_assign() reports
              * ERESTART.  acl_ids survives the retry (have_acl stays B_TRUE),
              * so it is created only once and freed on every exit path.
              */
 842 top:
 843         *ipp = NULL;
 844
 845         /*
 846          * Create a new file object.  This first requires ACE_ADD_FILE
 847          * access to the parent directory.
 848          */
 849         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 850                 if (have_acl)
 851                         zfs_acl_ids_free(&acl_ids);
 852                 goto out;
 853         }
 854
 855         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 856             cr, vsecp, &acl_ids, mnt_ns)) != 0)
 857                 goto out;
 858         have_acl = B_TRUE;
 859
             /*
              * Regular files and directories take the project id returned by
              * zfs_inherit_projid(); anything else keeps ZFS_DEFAULT_PROJID.
              */
 860         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 861                 projid = zfs_inherit_projid(dzp);
 862         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 863                 zfs_acl_ids_free(&acl_ids);
 864                 error = SET_ERROR(EDQUOT);
 865                 goto out;
 866         }
 867
             /*
              * Declare the transaction's holds: SA creation sized for the
              * ACL plus base attributes, the unlinked-set ZAP, the FUID
              * table when it is dirty, and -- on non-SA filesystems with an
              * ACL larger than ZFS_ACE_SPACE -- a write of the ACL bytes to
              * a new object.
              */
 868         tx = dmu_tx_create(os);
 869
 870         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 871             ZFS_SA_BASE_ATTR_SIZE);
 872         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 873
 874         fuid_dirtied = zfsvfs->z_fuid_dirty;
 875         if (fuid_dirtied)
 876                 zfs_fuid_txhold(zfsvfs, tx);
 877         if (!zfsvfs->z_use_sa &&
 878             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 879                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 880                     0, acl_ids.z_aclp->z_acl_bytes);
 881         }
             /*
              * Always assign with TXG_NOWAIT; TXG_NOTHROTTLE is added once we
              * have already waited.  ERESTART waits and retries from top;
              * any other error returns directly (zp is still NULL here).
              */
 882         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 883         if (error) {
 884                 if (error == ERESTART) {
 885                         waited = B_TRUE;
 886                         dmu_tx_wait(tx);
 887                         dmu_tx_abort(tx);
 888                         goto top;
 889                 }
 890                 zfs_acl_ids_free(&acl_ids);
 891                 dmu_tx_abort(tx);
 892                 zfs_exit(zfsvfs, FTAG);
 893                 return (error);
 894         }
 895         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 896
 897         if (fuid_dirtied)
 898                 zfs_fuid_sync(zfsvfs, tx);
 899
 900         /* Add to unlinked set */
 901         zp->z_unlinked = B_TRUE;
 902         zfs_unlinked_add(zp, tx);
 903         zfs_acl_ids_free(&acl_ids);
 904         dmu_tx_commit(tx);
             /*
              * Common exit.  Every "goto out" above happens before
              * zfs_mknode(), so zp is still NULL on those error paths; on
              * success the new inode is handed back through ipp.
              */
 905 out:
 906
 907         if (error) {
 908                 if (zp)
 909                         zrele(zp);
 910         } else {
 911                 zfs_znode_update_vfs(dzp);
 912                 zfs_znode_update_vfs(zp);
 913                 *ipp = ZTOI(zp);
 914         }
 915
 916         zfs_exit(zfsvfs, FTAG);
 917         return (error);
 918 }
 919
 920 /*
 921  * Remove an entry from a directory.
 922  *
 923  *      IN:     dzp     - znode of directory to remove entry from.
 924  *              name    - name of entry to remove.
 925  *              cr      - credentials of caller.
 926  *              flags   - case flags.
 927  *
 928  *      RETURN: 0 if success
 929  *              error code if failure
 930  *
 931  * Timestamps:
 932  *      dzp - ctime|mtime
 933  *       ip - ctime (if nlink > 0)
 934  */
 935
     /*
      * Zero value written to SA_ZPL_XATTR by zfs_remove() when the znode being
      * deleted is not SA-based (zp->z_is_sa is false).
      */
 936 static uint64_t null_xattr = 0;
 937
 938 int
 939 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 940 {
 941         znode_t         *zp;
 942         znode_t         *xzp;
 943         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 944         zilog_t         *zilog;
 945         uint64_t        acl_obj, xattr_obj;
 946         uint64_t        xattr_obj_unlinked = 0;
 947         uint64_t        obj = 0;
 948         uint64_t        links;
 949         zfs_dirlock_t   *dl;
 950         dmu_tx_t        *tx;
 951         boolean_t       may_delete_now, delete_now = FALSE;
 952         boolean_t       unlinked, toobig = FALSE;
 953         uint64_t        txtype;
 954         pathname_t      *realnmp = NULL;
 955         pathname_t      realnm;
 956         int             error;
 957         int             zflg = ZEXISTS;
 958         boolean_t       waited = B_FALSE;
 959
 960         if (name == NULL)
 961                 return (SET_ERROR(EINVAL));
 962
 963         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 964                 return (error);
 965         zilog = zfsvfs->z_log;
 966
             /*
              * Case-insensitive request: add ZCILOOK to the lookup flags and
              * allocate realnm, which is passed to zfs_dirent_lock() and
              * freed on every path that leaves the function.
              */
 967         if (flags & FIGNORECASE) {
 968                 zflg |= ZCILOOK;
 969                 pn_alloc(&realnm);
 970                 realnmp = &realnm;
 971         }
 972
             /*
              * Retry point after dmu_tx_assign() reports ERESTART; the
              * per-attempt xattr state is reset here.
              */
 973 top:
 974         xattr_obj = 0;
 975         xzp = NULL;
 976         /*
 977          * Attempt to lock directory; fail if entry doesn't exist.
 978          */
 979         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 980             NULL, realnmp))) {
 981                 if (realnmp)
 982                         pn_free(realnmp);
 983                 zfs_exit(zfsvfs, FTAG);
 984                 return (error);
 985         }
 986
 987         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 988                 goto out;
 989         }
 990
 991         /*
 992          * Need to use rmdir for removing directories.
 993          */
 994         if (S_ISDIR(ZTOI(zp)->i_mode)) {
 995                 error = SET_ERROR(EPERM);
 996                 goto out;
 997         }
 998
             /*
              * Immediate deletion is only a candidate when ours is the sole
              * inode reference (i_count == 1) and zn_has_cached_data()
              * reports nothing for the whole file range.  Both conditions are
              * re-checked after the link has actually been destroyed.
              */
 999         mutex_enter(&zp->z_lock);
1000         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
1001             !zn_has_cached_data(zp, 0, LLONG_MAX);
1002         mutex_exit(&zp->z_lock);
1003
1004         /*
1005          * We may delete the znode now, or we may put it in the unlinked set;
1006          * it depends on whether we're the last link, and on whether there are
1007          * other holds on the inode.  So we dmu_tx_hold() the right things to
1008          * allow for either case.
1009          */
1010         obj = zp->z_id;
1011         tx = dmu_tx_create(zfsvfs->z_os);
1012         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1013         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1014         zfs_sa_upgrade_txholds(tx, zp);
1015         zfs_sa_upgrade_txholds(tx, dzp);
1016         if (may_delete_now) {
1017                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1018                 /* if the file is too big, only hold_free a token amount */
1019                 dmu_tx_hold_free(tx, zp->z_id, 0,
1020                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1021         }
1022
1023         /* are there any extended attributes? */
             /*
              * NOTE(review): the zfs_zget() result is only checked by
              * ASSERT0() and xzp is dereferenced right after.  If that
              * assertion compiles out in production builds (TODO confirm),
              * a failed zfs_zget() is not handled here.
              */
1024         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1025             &xattr_obj, sizeof (xattr_obj));
1026         if (error == 0 && xattr_obj) {
1027                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1028                 ASSERT0(error);
1029                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1030                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1031         }
1032
             /*
              * Remember the external ACL object seen now; it is compared
              * again under z_lock before deciding on delete_now.
              */
1033         mutex_enter(&zp->z_lock);
1034         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1035                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1036         mutex_exit(&zp->z_lock);
1037
1038         /* charge as an update -- would be nice not to charge at all */
1039         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1040
1041         /*
1042          * Mark this transaction as typically resulting in a net free of space
1043          */
1044         dmu_tx_mark_netfree(tx);
1045
             /*
              * Always assign with TXG_NOWAIT; TXG_NOTHROTTLE is added once we
              * have already waited.  On any failure the dirent lock is
              * dropped first.  ERESTART then waits, releases zp/xzp and
              * retries from top (realnmp is kept for the retry); any other
              * error cleans up and returns directly.
              */
1046         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1047         if (error) {
1048                 zfs_dirent_unlock(dl);
1049                 if (error == ERESTART) {
1050                         waited = B_TRUE;
1051                         dmu_tx_wait(tx);
1052                         dmu_tx_abort(tx);
1053                         zrele(zp);
1054                         if (xzp)
1055                                 zrele(xzp);
1056                         goto top;
1057                 }
1058                 if (realnmp)
1059                         pn_free(realnmp);
1060                 dmu_tx_abort(tx);
1061                 zrele(zp);
1062                 if (xzp)
1063                         zrele(xzp);
1064                 zfs_exit(zfsvfs, FTAG);
1065                 return (error);
1066         }
1067
1068         /*
1069          * Remove the directory entry.
1070          */
1071         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1072
             /* The assigned tx is committed before bailing out. */
1073         if (error) {
1074                 dmu_tx_commit(tx);
1075                 goto out;
1076         }
1077
             /*
              * z_lock is taken inside this block and deliberately left held
              * on exit: it is dropped in the delete_now branch or in the
              * "else if (unlinked)" branch below.  delete_now can only become
              * true here, so it implies unlinked.
              */
1078         if (unlinked) {
1079                 /*
1080                  * Hold z_lock so that we can make sure that the ACL obj
1081                  * hasn't changed.  Could have been deleted due to
1082                  * zfs_sa_upgrade().
1083                  */
1084                 mutex_enter(&zp->z_lock);
1085                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1086                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1087                 delete_now = may_delete_now && !toobig &&
1088                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1089                     !zn_has_cached_data(zp, 0, LLONG_MAX) &&
1090                     xattr_obj == xattr_obj_unlinked &&
1091                     zfs_external_acl(zp) == acl_obj;
1092                 VERIFY_IMPLY(xattr_obj_unlinked, xzp);
1093         }
1094
1095         if (delete_now) {
                     /*
                      * If the file has an xattr directory, mark it unlinked
                      * with a zero link count, queue it on the unlinked set,
                      * and drop the XATTR attribute from zp (removed for SA
                      * znodes, overwritten with null_xattr otherwise).
                      */
1096                 if (xattr_obj_unlinked) {
1097                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1098                         mutex_enter(&xzp->z_lock);
1099                         xzp->z_unlinked = B_TRUE;
1100                         clear_nlink(ZTOI(xzp));
1101                         links = 0;
1102                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1103                             &links, sizeof (links), tx);
1104                         ASSERT3U(error,  ==,  0);
1105                         mutex_exit(&xzp->z_lock);
1106                         zfs_unlinked_add(xzp, tx);
1107
1108                         if (zp->z_is_sa)
1109                                 error = sa_remove(zp->z_sa_hdl,
1110                                     SA_ZPL_XATTR(zfsvfs), tx);
1111                         else
1112                                 error = sa_update(zp->z_sa_hdl,
1113                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1114                                     sizeof (uint64_t), tx);
1115                         ASSERT0(error);
1116                 }
1117                 /*
1118                  * Add to the unlinked set because a new reference could be
1119                  * taken concurrently resulting in a deferred destruction.
1120                  */
1121                 zfs_unlinked_add(zp, tx);
1122                 mutex_exit(&zp->z_lock);
1123         } else if (unlinked) {
1124                 mutex_exit(&zp->z_lock);
1125                 zfs_unlinked_add(zp, tx);
1126         }
1127
             /* Log the removal, tagging it TX_CI for case-insensitive calls. */
1128         txtype = TX_REMOVE;
1129         if (flags & FIGNORECASE)
1130                 txtype |= TX_CI;
1131         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1132
1133         dmu_tx_commit(tx);
             /*
              * Common exit.  Also reached from the access and EPERM failures
              * above, which jump here before any tx has been created.  zp is
              * released with zrele() when delete_now is set and with
              * zfs_zrele_async() otherwise; xzp always uses the async form.
              */
1134 out:
1135         if (realnmp)
1136                 pn_free(realnmp);
1137
1138         zfs_dirent_unlock(dl);
1139         zfs_znode_update_vfs(dzp);
1140         zfs_znode_update_vfs(zp);
1141
1142         if (delete_now)
1143                 zrele(zp);
1144         else
1145                 zfs_zrele_async(zp);
1146
1147         if (xzp) {
1148                 zfs_znode_update_vfs(xzp);
1149                 zfs_zrele_async(xzp);
1150         }
1151
1152         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1153                 zil_commit(zilog, 0);
1154
1155         zfs_exit(zfsvfs, FTAG);
1156         return (error);
1157 }
1158
1159 /*
1160  * Create a new directory and insert it into dzp using the name
1161  * provided.  Return a pointer to the inserted directory.
1162  *
1163  *      IN:     dzp     - znode of directory to add subdir to.
1164  *              dirname - name of new directory.
1165  *              vap     - attributes of new directory.
1166  *              cr      - credentials of caller.
1167  *              flags   - case flags.
1168  *              vsecp   - ACL to be set
1169  *              mnt_ns  - user namespace of the mount
1170  *
1171  *      OUT:    zpp     - znode of created directory.
      *                        Cleared to NULL at the start of each attempt and
      *                        set only after the new name has been linked in.
1172  *
1173  *      RETURN: 0 if success
1174  *              error code if failure
1175  *
1176  * Timestamps:
1177  *      dzp - ctime|mtime updated
1178  *      zpp - ctime|mtime|atime updated
1179  */
1180 int
1181 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1182     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1183 {
1184         znode_t         *zp;
1185         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1186         zilog_t         *zilog;
1187         zfs_dirlock_t   *dl;
1188         uint64_t        txtype;
1189         dmu_tx_t        *tx;
1190         int             error;
1191         int             zf = ZNEW;
1192         uid_t           uid;
1193         gid_t           gid = crgetgid(cr);
1194         zfs_acl_ids_t   acl_ids;
1195         boolean_t       fuid_dirtied;
1196         boolean_t       waited = B_FALSE;
1197
1198         ASSERT(S_ISDIR(vap->va_mode));
1199
1200         /*
1201          * If we have an ephemeral id, ACL, or XVATTR then
1202          * make sure file system is at proper version
1203          */
1204
1205         uid = crgetuid(cr);
1206         if (zfsvfs->z_use_fuids == B_FALSE &&
1207             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1208                 return (SET_ERROR(EINVAL));
1209
1210         if (dirname == NULL)
1211                 return (SET_ERROR(EINVAL));
1212
1213         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1214                 return (error);
1215         zilog = zfsvfs->z_log;
1216
             /* Refuse when the parent carries the ZFS_XATTR flag. */
1217         if (dzp->z_pflags & ZFS_XATTR) {
1218                 zfs_exit(zfsvfs, FTAG);
1219                 return (SET_ERROR(EINVAL));
1220         }
1221
             /* On UTF-8-only filesystems the name must validate as UTF-8. */
1222         if (zfsvfs->z_utf8 && u8_validate(dirname,
1223             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1224                 zfs_exit(zfsvfs, FTAG);
1225                 return (SET_ERROR(EILSEQ));
1226         }
1227         if (flags & FIGNORECASE)
1228                 zf |= ZCILOOK;
1229
1230         if (vap->va_mask & ATTR_XVATTR) {
1231                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1232                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1233                         zfs_exit(zfsvfs, FTAG);
1234                         return (error);
1235                 }
1236         }
1237
             /*
              * acl_ids is built once, before the retry label, reused across
              * ERESTART retries, and freed on every exit path below.
              */
1238         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1239             vsecp, &acl_ids, mnt_ns)) != 0) {
1240                 zfs_exit(zfsvfs, FTAG);
1241                 return (error);
1242         }
1243         /*
1244          * First make sure the new directory doesn't exist.
1245          *
1246          * Existence is checked first to make sure we don't return
1247          * EACCES instead of EEXIST which can cause some applications
1248          * to fail.
1249          */
1250 top:
1251         *zpp = NULL;
1252
1253         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1254             NULL, NULL))) {
1255                 zfs_acl_ids_free(&acl_ids);
1256                 zfs_exit(zfsvfs, FTAG);
1257                 return (error);
1258         }
1259
1260         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1261             mnt_ns))) {
1262                 zfs_acl_ids_free(&acl_ids);
1263                 zfs_dirent_unlock(dl);
1264                 zfs_exit(zfsvfs, FTAG);
1265                 return (error);
1266         }
1267
1268         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1269                 zfs_acl_ids_free(&acl_ids);
1270                 zfs_dirent_unlock(dl);
1271                 zfs_exit(zfsvfs, FTAG);
1272                 return (SET_ERROR(EDQUOT));
1273         }
1274
1275         /*
1276          * Add a new entry to the directory.
1277          */
1278         tx = dmu_tx_create(zfsvfs->z_os);
1279         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1280         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1281         fuid_dirtied = zfsvfs->z_fuid_dirty;
1282         if (fuid_dirtied)
1283                 zfs_fuid_txhold(zfsvfs, tx);
1284         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1285                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1286                     acl_ids.z_aclp->z_acl_bytes);
1287         }
1288
1289         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1290             ZFS_SA_BASE_ATTR_SIZE);
1291
             /*
              * Always assign with TXG_NOWAIT; TXG_NOTHROTTLE is added once we
              * have already waited.  The dirent lock is dropped on any
              * failure; ERESTART waits and retries from top, anything else
              * frees acl_ids and returns.
              */
1292         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1293         if (error) {
1294                 zfs_dirent_unlock(dl);
1295                 if (error == ERESTART) {
1296                         waited = B_TRUE;
1297                         dmu_tx_wait(tx);
1298                         dmu_tx_abort(tx);
1299                         goto top;
1300                 }
1301                 zfs_acl_ids_free(&acl_ids);
1302                 dmu_tx_abort(tx);
1303                 zfs_exit(zfsvfs, FTAG);
1304                 return (error);
1305         }
1306
1307         /*
1308          * Create new node.
1309          */
1310         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1311
1312         /*
1313          * Now put new name in parent dir.
1314          */
             /*
              * If linking fails, the freshly made znode is deleted within
              * the same tx and its inode unhashed; "out" then still commits
              * the tx and drops our reference on zp.
              */
1315         error = zfs_link_create(dl, zp, tx, ZNEW);
1316         if (error != 0) {
1317                 zfs_znode_delete(zp, tx);
1318                 remove_inode_hash(ZTOI(zp));
1319                 goto out;
1320         }
1321
1322         if (fuid_dirtied)
1323                 zfs_fuid_sync(zfsvfs, tx);
1324
1325         *zpp = zp;
1326
1327         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1328         if (flags & FIGNORECASE)
1329                 txtype |= TX_CI;
1330         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1331             acl_ids.z_fuidp, vap);
1332
             /*
              * Common exit for everything after a successful tx assignment:
              * free acl_ids, commit, unlock the dirent, then either release
              * zp (failure) or refresh both inodes (success).
              */
1333 out:
1334         zfs_acl_ids_free(&acl_ids);
1335
1336         dmu_tx_commit(tx);
1337
1338         zfs_dirent_unlock(dl);
1339
1340         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1341                 zil_commit(zilog, 0);
1342
1343         if (error != 0) {
1344                 zrele(zp);
1345         } else {
1346                 zfs_znode_update_vfs(dzp);
1347                 zfs_znode_update_vfs(zp);
1348         }
1349         zfs_exit(zfsvfs, FTAG);
1350         return (error);
1351 }
1352
1353 /*
1354  * Remove a directory subdir entry.  If the current working
1355  * directory is the same as the subdir to be removed, the
1356  * remove will fail.
1357  *
1358  *      IN:     dzp     - znode of directory to remove from.
1359  *              name    - name of directory to be removed.
1360  *              cwd     - znode of current working directory.
1361  *              cr      - credentials of caller.
1362  *              flags   - case flags
1363  *
1364  *      RETURN: 0 on success, error code on failure.
1365  *
1366  * Timestamps:
1367  *      dzp - ctime|mtime updated
1368  */
1369 int
1370 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1371     int flags)
1372 {
1373         znode_t         *zp;
1374         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1375         zilog_t         *zilog;
1376         zfs_dirlock_t   *dl;
1377         dmu_tx_t        *tx;
1378         int             error;
1379         int             zflg = ZEXISTS;
1380         boolean_t       waited = B_FALSE;
1381
1382         if (name == NULL)
1383                 return (SET_ERROR(EINVAL));
1384
1385         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1386                 return (error);
1387         zilog = zfsvfs->z_log;
1388
1389         if (flags & FIGNORECASE)
1390                 zflg |= ZCILOOK;
             /* Retry point after dmu_tx_assign() reports ERESTART. */
1391 top:
1392         zp = NULL;
1393
1394         /*
1395          * Attempt to lock directory; fail if entry doesn't exist.
1396          */
1397         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1398             NULL, NULL))) {
1399                 zfs_exit(zfsvfs, FTAG);
1400                 return (error);
1401         }
1402
             /*
              * The three checks below fail before z_name_lock and
              * z_parent_lock are taken, which is why they can jump to "out",
              * placed after those locks are released.
              */
1403         if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
1404                 goto out;
1405         }
1406
1407         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1408                 error = SET_ERROR(ENOTDIR);
1409                 goto out;
1410         }
1411
             /* Removing the caller's current working directory is refused. */
1412         if (zp == cwd) {
1413                 error = SET_ERROR(EINVAL);
1414                 goto out;
1415         }
1416
1417         /*
1418          * Grab a lock on the directory to make sure that no one is
1419          * trying to add (or lookup) entries while we are removing it.
1420          */
1421         rw_enter(&zp->z_name_lock, RW_WRITER);
1422
1423         /*
1424          * Grab a lock on the parent pointer to make sure we play well
1425          * with the treewalk and directory rename code.
1426          */
1427         rw_enter(&zp->z_parent_lock, RW_WRITER);
1428
1429         tx = dmu_tx_create(zfsvfs->z_os);
1430         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1431         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1432         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1433         zfs_sa_upgrade_txholds(tx, zp);
1434         zfs_sa_upgrade_txholds(tx, dzp);
1435         dmu_tx_mark_netfree(tx);
             /*
              * Always assign with TXG_NOWAIT; TXG_NOTHROTTLE is added once we
              * have already waited.  On failure the locks are dropped in the
              * reverse order of acquisition before waiting or returning.
              */
1436         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1437         if (error) {
1438                 rw_exit(&zp->z_parent_lock);
1439                 rw_exit(&zp->z_name_lock);
1440                 zfs_dirent_unlock(dl);
1441                 if (error == ERESTART) {
1442                         waited = B_TRUE;
1443                         dmu_tx_wait(tx);
1444                         dmu_tx_abort(tx);
1445                         zrele(zp);
1446                         goto top;
1447                 }
1448                 dmu_tx_abort(tx);
1449                 zrele(zp);
1450                 zfs_exit(zfsvfs, FTAG);
1451                 return (error);
1452         }
1453
             /*
              * Destroy the link and, only if that worked, log the removal.
              * The tx is committed in either case.
              */
1454         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1455
1456         if (error == 0) {
1457                 uint64_t txtype = TX_RMDIR;
1458                 if (flags & FIGNORECASE)
1459                         txtype |= TX_CI;
1460                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1461                     B_FALSE);
1462         }
1463
1464         dmu_tx_commit(tx);
1465
1466         rw_exit(&zp->z_parent_lock);
1467         rw_exit(&zp->z_name_lock);
             /* Common exit; only the dirent lock is still held at this point. */
1468 out:
1469         zfs_dirent_unlock(dl);
1470
1471         zfs_znode_update_vfs(dzp);
1472         zfs_znode_update_vfs(zp);
1473         zrele(zp);
1474
1475         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1476                 zil_commit(zilog, 0);
1477
1478         zfs_exit(zfsvfs, FTAG);
1479         return (error);
1480 }
1481
1482 /*
1483  * Read directory entries from the given directory cursor position and emit
1484  * name and position for each entry.
1485  *
1486  *      IN:     ip      - inode of directory to read.
1487  *              ctx     - directory entry context.
1488  *              cr      - credentials of caller (unused).
1489  *
1490  *      RETURN: 0 if success
1491  *              error code if failure
1492  *
1493  * Timestamps:
1494  *      ip - atime updated
1495  *
1496  * Note that the low 4 bits of the cookie returned by zap is always zero.
1497  * This allows us to use the low range for "special" directory entries:
1498  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1499  * we use the offset 2 for the '.zfs' directory.
1500  */
1501 int
1502 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1503 {
1504         (void) cr;
1505         znode_t         *zp = ITOZ(ip);
1506         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1507         objset_t        *os;
1508         zap_cursor_t    zc;
1509         zap_attribute_t zap;
1510         int             error;
1511         uint8_t         prefetch;
1512         uint8_t         type;
1513         int             done = 0;
1514         uint64_t        parent;
1515         uint64_t        offset; /* must be unsigned; checks for < 1 */
1516
1517         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1518                 return (error);
1519
             /* The parent object number is what gets reported for "..". */
1520         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1521             &parent, sizeof (parent))) != 0)
1522                 goto out;
1523
1524         /*
1525          * Quit if directory has been removed (posix)
1526          */
             /* error is 0 here, so nothing is emitted and 0 is returned. */
1527         if (zp->z_unlinked)
1528                 goto out;
1529
1530         error = 0;
1531         os = zfsvfs->z_os;
1532         offset = ctx->pos;
1533         prefetch = zp->z_zn_prefetch;
1534
1535         /*
1536          * Initialize the iterator cursor.
1537          */
1538         if (offset <= 3) {
1539                 /*
1540                  * Start iteration from the beginning of the directory.
1541                  */
1542                 zap_cursor_init(&zc, os, zp->z_id);
1543         } else {
1544                 /*
1545                  * The offset is a serialized cursor.
1546                  */
1547                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1548         }
1549
1550         /*
1551          * Transform to file-system independent format
1552          */
1553         while (!done) {
1554                 uint64_t objnum;
1555                 /*
1556                  * Special case `.', `..', and `.zfs'.
1557                  */
                     /*
                      * These entries are synthesized here rather than read
                      * from the ZAP; `.zfs' only appears when
                      * zfs_show_ctldir() says so.
                      */
1558                 if (offset == 0) {
1559                         (void) strcpy(zap.za_name, ".");
1560                         zap.za_normalization_conflict = 0;
1561                         objnum = zp->z_id;
1562                         type = DT_DIR;
1563                 } else if (offset == 1) {
1564                         (void) strcpy(zap.za_name, "..");
1565                         zap.za_normalization_conflict = 0;
1566                         objnum = parent;
1567                         type = DT_DIR;
1568                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1569                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1570                         zap.za_normalization_conflict = 0;
1571                         objnum = ZFSCTL_INO_ROOT;
1572                         type = DT_DIR;
1573                 } else {
1574                         /*
1575                          * Grab next entry.
1576                          */
                             /*
                              * ENOENT ends the listing normally; any other
                              * error leaves through "update".
                              */
1577                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1578                                 if (error == ENOENT)
1579                                         break;
1580                                 else
1581                                         goto update;
1582                         }
1583
1584                         /*
1585                          * Allow multiple entries provided the first entry is
1586                          * the object id.  Non-zpl consumers may safely make
1587                          * use of the additional space.
1588                          *
1589                          * XXX: This should be a feature flag for compatibility
1590                          */
1591                         if (zap.za_integer_length != 8 ||
1592                             zap.za_num_integers == 0) {
1593                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1594                                     "entry, obj = %lld, offset = %lld, "
1595                                     "length = %d, num = %lld\n",
1596                                     (u_longlong_t)zp->z_id,
1597                                     (u_longlong_t)offset,
1598                                     zap.za_integer_length,
1599                                     (u_longlong_t)zap.za_num_integers);
1600                                 error = SET_ERROR(ENXIO);
1601                                 goto update;
1602                         }
1603
                             /*
                              * The first integer of the entry carries both
                              * the object number and the entry type.
                              */
1604                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1605                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1606                 }
1607
                     /*
                      * Stop as soon as zpl_dir_emit() returns false; ctx->pos
                      * is then left at the entry that was not emitted.
                      */
1608                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1609                     objnum, type);
1610                 if (done)
1611                         break;
1612
1613                 /* Prefetch znode */
1614                 if (prefetch) {
1615                         dmu_prefetch(os, objnum, 0, 0, 0,
1616                             ZIO_PRIORITY_SYNC_READ);
1617                 }
1618
1619                 /*
1620                  * Move to the next entry, fill in the previous offset.
1621                  */
                     /*
                      * The synthetic entries just bump the offset by one.
                      * Past them (or at offset 2 when no `.zfs' entry is
                      * shown) the position is the serialized ZAP cursor.
                      */
1622                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1623                         zap_cursor_advance(&zc);
1624                         offset = zap_cursor_serialize(&zc);
1625                 } else {
1626                         offset += 1;
1627                 }
1628                 ctx->pos = offset;
1629         }
             /* Skipped by the "goto update" error paths above. */
1630         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1631
             /*
              * Reached once the cursor has been initialized.  ENOENT from
              * zap_cursor_retrieve() is not reported to the caller.
              */
1632 update:
1633         zap_cursor_fini(&zc);
1634         if (error == ENOENT)
1635                 error = 0;
             /* Exit for paths taken before the cursor was initialized. */
1636 out:
1637         zfs_exit(zfsvfs, FTAG);
1638
1639         return (error);
1640 }
1641
1642 /*
1643  * Get the basic file attributes and place them in the provided kstat
1644  * structure.  The inode is assumed to be the authoritative source
1645  * for most of the attributes.  However, the znode currently has the
1646  * authoritative atime, blksize, and block count.
1647  *
1648  *      IN:     ip      - inode of file.
1649  *
1650  *      OUT:    sp      - kstat values.
1651  *
1652  *      RETURN: 0 (always succeeds)
1653  */
1654 int
1655 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1656 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
1657     struct kstat *sp)
1658 #else
1659 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
1660 #endif
1661 {
1662         znode_t *zp = ITOZ(ip);
1663         zfsvfs_t *zfsvfs = ITOZSB(ip);
1664         uint32_t blksize;
1665         u_longlong_t nblocks;
1666         int error;
1667
1668         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1669                 return (error);
1670
1671         mutex_enter(&zp->z_lock);
1672
1673 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
1674         zpl_generic_fillattr(user_ns, request_mask, ip, sp);
1675 #else
1676         zpl_generic_fillattr(user_ns, ip, sp);
1677 #endif
1678         /*
1679          * +1 link count for root inode with visible '.zfs' directory.
1680          */
1681         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1682                 if (sp->nlink < ZFS_LINK_MAX)
1683                         sp->nlink++;
1684
1685         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1686         sp->blksize = blksize;
1687         sp->blocks = nblocks;
1688
1689         if (unlikely(zp->z_blksz == 0)) {
1690                 /*
1691                  * Block size hasn't been set; suggest maximal I/O transfers.
1692                  */
1693                 sp->blksize = zfsvfs->z_max_blksz;
1694         }
1695
1696         mutex_exit(&zp->z_lock);
1697
1698         /*
1699          * Required to prevent NFS client from detecting different inode
1700          * numbers of snapshot root dentry before and after snapshot mount.
1701          */
1702         if (zfsvfs->z_issnap) {
1703                 if (ip->i_sb->s_root->d_inode == ip)
1704                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1705                             dmu_objset_id(zfsvfs->z_os);
1706         }
1707
1708         zfs_exit(zfsvfs, FTAG);
1709
1710         return (0);
1711 }
1712
1713 /*
1714  * For the operation of changing file's user/group/project, we need to
1715  * handle not only the main object that is assigned to the file directly,
1716  * but also the ones that are used by the file via hidden xattr directory.
1717  *
1718  * Because the xattr directory may contains many EA entries, as to it may
1719  * be impossible to change all of them via the transaction of changing the
1720  * main object's user/group/project attributes. Then we have to change them
1721  * via other multiple independent transactions one by one. It may be not good
1722  * solution, but we have no better idea yet.
1723  */
1724 static int
1725 zfs_setattr_dir(znode_t *dzp)
1726 {
1727         struct inode    *dxip = ZTOI(dzp);
1728         struct inode    *xip = NULL;
1729         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1730         objset_t        *os = zfsvfs->z_os;
1731         zap_cursor_t    zc;
1732         zap_attribute_t zap;
1733         zfs_dirlock_t   *dl;
1734         znode_t         *zp = NULL;
1735         dmu_tx_t        *tx = NULL;
1736         uint64_t        uid, gid;
1737         sa_bulk_attr_t  bulk[4];
1738         int             count;
1739         int             err;
1740
1741         zap_cursor_init(&zc, os, dzp->z_id);
1742         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
1743                 count = 0;
1744                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1745                         err = ENXIO;
1746                         break;
1747                 }
1748
1749                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1750                     ZEXISTS, NULL, NULL);
1751                 if (err == ENOENT)
1752                         goto next;
1753                 if (err)
1754                         break;
1755
1756                 xip = ZTOI(zp);
1757                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1758                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1759                     zp->z_projid == dzp->z_projid)
1760                         goto next;
1761
1762                 tx = dmu_tx_create(os);
1763                 if (!(zp->z_pflags & ZFS_PROJID))
1764                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1765                 else
1766                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1767
1768                 err = dmu_tx_assign(tx, TXG_WAIT);
1769                 if (err)
1770                         break;
1771
1772                 mutex_enter(&dzp->z_lock);
1773
1774                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1775                         xip->i_uid = dxip->i_uid;
1776                         uid = zfs_uid_read(dxip);
1777                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1778                             &uid, sizeof (uid));
1779                 }
1780
1781                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1782                         xip->i_gid = dxip->i_gid;
1783                         gid = zfs_gid_read(dxip);
1784                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1785                             &gid, sizeof (gid));
1786                 }
1787
1788                 if (zp->z_projid != dzp->z_projid) {
1789                         if (!(zp->z_pflags & ZFS_PROJID)) {
1790                                 zp->z_pflags |= ZFS_PROJID;
1791                                 SA_ADD_BULK_ATTR(bulk, count,
1792                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1793                                     sizeof (zp->z_pflags));
1794                         }
1795
1796                         zp->z_projid = dzp->z_projid;
1797                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1798                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1799                 }
1800
1801                 mutex_exit(&dzp->z_lock);
1802
1803                 if (likely(count > 0)) {
1804                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1805                         dmu_tx_commit(tx);
1806                 } else {
1807                         dmu_tx_abort(tx);
1808                 }
1809                 tx = NULL;
1810                 if (err != 0 && err != ENOENT)
1811                         break;
1812
1813 next:
1814                 if (zp) {
1815                         zrele(zp);
1816                         zp = NULL;
1817                         zfs_dirent_unlock(dl);
1818                 }
1819                 zap_cursor_advance(&zc);
1820         }
1821
1822         if (tx)
1823                 dmu_tx_abort(tx);
1824         if (zp) {
1825                 zrele(zp);
1826                 zfs_dirent_unlock(dl);
1827         }
1828         zap_cursor_fini(&zc);
1829
1830         return (err == ENOENT ? 0 : err);
1831 }
1832
1833 /*
1834  * Set the file attributes to the values contained in the
1835  * vattr structure.
1836  *
1837  *      IN:     zp      - znode of file to be modified.
1838  *              vap     - new attribute values.
1839  *                        If ATTR_XVATTR set, then optional attrs are being set
1840  *              flags   - ATTR_UTIME set if non-default time values provided.
1841  *                      - ATTR_NOACLCHECK (CIFS context only).
1842  *              cr      - credentials of caller.
1843  *              mnt_ns  - user namespace of the mount
1844  *
1845  *      RETURN: 0 if success
1846  *              error code if failure
1847  *
1848  * Timestamps:
1849  *      ip - ctime updated, mtime updated if size changed.
1850  */
1851 int
1852 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
1853 {
1854         struct inode    *ip;
1855         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1856         objset_t        *os = zfsvfs->z_os;
1857         zilog_t         *zilog;
1858         dmu_tx_t        *tx;
1859         vattr_t         oldva;
1860         xvattr_t        *tmpxvattr;
1861         uint_t          mask = vap->va_mask;
1862         uint_t          saved_mask = 0;
1863         int             trim_mask = 0;
1864         uint64_t        new_mode;
1865         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1866         uint64_t        xattr_obj;
1867         uint64_t        mtime[2], ctime[2], atime[2];
1868         uint64_t        projid = ZFS_INVALID_PROJID;
1869         znode_t         *attrzp;
1870         int             need_policy = FALSE;
1871         int             err, err2 = 0;
1872         zfs_fuid_info_t *fuidp = NULL;
1873         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1874         xoptattr_t      *xoap;
1875         zfs_acl_t       *aclp;
1876         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1877         boolean_t       fuid_dirtied = B_FALSE;
1878         boolean_t       handle_eadir = B_FALSE;
1879         sa_bulk_attr_t  *bulk, *xattr_bulk;
1880         int             count = 0, xattr_count = 0, bulks = 8;
1881
1882         if (mask == 0)
1883                 return (0);
1884
1885         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1886                 return (err);
1887         ip = ZTOI(zp);
1888
1889         /*
1890          * If this is a xvattr_t, then get a pointer to the structure of
1891          * optional attributes.  If this is NULL, then we have a vattr_t.
1892          */
1893         xoap = xva_getxoptattr(xvap);
1894         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1895                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1896                         if (!dmu_objset_projectquota_enabled(os) ||
1897                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1898                                 zfs_exit(zfsvfs, FTAG);
1899                                 return (SET_ERROR(ENOTSUP));
1900                         }
1901
1902                         projid = xoap->xoa_projid;
1903                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1904                                 zfs_exit(zfsvfs, FTAG);
1905                                 return (SET_ERROR(EINVAL));
1906                         }
1907
1908                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1909                                 projid = ZFS_INVALID_PROJID;
1910                         else
1911                                 need_policy = TRUE;
1912                 }
1913
1914                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1915                     (xoap->xoa_projinherit !=
1916                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1917                     (!dmu_objset_projectquota_enabled(os) ||
1918                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1919                         zfs_exit(zfsvfs, FTAG);
1920                         return (SET_ERROR(ENOTSUP));
1921                 }
1922         }
1923
1924         zilog = zfsvfs->z_log;
1925
1926         /*
1927          * Make sure that if we have ephemeral uid/gid or xvattr specified
1928          * that file system is at proper version level
1929          */
1930
1931         if (zfsvfs->z_use_fuids == B_FALSE &&
1932             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1933             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1934             (mask & ATTR_XVATTR))) {
1935                 zfs_exit(zfsvfs, FTAG);
1936                 return (SET_ERROR(EINVAL));
1937         }
1938
1939         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1940                 zfs_exit(zfsvfs, FTAG);
1941                 return (SET_ERROR(EISDIR));
1942         }
1943
1944         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1945                 zfs_exit(zfsvfs, FTAG);
1946                 return (SET_ERROR(EINVAL));
1947         }
1948
1949         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1950         xva_init(tmpxvattr);
1951
1952         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1953         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1954
1955         /*
1956          * Immutable files can only alter immutable bit and atime
1957          */
1958         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1959             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1960             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1961                 err = SET_ERROR(EPERM);
1962                 goto out3;
1963         }
1964
1965         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1966                 err = SET_ERROR(EPERM);
1967                 goto out3;
1968         }
1969
1970         /*
1971          * Verify timestamps doesn't overflow 32 bits.
1972          * ZFS can handle large timestamps, but 32bit syscalls can't
1973          * handle times greater than 2039.  This check should be removed
1974          * once large timestamps are fully supported.
1975          */
1976         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1977                 if (((mask & ATTR_ATIME) &&
1978                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1979                     ((mask & ATTR_MTIME) &&
1980                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1981                         err = SET_ERROR(EOVERFLOW);
1982                         goto out3;
1983                 }
1984         }
1985
1986 top:
1987         attrzp = NULL;
1988         aclp = NULL;
1989
1990         /* Can this be moved to before the top label? */
1991         if (zfs_is_readonly(zfsvfs)) {
1992                 err = SET_ERROR(EROFS);
1993                 goto out3;
1994         }
1995
1996         /*
1997          * First validate permissions
1998          */
1999
2000         if (mask & ATTR_SIZE) {
2001                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
2002                     mnt_ns);
2003                 if (err)
2004                         goto out3;
2005
2006                 /*
2007                  * XXX - Note, we are not providing any open
2008                  * mode flags here (like FNDELAY), so we may
2009                  * block if there are locks present... this
2010                  * should be addressed in openat().
2011                  */
2012                 /* XXX - would it be OK to generate a log record here? */
2013                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2014                 if (err)
2015                         goto out3;
2016         }
2017
2018         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
2019             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2020             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2021             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2022             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2023             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2024             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2025             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2026                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2027                     skipaclchk, cr, mnt_ns);
2028         }
2029
2030         if (mask & (ATTR_UID|ATTR_GID)) {
2031                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2032                 int     take_owner;
2033                 int     take_group;
2034                 uid_t   uid;
2035                 gid_t   gid;
2036
2037                 /*
2038                  * NOTE: even if a new mode is being set,
2039                  * we may clear S_ISUID/S_ISGID bits.
2040                  */
2041
2042                 if (!(mask & ATTR_MODE))
2043                         vap->va_mode = zp->z_mode;
2044
2045                 /*
2046                  * Take ownership or chgrp to group we are a member of
2047                  */
2048
2049                 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
2050                     vap->va_uid);
2051                 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
2052                     vap->va_gid);
2053                 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
2054                 take_group = (mask & ATTR_GID) &&
2055                     zfs_groupmember(zfsvfs, gid, cr);
2056
2057                 /*
2058                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2059                  * take_group must both be set in order to allow taking
2060                  * ownership.
2061                  *
2062                  * Otherwise, send the check through secpolicy_vnode_setattr()
2063                  *
2064                  */
2065
2066                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2067                     take_owner && take_group) ||
2068                     ((idmask == ATTR_UID) && take_owner) ||
2069                     ((idmask == ATTR_GID) && take_group)) {
2070                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2071                             skipaclchk, cr, mnt_ns) == 0) {
2072                                 /*
2073                                  * Remove setuid/setgid for non-privileged users
2074                                  */
2075                                 (void) secpolicy_setid_clear(vap, cr);
2076                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2077                         } else {
2078                                 need_policy =  TRUE;
2079                         }
2080                 } else {
2081                         need_policy =  TRUE;
2082                 }
2083         }
2084
2085         mutex_enter(&zp->z_lock);
2086         oldva.va_mode = zp->z_mode;
2087         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2088         if (mask & ATTR_XVATTR) {
2089                 /*
2090                  * Update xvattr mask to include only those attributes
2091                  * that are actually changing.
2092                  *
2093                  * the bits will be restored prior to actually setting
2094                  * the attributes so the caller thinks they were set.
2095                  */
2096                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2097                         if (xoap->xoa_appendonly !=
2098                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2099                                 need_policy = TRUE;
2100                         } else {
2101                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2102                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2103                         }
2104                 }
2105
2106                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2107                         if (xoap->xoa_projinherit !=
2108                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2109                                 need_policy = TRUE;
2110                         } else {
2111                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2112                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2113                         }
2114                 }
2115
2116                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2117                         if (xoap->xoa_nounlink !=
2118                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2119                                 need_policy = TRUE;
2120                         } else {
2121                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2122                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2123                         }
2124                 }
2125
2126                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2127                         if (xoap->xoa_immutable !=
2128                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2129                                 need_policy = TRUE;
2130                         } else {
2131                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2132                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2133                         }
2134                 }
2135
2136                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2137                         if (xoap->xoa_nodump !=
2138                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2139                                 need_policy = TRUE;
2140                         } else {
2141                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2142                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2143                         }
2144                 }
2145
2146                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2147                         if (xoap->xoa_av_modified !=
2148                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2149                                 need_policy = TRUE;
2150                         } else {
2151                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2152                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2153                         }
2154                 }
2155
2156                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2157                         if ((!S_ISREG(ip->i_mode) &&
2158                             xoap->xoa_av_quarantined) ||
2159                             xoap->xoa_av_quarantined !=
2160                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2161                                 need_policy = TRUE;
2162                         } else {
2163                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2164                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2165                         }
2166                 }
2167
2168                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2169                         mutex_exit(&zp->z_lock);
2170                         err = SET_ERROR(EPERM);
2171                         goto out3;
2172                 }
2173
2174                 if (need_policy == FALSE &&
2175                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2176                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2177                         need_policy = TRUE;
2178                 }
2179         }
2180
2181         mutex_exit(&zp->z_lock);
2182
2183         if (mask & ATTR_MODE) {
2184                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2185                     mnt_ns) == 0) {
2186                         err = secpolicy_setid_setsticky_clear(ip, vap,
2187                             &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
2188                         if (err)
2189                                 goto out3;
2190                         trim_mask |= ATTR_MODE;
2191                 } else {
2192                         need_policy = TRUE;
2193                 }
2194         }
2195
2196         if (need_policy) {
2197                 /*
2198                  * If trim_mask is set then take ownership
2199                  * has been granted or write_acl is present and user
2200                  * has the ability to modify mode.  In that case remove
2201                  * UID|GID and or MODE from mask so that
2202                  * secpolicy_vnode_setattr() doesn't revoke it.
2203                  */
2204
2205                 if (trim_mask) {
2206                         saved_mask = vap->va_mask;
2207                         vap->va_mask &= ~trim_mask;
2208                 }
2209                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2210                     zfs_zaccess_unix, zp);
2211                 if (err)
2212                         goto out3;
2213
2214                 if (trim_mask)
2215                         vap->va_mask |= saved_mask;
2216         }
2217
2218         /*
2219          * secpolicy_vnode_setattr, or take ownership may have
2220          * changed va_mask
2221          */
2222         mask = vap->va_mask;
2223
2224         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2225                 handle_eadir = B_TRUE;
2226                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2227                     &xattr_obj, sizeof (xattr_obj));
2228
2229                 if (err == 0 && xattr_obj) {
2230                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2231                         if (err)
2232                                 goto out2;
2233                 }
2234                 if (mask & ATTR_UID) {
2235                         new_kuid = zfs_fuid_create(zfsvfs,
2236                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2237                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2238                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2239                             new_kuid)) {
2240                                 if (attrzp)
2241                                         zrele(attrzp);
2242                                 err = SET_ERROR(EDQUOT);
2243                                 goto out2;
2244                         }
2245                 }
2246
2247                 if (mask & ATTR_GID) {
2248                         new_kgid = zfs_fuid_create(zfsvfs,
2249                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2250                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2251                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2252                             new_kgid)) {
2253                                 if (attrzp)
2254                                         zrele(attrzp);
2255                                 err = SET_ERROR(EDQUOT);
2256                                 goto out2;
2257                         }
2258                 }
2259
2260                 if (projid != ZFS_INVALID_PROJID &&
2261                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2262                         if (attrzp)
2263                                 zrele(attrzp);
2264                         err = EDQUOT;
2265                         goto out2;
2266                 }
2267         }
2268         tx = dmu_tx_create(os);
2269
2270         if (mask & ATTR_MODE) {
2271                 uint64_t pmode = zp->z_mode;
2272                 uint64_t acl_obj;
2273                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2274
2275                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2276                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2277                         err = EPERM;
2278                         goto out;
2279                 }
2280
2281                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2282                         goto out;
2283
2284                 mutex_enter(&zp->z_lock);
2285                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2286                         /*
2287                          * Are we upgrading ACL from old V0 format
2288                          * to V1 format?
2289                          */
2290                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2291                             zfs_znode_acl_version(zp) ==
2292                             ZFS_ACL_VERSION_INITIAL) {
2293                                 dmu_tx_hold_free(tx, acl_obj, 0,
2294                                     DMU_OBJECT_END);
2295                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2296                                     0, aclp->z_acl_bytes);
2297                         } else {
2298                                 dmu_tx_hold_write(tx, acl_obj, 0,
2299                                     aclp->z_acl_bytes);
2300                         }
2301                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2302                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2303                             0, aclp->z_acl_bytes);
2304                 }
2305                 mutex_exit(&zp->z_lock);
2306                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2307         } else {
2308                 if (((mask & ATTR_XVATTR) &&
2309                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2310                     (projid != ZFS_INVALID_PROJID &&
2311                     !(zp->z_pflags & ZFS_PROJID)))
2312                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2313                 else
2314                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2315         }
2316
2317         if (attrzp) {
2318                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2319         }
2320
2321         fuid_dirtied = zfsvfs->z_fuid_dirty;
2322         if (fuid_dirtied)
2323                 zfs_fuid_txhold(zfsvfs, tx);
2324
2325         zfs_sa_upgrade_txholds(tx, zp);
2326
2327         err = dmu_tx_assign(tx, TXG_WAIT);
2328         if (err)
2329                 goto out;
2330
2331         count = 0;
2332         /*
2333          * Set each attribute requested.
2334          * We group settings according to the locks they need to acquire.
2335          *
2336          * Note: you cannot set ctime directly, although it will be
2337          * updated as a side-effect of calling this function.
2338          */
2339
2340         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2341                 /*
2342                  * For the existed object that is upgraded from old system,
2343                  * its on-disk layout has no slot for the project ID attribute.
2344                  * But quota accounting logic needs to access related slots by
2345                  * offset directly. So we need to adjust old objects' layout
2346                  * to make the project ID to some unified and fixed offset.
2347                  */
2348                 if (attrzp)
2349                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2350                 if (err == 0)
2351                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2352
2353                 if (unlikely(err == EEXIST))
2354                         err = 0;
2355                 else if (err != 0)
2356                         goto out;
2357                 else
2358                         projid = ZFS_INVALID_PROJID;
2359         }
2360
2361         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2362                 mutex_enter(&zp->z_acl_lock);
2363         mutex_enter(&zp->z_lock);
2364
2365         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2366             &zp->z_pflags, sizeof (zp->z_pflags));
2367
2368         if (attrzp) {
2369                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2370                         mutex_enter(&attrzp->z_acl_lock);
2371                 mutex_enter(&attrzp->z_lock);
2372                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2373                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2374                     sizeof (attrzp->z_pflags));
2375                 if (projid != ZFS_INVALID_PROJID) {
2376                         attrzp->z_projid = projid;
2377                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2378                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2379                             sizeof (attrzp->z_projid));
2380                 }
2381         }
2382
2383         if (mask & (ATTR_UID|ATTR_GID)) {
2384
2385                 if (mask & ATTR_UID) {
2386                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2387                         new_uid = zfs_uid_read(ZTOI(zp));
2388                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2389                             &new_uid, sizeof (new_uid));
2390                         if (attrzp) {
2391                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2392                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2393                                     sizeof (new_uid));
2394                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2395                         }
2396                 }
2397
2398                 if (mask & ATTR_GID) {
2399                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2400                         new_gid = zfs_gid_read(ZTOI(zp));
2401                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2402                             NULL, &new_gid, sizeof (new_gid));
2403                         if (attrzp) {
2404                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2405                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2406                                     sizeof (new_gid));
2407                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2408                         }
2409                 }
2410                 if (!(mask & ATTR_MODE)) {
2411                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2412                             NULL, &new_mode, sizeof (new_mode));
2413                         new_mode = zp->z_mode;
2414                 }
2415                 err = zfs_acl_chown_setattr(zp);
2416                 ASSERT(err == 0);
2417                 if (attrzp) {
2418                         err = zfs_acl_chown_setattr(attrzp);
2419                         ASSERT(err == 0);
2420                 }
2421         }
2422
2423         if (mask & ATTR_MODE) {
2424                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2425                     &new_mode, sizeof (new_mode));
2426                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2427                 ASSERT3P(aclp, !=, NULL);
2428                 err = zfs_aclset_common(zp, aclp, cr, tx);
2429                 ASSERT0(err);
2430                 if (zp->z_acl_cached)
2431                         zfs_acl_free(zp->z_acl_cached);
2432                 zp->z_acl_cached = aclp;
2433                 aclp = NULL;
2434         }
2435
2436         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2437                 zp->z_atime_dirty = B_FALSE;
2438                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
2439                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2440                     &atime, sizeof (atime));
2441         }
2442
2443         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2444                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2445                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
2446                     vap->va_mtime, ZTOI(zp));
2447
2448                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2449                     mtime, sizeof (mtime));
2450         }
2451
2452         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2453                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2454                 zpl_inode_set_ctime_to_ts(ZTOI(zp),
2455                     zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
2456                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2457                     ctime, sizeof (ctime));
2458         }
2459
2460         if (projid != ZFS_INVALID_PROJID) {
2461                 zp->z_projid = projid;
2462                 SA_ADD_BULK_ATTR(bulk, count,
2463                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2464                     sizeof (zp->z_projid));
2465         }
2466
2467         if (attrzp && mask) {
2468                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2469                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2470                     sizeof (ctime));
2471         }
2472
2473         /*
2474          * Do this after setting timestamps to prevent timestamp
2475          * update from toggling bit
2476          */
2477
2478         if (xoap && (mask & ATTR_XVATTR)) {
2479
2480                 /*
2481                  * restore trimmed off masks
2482                  * so that return masks can be set for caller.
2483                  */
2484
2485                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2486                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2487                 }
2488                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2489                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2490                 }
2491                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2492                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2493                 }
2494                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2495                         XVA_SET_REQ(xvap, XAT_NODUMP);
2496                 }
2497                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2498                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2499                 }
2500                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2501                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2502                 }
2503                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2504                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2505                 }
2506
2507                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2508                         ASSERT(S_ISREG(ip->i_mode));
2509
2510                 zfs_xvattr_set(zp, xvap, tx);
2511         }
2512
2513         if (fuid_dirtied)
2514                 zfs_fuid_sync(zfsvfs, tx);
2515
2516         if (mask != 0)
2517                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2518
2519         mutex_exit(&zp->z_lock);
2520         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2521                 mutex_exit(&zp->z_acl_lock);
2522
2523         if (attrzp) {
2524                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2525                         mutex_exit(&attrzp->z_acl_lock);
2526                 mutex_exit(&attrzp->z_lock);
2527         }
2528 out:
2529         if (err == 0 && xattr_count > 0) {
2530                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2531                     xattr_count, tx);
2532                 ASSERT(err2 == 0);
2533         }
2534
2535         if (aclp)
2536                 zfs_acl_free(aclp);
2537
2538         if (fuidp) {
2539                 zfs_fuid_info_free(fuidp);
2540                 fuidp = NULL;
2541         }
2542
2543         if (err) {
2544                 dmu_tx_abort(tx);
2545                 if (attrzp)
2546                         zrele(attrzp);
2547                 if (err == ERESTART)
2548                         goto top;
2549         } else {
2550                 if (count > 0)
2551                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2552                 dmu_tx_commit(tx);
2553                 if (attrzp) {
2554                         if (err2 == 0 && handle_eadir)
2555                                 err = zfs_setattr_dir(attrzp);
2556                         zrele(attrzp);
2557                 }
2558                 zfs_znode_update_vfs(zp);
2559         }
2560
2561 out2:
2562         if (os->os_sync == ZFS_SYNC_ALWAYS)
2563                 zil_commit(zilog, 0);
2564
2565 out3:
2566         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2567         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2568         kmem_free(tmpxvattr, sizeof (xvattr_t));
2569         zfs_exit(zfsvfs, FTAG);
2570         return (err);
2571 }
2572
2573 typedef struct zfs_zlock {
2574         krwlock_t       *zl_rwlock;     /* lock we acquired */
2575         znode_t         *zl_znode;      /* znode we held */
2576         struct zfs_zlock *zl_next;      /* next in list */
2577 } zfs_zlock_t;
2578
2579 /*
2580  * Drop locks and release vnodes that were held by zfs_rename_lock().
2581  */
2582 static void
2583 zfs_rename_unlock(zfs_zlock_t **zlpp)
2584 {
2585         zfs_zlock_t *zl;
2586
2587         while ((zl = *zlpp) != NULL) {
2588                 if (zl->zl_znode != NULL)
2589                         zfs_zrele_async(zl->zl_znode);
2590                 rw_exit(zl->zl_rwlock);
2591                 *zlpp = zl->zl_next;
2592                 kmem_free(zl, sizeof (*zl));
2593         }
2594 }
2595
2596 /*
2597  * Search back through the directory tree, using the ".." entries.
2598  * Lock each directory in the chain to prevent concurrent renames.
2599  * Fail any attempt to move a directory into one of its own descendants.
2600  * XXX - z_parent_lock can overlap with map or grow locks
2601  */
2602 static int
2603 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2604 {
2605         zfs_zlock_t     *zl;
2606         znode_t         *zp = tdzp;
2607         uint64_t        rootid = ZTOZSB(zp)->z_root;
2608         uint64_t        oidp = zp->z_id;
2609         krwlock_t       *rwlp = &szp->z_parent_lock;
2610         krw_t           rw = RW_WRITER;
2611
2612         /*
2613          * First pass write-locks szp and compares to zp->z_id.
2614          * Later passes read-lock zp and compare to zp->z_parent.
2615          */
2616         do {
2617                 if (!rw_tryenter(rwlp, rw)) {
2618                         /*
2619                          * Another thread is renaming in this path.
2620                          * Note that if we are a WRITER, we don't have any
2621                          * parent_locks held yet.
2622                          */
2623                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2624                                 /*
2625                                  * Drop our locks and restart
2626                                  */
2627                                 zfs_rename_unlock(&zl);
2628                                 *zlpp = NULL;
2629                                 zp = tdzp;
2630                                 oidp = zp->z_id;
2631                                 rwlp = &szp->z_parent_lock;
2632                                 rw = RW_WRITER;
2633                                 continue;
2634                         } else {
2635                                 /*
2636                                  * Wait for other thread to drop its locks
2637                                  */
2638                                 rw_enter(rwlp, rw);
2639                         }
2640                 }
2641
2642                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2643                 zl->zl_rwlock = rwlp;
2644                 zl->zl_znode = NULL;
2645                 zl->zl_next = *zlpp;
2646                 *zlpp = zl;
2647
2648                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2649                         return (SET_ERROR(EINVAL));
2650
2651                 if (oidp == rootid)             /* We've hit the top */
2652                         return (0);
2653
2654                 if (rw == RW_READER) {          /* i.e. not the first pass */
2655                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2656                         if (error)
2657                                 return (error);
2658                         zl->zl_znode = zp;
2659                 }
2660                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2661                     &oidp, sizeof (oidp));
2662                 rwlp = &zp->z_parent_lock;
2663                 rw = RW_READER;
2664
2665         } while (zp->z_id != sdzp->z_id);
2666
2667         return (0);
2668 }
2669
2670 /*
2671  * Move an entry from the provided source directory to the target
2672  * directory.  Change the entry name as indicated.
2673  *
2674  *      IN:     sdzp    - Source directory containing the "old entry".
2675  *              snm     - Old entry name.
2676  *              tdzp    - Target directory to contain the "new entry".
2677  *              tnm     - New entry name.
2678  *              cr      - credentials of caller.
2679  *              flags   - case flags
2680  *              rflags  - RENAME_* flags
2681  *              wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
2682  *              mnt_ns  - user namespace of the mount
2683  *
2684  *      RETURN: 0 on success, error code on failure.
2685  *
2686  * Timestamps:
2687  *      sdzp,tdzp - ctime|mtime updated
2688  */
2689 int
2690 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2691     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
2692 {
2693         znode_t         *szp, *tzp;
2694         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
2695         zilog_t         *zilog;
2696         zfs_dirlock_t   *sdl, *tdl;
2697         dmu_tx_t        *tx;
2698         zfs_zlock_t     *zl;
2699         int             cmp, serr, terr;
2700         int             error = 0;
2701         int             zflg = 0;
2702         boolean_t       waited = B_FALSE;
2703         /* Needed for whiteout inode creation. */
2704         boolean_t       fuid_dirtied;
2705         zfs_acl_ids_t   acl_ids;
2706         boolean_t       have_acl = B_FALSE;
2707         znode_t         *wzp = NULL;
2708
2709
2710         if (snm == NULL || tnm == NULL)
2711                 return (SET_ERROR(EINVAL));
2712
2713         if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
2714                 return (SET_ERROR(EINVAL));
2715
2716         /* Already checked by Linux VFS, but just to make sure. */
2717         if (rflags & RENAME_EXCHANGE &&
2718             (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
2719                 return (SET_ERROR(EINVAL));
2720
2721         /*
2722          * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
2723          * right kind of vattr_t for the whiteout file. These are set
2724          * internally by ZFS so should never be incorrect.
2725          */
2726         VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
2727         VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
2728         VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
2729
2730         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2731                 return (error);
2732         zilog = zfsvfs->z_log;
2733
2734         if ((error = zfs_verify_zp(tdzp)) != 0) {
2735                 zfs_exit(zfsvfs, FTAG);
2736                 return (error);
2737         }
2738
2739         /*
2740          * We check i_sb because snapshots and the ctldir must have different
2741          * super blocks.
2742          */
2743         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2744             zfsctl_is_node(ZTOI(tdzp))) {
2745                 zfs_exit(zfsvfs, FTAG);
2746                 return (SET_ERROR(EXDEV));
2747         }
2748
2749         if (zfsvfs->z_utf8 && u8_validate(tnm,
2750             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2751                 zfs_exit(zfsvfs, FTAG);
2752                 return (SET_ERROR(EILSEQ));
2753         }
2754
2755         if (flags & FIGNORECASE)
2756                 zflg |= ZCILOOK;
2757
2758 top:
2759         szp = NULL;
2760         tzp = NULL;
2761         zl = NULL;
2762
2763         /*
2764          * This is to prevent the creation of links into attribute space
2765          * by renaming a linked file into/outof an attribute directory.
2766          * See the comment in zfs_link() for why this is considered bad.
2767          */
2768         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2769                 zfs_exit(zfsvfs, FTAG);
2770                 return (SET_ERROR(EINVAL));
2771         }
2772
2773         /*
2774          * Lock source and target directory entries.  To prevent deadlock,
2775          * a lock ordering must be defined.  We lock the directory with
2776          * the smallest object id first, or if it's a tie, the one with
2777          * the lexically first name.
2778          */
2779         if (sdzp->z_id < tdzp->z_id) {
2780                 cmp = -1;
2781         } else if (sdzp->z_id > tdzp->z_id) {
2782                 cmp = 1;
2783         } else {
2784                 /*
2785                  * First compare the two name arguments without
2786                  * considering any case folding.
2787                  */
2788                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2789
2790                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2791                 ASSERT(error == 0 || !zfsvfs->z_utf8);
2792                 if (cmp == 0) {
2793                         /*
2794                          * POSIX: "If the old argument and the new argument
2795                          * both refer to links to the same existing file,
2796                          * the rename() function shall return successfully
2797                          * and perform no other action."
2798                          */
2799                         zfs_exit(zfsvfs, FTAG);
2800                         return (0);
2801                 }
2802                 /*
2803                  * If the file system is case-folding, then we may
2804                  * have some more checking to do.  A case-folding file
2805                  * system is either supporting mixed case sensitivity
2806                  * access or is completely case-insensitive.  Note
2807                  * that the file system is always case preserving.
2808                  *
2809                  * In mixed sensitivity mode case sensitive behavior
2810                  * is the default.  FIGNORECASE must be used to
2811                  * explicitly request case insensitive behavior.
2812                  *
2813                  * If the source and target names provided differ only
2814                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2815                  * we will treat this as a special case in the
2816                  * case-insensitive mode: as long as the source name
2817                  * is an exact match, we will allow this to proceed as
2818                  * a name-change request.
2819                  */
2820                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2821                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
2822                     flags & FIGNORECASE)) &&
2823                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2824                     &error) == 0) {
2825                         /*
2826                          * case preserving rename request, require exact
2827                          * name matches
2828                          */
2829                         zflg |= ZCIEXACT;
2830                         zflg &= ~ZCILOOK;
2831                 }
2832         }
2833
2834         /*
2835          * If the source and destination directories are the same, we should
2836          * grab the z_name_lock of that directory only once.
2837          */
2838         if (sdzp == tdzp) {
2839                 zflg |= ZHAVELOCK;
2840                 rw_enter(&sdzp->z_name_lock, RW_READER);
2841         }
2842
2843         if (cmp < 0) {
2844                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2845                     ZEXISTS | zflg, NULL, NULL);
2846                 terr = zfs_dirent_lock(&tdl,
2847                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2848         } else {
2849                 terr = zfs_dirent_lock(&tdl,
2850                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2851                 serr = zfs_dirent_lock(&sdl,
2852                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2853                     NULL, NULL);
2854         }
2855
2856         if (serr) {
2857                 /*
2858                  * Source entry invalid or not there.
2859                  */
2860                 if (!terr) {
2861                         zfs_dirent_unlock(tdl);
2862                         if (tzp)
2863                                 zrele(tzp);
2864                 }
2865
2866                 if (sdzp == tdzp)
2867                         rw_exit(&sdzp->z_name_lock);
2868
2869                 if (strcmp(snm, "..") == 0)
2870                         serr = EINVAL;
2871                 zfs_exit(zfsvfs, FTAG);
2872                 return (serr);
2873         }
2874         if (terr) {
2875                 zfs_dirent_unlock(sdl);
2876                 zrele(szp);
2877
2878                 if (sdzp == tdzp)
2879                         rw_exit(&sdzp->z_name_lock);
2880
2881                 if (strcmp(tnm, "..") == 0)
2882                         terr = EINVAL;
2883                 zfs_exit(zfsvfs, FTAG);
2884                 return (terr);
2885         }
2886
2887         /*
2888          * If we are using project inheritance, means if the directory has
2889          * ZFS_PROJINHERIT set, then its descendant directories will inherit
2890          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2891          * such case, we only allow renames into our tree when the project
2892          * IDs are the same.
2893          */
2894         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2895             tdzp->z_projid != szp->z_projid) {
2896                 error = SET_ERROR(EXDEV);
2897                 goto out;
2898         }
2899
2900         /*
2901          * Must have write access at the source to remove the old entry
2902          * and write access at the target to create the new entry.
2903          * Note that if target and source are the same, this can be
2904          * done in a single check.
2905          */
2906         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
2907                 goto out;
2908
2909         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2910                 /*
2911                  * Check to make sure rename is valid.
2912                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2913                  */
2914                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2915                         goto out;
2916         }
2917
2918         /*
2919          * Does target exist?
2920          */
2921         if (tzp) {
2922                 if (rflags & RENAME_NOREPLACE) {
2923                         error = SET_ERROR(EEXIST);
2924                         goto out;
2925                 }
2926                 /*
2927                  * Source and target must be the same type (unless exchanging).
2928                  */
2929                 if (!(rflags & RENAME_EXCHANGE)) {
2930                         boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
2931                         boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
2932
2933                         if (s_is_dir != t_is_dir) {
2934                                 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
2935                                 goto out;
2936                         }
2937                 }
2938                 /*
2939                  * POSIX dictates that when the source and target
2940                  * entries refer to the same file object, rename
2941                  * must do nothing and exit without error.
2942                  */
2943                 if (szp->z_id == tzp->z_id) {
2944                         error = 0;
2945                         goto out;
2946                 }
2947         } else if (rflags & RENAME_EXCHANGE) {
2948                 /* Target must exist for RENAME_EXCHANGE. */
2949                 error = SET_ERROR(ENOENT);
2950                 goto out;
2951         }
2952
2953         /* Set up inode creation for RENAME_WHITEOUT. */
2954         if (rflags & RENAME_WHITEOUT) {
2955                 /*
2956                  * Whiteout files are not regular files or directories, so to
2957                  * match zfs_create() we do not inherit the project id.
2958                  */
2959                 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
2960
2961                 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
2962                 if (error)
2963                         goto out;
2964
2965                 if (!have_acl) {
2966                         error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
2967                             &acl_ids, mnt_ns);
2968                         if (error)
2969                                 goto out;
2970                         have_acl = B_TRUE;
2971                 }
2972
2973                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
2974                         error = SET_ERROR(EDQUOT);
2975                         goto out;
2976                 }
2977         }
2978
2979         tx = dmu_tx_create(zfsvfs->z_os);
2980         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2981         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2982         dmu_tx_hold_zap(tx, sdzp->z_id,
2983             (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
2984         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2985         if (sdzp != tdzp) {
2986                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2987                 zfs_sa_upgrade_txholds(tx, tdzp);
2988         }
2989         if (tzp) {
2990                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2991                 zfs_sa_upgrade_txholds(tx, tzp);
2992         }
2993         if (rflags & RENAME_WHITEOUT) {
2994                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2995                     ZFS_SA_BASE_ATTR_SIZE);
2996
2997                 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
2998                 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2999                 if (!zfsvfs->z_use_sa &&
3000                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3001                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3002                             0, acl_ids.z_aclp->z_acl_bytes);
3003                 }
3004         }
3005         fuid_dirtied = zfsvfs->z_fuid_dirty;
3006         if (fuid_dirtied)
3007                 zfs_fuid_txhold(zfsvfs, tx);
3008         zfs_sa_upgrade_txholds(tx, szp);
3009         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3010         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3011         if (error) {
3012                 if (zl != NULL)
3013                         zfs_rename_unlock(&zl);
3014                 zfs_dirent_unlock(sdl);
3015                 zfs_dirent_unlock(tdl);
3016
3017                 if (sdzp == tdzp)
3018                         rw_exit(&sdzp->z_name_lock);
3019
3020                 if (error == ERESTART) {
3021                         waited = B_TRUE;
3022                         dmu_tx_wait(tx);
3023                         dmu_tx_abort(tx);
3024                         zrele(szp);
3025                         if (tzp)
3026                                 zrele(tzp);
3027                         goto top;
3028                 }
3029                 dmu_tx_abort(tx);
3030                 zrele(szp);
3031                 if (tzp)
3032                         zrele(tzp);
3033                 zfs_exit(zfsvfs, FTAG);
3034                 return (error);
3035         }
3036
3037         /*
3038          * Unlink the source.
3039          */
3040         szp->z_pflags |= ZFS_AV_MODIFIED;
3041         if (tdzp->z_pflags & ZFS_PROJINHERIT)
3042                 szp->z_pflags |= ZFS_PROJINHERIT;
3043
3044         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3045             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3046         VERIFY0(error);
3047
3048         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3049         if (error)
3050                 goto commit;
3051
3052         /*
3053          * Unlink the target.
3054          */
3055         if (tzp) {
3056                 int tzflg = zflg;
3057
3058                 if (rflags & RENAME_EXCHANGE) {
3059                         /* This inode will be re-linked soon. */
3060                         tzflg |= ZRENAMING;
3061
3062                         tzp->z_pflags |= ZFS_AV_MODIFIED;
3063                         if (sdzp->z_pflags & ZFS_PROJINHERIT)
3064                                 tzp->z_pflags |= ZFS_PROJINHERIT;
3065
3066                         error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3067                             (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
3068                         ASSERT0(error);
3069                 }
3070                 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
3071                 if (error)
3072                         goto commit_link_szp;
3073         }
3074
3075         /*
3076          * Create the new target links:
3077          *   * We always link the target.
3078          *   * RENAME_EXCHANGE: Link the old target to the source.
3079          *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
3080          */
3081         error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3082         if (error) {
3083                 /*
3084                  * If we have removed the existing target, a subsequent call to
3085                  * zfs_link_create() to add back the same entry, but with a new
3086                  * dnode (szp), should not fail.
3087                  */
3088                 ASSERT3P(tzp, ==, NULL);
3089                 goto commit_link_tzp;
3090         }
3091
3092         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3093         case RENAME_EXCHANGE:
3094                 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
3095                 /*
3096                  * The same argument as zfs_link_create() failing for
3097                  * szp applies here, since the source directory must
3098                  * have had an entry we are replacing.
3099                  */
3100                 ASSERT0(error);
3101                 if (error)
3102                         goto commit_unlink_td_szp;
3103                 break;
3104         case RENAME_WHITEOUT:
3105                 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
3106                 error = zfs_link_create(sdl, wzp, tx, ZNEW);
3107                 if (error) {
3108                         zfs_znode_delete(wzp, tx);
3109                         remove_inode_hash(ZTOI(wzp));
3110                         goto commit_unlink_td_szp;
3111                 }
3112                 break;
3113         }
3114
3115         if (fuid_dirtied)
3116                 zfs_fuid_sync(zfsvfs, tx);
3117
3118         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
3119         case RENAME_EXCHANGE:
3120                 zfs_log_rename_exchange(zilog, tx,
3121                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3122                     tdzp, tdl->dl_name, szp);
3123                 break;
3124         case RENAME_WHITEOUT:
3125                 zfs_log_rename_whiteout(zilog, tx,
3126                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
3127                     tdzp, tdl->dl_name, szp, wzp);
3128                 break;
3129         default:
3130                 ASSERT0(rflags & ~RENAME_NOREPLACE);
3131                 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
3132                     sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3133                 break;
3134         }
3135
3136 commit:
3137         dmu_tx_commit(tx);
3138 out:
3139         if (have_acl)
3140                 zfs_acl_ids_free(&acl_ids);
3141
3142         zfs_znode_update_vfs(sdzp);
3143         if (sdzp == tdzp)
3144                 rw_exit(&sdzp->z_name_lock);
3145
3146         if (sdzp != tdzp)
3147                 zfs_znode_update_vfs(tdzp);
3148
3149         zfs_znode_update_vfs(szp);
3150         zrele(szp);
3151         if (wzp) {
3152                 zfs_znode_update_vfs(wzp);
3153                 zrele(wzp);
3154         }
3155         if (tzp) {
3156                 zfs_znode_update_vfs(tzp);
3157                 zrele(tzp);
3158         }
3159
3160         if (zl != NULL)
3161                 zfs_rename_unlock(&zl);
3162
3163         zfs_dirent_unlock(sdl);
3164         zfs_dirent_unlock(tdl);
3165
3166         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3167                 zil_commit(zilog, 0);
3168
3169         zfs_exit(zfsvfs, FTAG);
3170         return (error);
3171
3172         /*
3173          * Clean-up path for broken link state.
3174          *
3175          * At this point we are in a (very) bad state, so we need to do our
3176          * best to correct the state. In particular, all of the nlinks are
3177          * wrong because we were destroying and creating links with ZRENAMING.
3178          *
3179          * In some form, all of these operations have to resolve the state:
3180          *
3181          *  * link_destroy() *must* succeed. Fortunately, this is very likely
3182          *    since we only just created it.
3183          *
3184          *  * link_create()s are allowed to fail (though they shouldn't because
3185          *    we only just unlinked them and are putting the entries back
3186          *    during clean-up). But if they fail, we can just forcefully drop
3187          *    the nlink value to (at the very least) avoid broken nlink values
3188          *    -- though in the case of non-empty directories we will have to
3189          *    panic (otherwise we'd have a leaked directory with a broken ..).
3190          */
3191 commit_unlink_td_szp:
3192         VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
3193 commit_link_tzp:
3194         if (tzp) {
3195                 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
3196                         VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
3197         }
3198 commit_link_szp:
3199         if (zfs_link_create(sdl, szp, tx, ZRENAMING))
3200                 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
3201         goto commit;
3202 }
3203
3204 /*
3205  * Insert the indicated symbolic reference entry into the directory.
3206  *
3207  *      IN:     dzp     - Directory to contain new symbolic link.
3208  *              name    - Name of directory entry in dip.
3209  *              vap     - Attributes of new entry.
3210  *              link    - Name for new symlink entry.
3211  *              cr      - credentials of caller.
3212  *              flags   - case flags
3213  *              mnt_ns  - user namespace of the mount
3214  *
3215  *      OUT:    zpp     - Znode for new symbolic link.
3216  *
3217  *      RETURN: 0 on success, error code on failure.
3218  *
3219  * Timestamps:
3220  *      dip - ctime|mtime updated
3221  */
3222 int
3223 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3224     znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3225 {
3226         znode_t         *zp;
3227         zfs_dirlock_t   *dl;
3228         dmu_tx_t        *tx;
3229         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3230         zilog_t         *zilog;
3231         uint64_t        len = strlen(link);
3232         int             error;
3233         int             zflg = ZNEW;
3234         zfs_acl_ids_t   acl_ids;
3235         boolean_t       fuid_dirtied;
3236         uint64_t        txtype = TX_SYMLINK;
3237         boolean_t       waited = B_FALSE;
3238
3239         ASSERT(S_ISLNK(vap->va_mode));
3240
3241         if (name == NULL)
3242                 return (SET_ERROR(EINVAL));
3243
3244         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3245                 return (error);
3246         zilog = zfsvfs->z_log;
3247
3248         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3249             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3250                 zfs_exit(zfsvfs, FTAG);
3251                 return (SET_ERROR(EILSEQ));
3252         }
3253         if (flags & FIGNORECASE)
3254                 zflg |= ZCILOOK;
3255
3256         if (len > MAXPATHLEN) {
3257                 zfs_exit(zfsvfs, FTAG);
3258                 return (SET_ERROR(ENAMETOOLONG));
3259         }
3260
3261         if ((error = zfs_acl_ids_create(dzp, 0,
3262             vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
3263                 zfs_exit(zfsvfs, FTAG);
3264                 return (error);
3265         }
3266 top:
3267         *zpp = NULL;
3268
3269         /*
3270          * Attempt to lock directory; fail if entry already exists.
3271          */
3272         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3273         if (error) {
3274                 zfs_acl_ids_free(&acl_ids);
3275                 zfs_exit(zfsvfs, FTAG);
3276                 return (error);
3277         }
3278
3279         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3280                 zfs_acl_ids_free(&acl_ids);
3281                 zfs_dirent_unlock(dl);
3282                 zfs_exit(zfsvfs, FTAG);
3283                 return (error);
3284         }
3285
3286         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3287                 zfs_acl_ids_free(&acl_ids);
3288                 zfs_dirent_unlock(dl);
3289                 zfs_exit(zfsvfs, FTAG);
3290                 return (SET_ERROR(EDQUOT));
3291         }
3292         tx = dmu_tx_create(zfsvfs->z_os);
3293         fuid_dirtied = zfsvfs->z_fuid_dirty;
3294         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3295         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3296         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3297             ZFS_SA_BASE_ATTR_SIZE + len);
3298         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3299         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3300                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3301                     acl_ids.z_aclp->z_acl_bytes);
3302         }
3303         if (fuid_dirtied)
3304                 zfs_fuid_txhold(zfsvfs, tx);
3305         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3306         if (error) {
3307                 zfs_dirent_unlock(dl);
3308                 if (error == ERESTART) {
3309                         waited = B_TRUE;
3310                         dmu_tx_wait(tx);
3311                         dmu_tx_abort(tx);
3312                         goto top;
3313                 }
3314                 zfs_acl_ids_free(&acl_ids);
3315                 dmu_tx_abort(tx);
3316                 zfs_exit(zfsvfs, FTAG);
3317                 return (error);
3318         }
3319
3320         /*
3321          * Create a new object for the symlink.
3322          * for version 4 ZPL datasets the symlink will be an SA attribute
3323          */
3324         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3325
3326         if (fuid_dirtied)
3327                 zfs_fuid_sync(zfsvfs, tx);
3328
3329         mutex_enter(&zp->z_lock);
3330         if (zp->z_is_sa)
3331                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3332                     link, len, tx);
3333         else
3334                 zfs_sa_symlink(zp, link, len, tx);
3335         mutex_exit(&zp->z_lock);
3336
3337         zp->z_size = len;
3338         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3339             &zp->z_size, sizeof (zp->z_size), tx);
3340         /*
3341          * Insert the new object into the directory.
3342          */
3343         error = zfs_link_create(dl, zp, tx, ZNEW);
3344         if (error != 0) {
3345                 zfs_znode_delete(zp, tx);
3346                 remove_inode_hash(ZTOI(zp));
3347         } else {
3348                 if (flags & FIGNORECASE)
3349                         txtype |= TX_CI;
3350                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3351
3352                 zfs_znode_update_vfs(dzp);
3353                 zfs_znode_update_vfs(zp);
3354         }
3355
3356         zfs_acl_ids_free(&acl_ids);
3357
3358         dmu_tx_commit(tx);
3359
3360         zfs_dirent_unlock(dl);
3361
3362         if (error == 0) {
3363                 *zpp = zp;
3364
3365                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3366                         zil_commit(zilog, 0);
3367         } else {
3368                 zrele(zp);
3369         }
3370
3371         zfs_exit(zfsvfs, FTAG);
3372         return (error);
3373 }
3374
3375 /*
3376  * Return, in the buffer contained in the provided uio structure,
3377  * the symbolic path referred to by ip.
3378  *
3379  *      IN:     ip      - inode of symbolic link
3380  *              uio     - structure to contain the link path.
3381  *              cr      - credentials of caller.
3382  *
3383  *      RETURN: 0 if success
3384  *              error code if failure
3385  *
3386  * Timestamps:
3387  *      ip - atime updated
3388  */
3389 int
3390 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3391 {
3392         (void) cr;
3393         znode_t         *zp = ITOZ(ip);
3394         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3395         int             error;
3396
3397         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3398                 return (error);
3399
3400         mutex_enter(&zp->z_lock);
3401         if (zp->z_is_sa)
3402                 error = sa_lookup_uio(zp->z_sa_hdl,
3403                     SA_ZPL_SYMLINK(zfsvfs), uio);
3404         else
3405                 error = zfs_sa_readlink(zp, uio);
3406         mutex_exit(&zp->z_lock);
3407
3408         zfs_exit(zfsvfs, FTAG);
3409         return (error);
3410 }
3411
3412 /*
3413  * Insert a new entry into directory tdzp referencing szp.
3414  *
3415  *      IN:     tdzp    - Directory to contain new entry.
3416  *              szp     - znode of new entry.
3417  *              name    - name of new entry.
3418  *              cr      - credentials of caller.
3419  *              flags   - case flags.
3420  *
3421  *      RETURN: 0 if success
3422  *              error code if failure
3423  *
3424  * Timestamps:
3425  *      tdzp - ctime|mtime updated
3426  *       szp - ctime updated
3427  */
3428 int
3429 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3430     int flags)
3431 {
3432         struct inode *sip = ZTOI(szp);
3433         znode_t         *tzp;
3434         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
3435         zilog_t         *zilog;
3436         zfs_dirlock_t   *dl;
3437         dmu_tx_t        *tx;
3438         int             error;
3439         int             zf = ZNEW;
3440         uint64_t        parent;
3441         uid_t           owner;
3442         boolean_t       waited = B_FALSE;
3443         boolean_t       is_tmpfile = 0;
3444         uint64_t        txg;
3445 #ifdef HAVE_TMPFILE
3446         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3447 #endif
3448         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3449
3450         if (name == NULL)
3451                 return (SET_ERROR(EINVAL));
3452
3453         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3454                 return (error);
3455         zilog = zfsvfs->z_log;
3456
3457         /*
3458          * POSIX dictates that we return EPERM here.
3459          * Better choices include ENOTSUP or EISDIR.
3460          */
3461         if (S_ISDIR(sip->i_mode)) {
3462                 zfs_exit(zfsvfs, FTAG);
3463                 return (SET_ERROR(EPERM));
3464         }
3465
3466         if ((error = zfs_verify_zp(szp)) != 0) {
3467                 zfs_exit(zfsvfs, FTAG);
3468                 return (error);
3469         }
3470
3471         /*
3472          * If we are using project inheritance, means if the directory has
3473          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3474          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3475          * such case, we only allow hard link creation in our tree when the
3476          * project IDs are the same.
3477          */
3478         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3479             tdzp->z_projid != szp->z_projid) {
3480                 zfs_exit(zfsvfs, FTAG);
3481                 return (SET_ERROR(EXDEV));
3482         }
3483
3484         /*
3485          * We check i_sb because snapshots and the ctldir must have different
3486          * super blocks.
3487          */
3488         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3489                 zfs_exit(zfsvfs, FTAG);
3490                 return (SET_ERROR(EXDEV));
3491         }
3492
3493         /* Prevent links to .zfs/shares files */
3494
3495         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3496             &parent, sizeof (uint64_t))) != 0) {
3497                 zfs_exit(zfsvfs, FTAG);
3498                 return (error);
3499         }
3500         if (parent == zfsvfs->z_shares_dir) {
3501                 zfs_exit(zfsvfs, FTAG);
3502                 return (SET_ERROR(EPERM));
3503         }
3504
3505         if (zfsvfs->z_utf8 && u8_validate(name,
3506             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3507                 zfs_exit(zfsvfs, FTAG);
3508                 return (SET_ERROR(EILSEQ));
3509         }
3510         if (flags & FIGNORECASE)
3511                 zf |= ZCILOOK;
3512
3513         /*
3514          * We do not support links between attributes and non-attributes
3515          * because of the potential security risk of creating links
3516          * into "normal" file space in order to circumvent restrictions
3517          * imposed in attribute space.
3518          */
3519         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3520                 zfs_exit(zfsvfs, FTAG);
3521                 return (SET_ERROR(EINVAL));
3522         }
3523
3524         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3525             cr, ZFS_OWNER);
3526         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3527                 zfs_exit(zfsvfs, FTAG);
3528                 return (SET_ERROR(EPERM));
3529         }
3530
3531         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
3532             zfs_init_idmap))) {
3533                 zfs_exit(zfsvfs, FTAG);
3534                 return (error);
3535         }
3536
3537 top:
3538         /*
3539          * Attempt to lock directory; fail if entry already exists.
3540          */
3541         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3542         if (error) {
3543                 zfs_exit(zfsvfs, FTAG);
3544                 return (error);
3545         }
3546
3547         tx = dmu_tx_create(zfsvfs->z_os);
3548         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3549         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3550         if (is_tmpfile)
3551                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3552
3553         zfs_sa_upgrade_txholds(tx, szp);
3554         zfs_sa_upgrade_txholds(tx, tdzp);
3555         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3556         if (error) {
3557                 zfs_dirent_unlock(dl);
3558                 if (error == ERESTART) {
3559                         waited = B_TRUE;
3560                         dmu_tx_wait(tx);
3561                         dmu_tx_abort(tx);
3562                         goto top;
3563                 }
3564                 dmu_tx_abort(tx);
3565                 zfs_exit(zfsvfs, FTAG);
3566                 return (error);
3567         }
3568         /* unmark z_unlinked so zfs_link_create will not reject */
3569         if (is_tmpfile)
3570                 szp->z_unlinked = B_FALSE;
3571         error = zfs_link_create(dl, szp, tx, 0);
3572
3573         if (error == 0) {
3574                 uint64_t txtype = TX_LINK;
3575                 /*
3576                  * tmpfile is created to be in z_unlinkedobj, so remove it.
3577                  * Also, we don't log in ZIL, because all previous file
3578                  * operation on the tmpfile are ignored by ZIL. Instead we
3579                  * always wait for txg to sync to make sure all previous
3580                  * operation are sync safe.
3581                  */
3582                 if (is_tmpfile) {
3583                         VERIFY(zap_remove_int(zfsvfs->z_os,
3584                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
3585                 } else {
3586                         if (flags & FIGNORECASE)
3587                                 txtype |= TX_CI;
3588                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3589                 }
3590         } else if (is_tmpfile) {
3591                 /* restore z_unlinked since when linking failed */
3592                 szp->z_unlinked = B_TRUE;
3593         }
3594         txg = dmu_tx_get_txg(tx);
3595         dmu_tx_commit(tx);
3596
3597         zfs_dirent_unlock(dl);
3598
3599         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3600                 zil_commit(zilog, 0);
3601
3602         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
3603                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
3604
3605         zfs_znode_update_vfs(tdzp);
3606         zfs_znode_update_vfs(szp);
3607         zfs_exit(zfsvfs, FTAG);
3608         return (error);
3609 }
3610
/*
 * Commit callback handed to zfs_log_write() by zfs_putpage() when the
 * caller set for_sync: clears the page's error flag and ends writeback
 * on it.  'arg' is the struct page being written.
 */
static void
zfs_putpage_sync_commit_cb(void *arg)
{
	struct page *page = arg;

	ClearPageError(page);
	end_page_writeback(page);
}
3619
3620 static void
3621 zfs_putpage_async_commit_cb(void *arg)
3622 {
3623         struct page *pp = arg;
3624         znode_t *zp = ITOZ(pp->mapping->host);
3625
3626         ClearPageError(pp);
3627         end_page_writeback(pp);
3628         atomic_dec_32(&zp->z_async_writes_cnt);
3629 }
3630
3631 /*
3632  * Push a page out to disk, once the page is on stable storage the
3633  * registered commit callback will be run as notification of completion.
3634  *
3635  *      IN:     ip       - page mapped for inode.
3636  *              pp       - page to push (page is locked)
3637  *              wbc      - writeback control data
3638  *              for_sync - does the caller intend to wait synchronously for the
3639  *                         page writeback to complete?
3640  *
3641  *      RETURN: 0 if success
3642  *              error code if failure
3643  *
3644  * Timestamps:
3645  *      ip - ctime|mtime updated
3646  */
3647 int
3648 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3649     boolean_t for_sync)
3650 {
3651         znode_t         *zp = ITOZ(ip);
3652         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3653         loff_t          offset;
3654         loff_t          pgoff;
3655         unsigned int    pglen;
3656         dmu_tx_t        *tx;
3657         caddr_t         va;
3658         int             err = 0;
3659         uint64_t        mtime[2], ctime[2];
3660         inode_timespec_t tmp_ctime;
3661         sa_bulk_attr_t  bulk[3];
3662         int             cnt = 0;
3663         struct address_space *mapping;
3664
3665         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3666                 return (err);
3667
3668         ASSERT(PageLocked(pp));
3669
3670         pgoff = page_offset(pp);        /* Page byte-offset in file */
3671         offset = i_size_read(ip);       /* File length in bytes */
3672         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
3673             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3674
3675         /* Page is beyond end of file */
3676         if (pgoff >= offset) {
3677                 unlock_page(pp);
3678                 zfs_exit(zfsvfs, FTAG);
3679                 return (0);
3680         }
3681
3682         /* Truncate page length to end of file */
3683         if (pgoff + pglen > offset)
3684                 pglen = offset - pgoff;
3685
3686 #if 0
3687         /*
3688          * FIXME: Allow mmap writes past its quota.  The correct fix
3689          * is to register a page_mkwrite() handler to count the page
3690          * against its quota when it is about to be dirtied.
3691          */
3692         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3693             KUID_TO_SUID(ip->i_uid)) ||
3694             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3695             KGID_TO_SGID(ip->i_gid)) ||
3696             (zp->z_projid != ZFS_DEFAULT_PROJID &&
3697             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3698             zp->z_projid))) {
3699                 err = EDQUOT;
3700         }
3701 #endif
3702
3703         /*
3704          * The ordering here is critical and must adhere to the following
3705          * rules in order to avoid deadlocking in either zfs_read() or
3706          * zfs_free_range() due to a lock inversion.
3707          *
3708          * 1) The page must be unlocked prior to acquiring the range lock.
3709          *    This is critical because zfs_read() calls find_lock_page()
3710          *    which may block on the page lock while holding the range lock.
3711          *
3712          * 2) Before setting or clearing write back on a page the range lock
3713          *    must be held in order to prevent a lock inversion with the
3714          *    zfs_free_range() function.
3715          *
3716          * This presents a problem because upon entering this function the
3717          * page lock is already held.  To safely acquire the range lock the
3718          * page lock must be dropped.  This creates a window where another
3719          * process could truncate, invalidate, dirty, or write out the page.
3720          *
3721          * Therefore, after successfully reacquiring the range and page locks
3722          * the current page state is checked.  In the common case everything
3723          * will be as is expected and it can be written out.  However, if
3724          * the page state has changed it must be handled accordingly.
3725          */
3726         mapping = pp->mapping;
3727         redirty_page_for_writepage(wbc, pp);
3728         unlock_page(pp);
3729
3730         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3731             pgoff, pglen, RL_WRITER);
3732         lock_page(pp);
3733
3734         /* Page mapping changed or it was no longer dirty, we're done */
3735         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3736                 unlock_page(pp);
3737                 zfs_rangelock_exit(lr);
3738                 zfs_exit(zfsvfs, FTAG);
3739                 return (0);
3740         }
3741
3742         /* Another process started write block if required */
3743         if (PageWriteback(pp)) {
3744                 unlock_page(pp);
3745                 zfs_rangelock_exit(lr);
3746
3747                 if (wbc->sync_mode != WB_SYNC_NONE) {
3748                         /*
3749                          * Speed up any non-sync page writebacks since
3750                          * they may take several seconds to complete.
3751                          * Refer to the comment in zpl_fsync() (when
3752                          * HAVE_FSYNC_RANGE is defined) for details.
3753                          */
3754                         if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
3755                                 zil_commit(zfsvfs->z_log, zp->z_id);
3756                         }
3757
3758                         if (PageWriteback(pp))
3759 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3760                                 folio_wait_bit(page_folio(pp), PG_writeback);
3761 #else
3762                                 wait_on_page_bit(pp, PG_writeback);
3763 #endif
3764                 }
3765
3766                 zfs_exit(zfsvfs, FTAG);
3767                 return (0);
3768         }
3769
3770         /* Clear the dirty flag the required locks are held */
3771         if (!clear_page_dirty_for_io(pp)) {
3772                 unlock_page(pp);
3773                 zfs_rangelock_exit(lr);
3774                 zfs_exit(zfsvfs, FTAG);
3775                 return (0);
3776         }
3777
3778         /*
3779          * Counterpart for redirty_page_for_writepage() above.  This page
3780          * was in fact not skipped and should not be counted as if it were.
3781          */
3782         wbc->pages_skipped--;
3783         if (!for_sync)
3784                 atomic_inc_32(&zp->z_async_writes_cnt);
3785         set_page_writeback(pp);
3786         unlock_page(pp);
3787
3788         tx = dmu_tx_create(zfsvfs->z_os);
3789         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3790         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3791         zfs_sa_upgrade_txholds(tx, zp);
3792
3793         err = dmu_tx_assign(tx, TXG_NOWAIT);
3794         if (err != 0) {
3795                 if (err == ERESTART)
3796                         dmu_tx_wait(tx);
3797
3798                 dmu_tx_abort(tx);
3799 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3800                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3801 #else
3802                 __set_page_dirty_nobuffers(pp);
3803 #endif
3804                 ClearPageError(pp);
3805                 end_page_writeback(pp);
3806                 if (!for_sync)
3807                         atomic_dec_32(&zp->z_async_writes_cnt);
3808                 zfs_rangelock_exit(lr);
3809                 zfs_exit(zfsvfs, FTAG);
3810                 return (err);
3811         }
3812
3813         va = kmap(pp);
3814         ASSERT3U(pglen, <=, PAGE_SIZE);
3815         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3816         kunmap(pp);
3817
3818         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3819         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3820         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3821             &zp->z_pflags, 8);
3822
3823         /* Preserve the mtime and ctime provided by the inode */
3824         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3825         tmp_ctime = zpl_inode_get_ctime(ip);
3826         ZFS_TIME_ENCODE(&tmp_ctime, ctime);
3827         zp->z_atime_dirty = B_FALSE;
3828         zp->z_seq++;
3829
3830         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3831
3832         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
3833             for_sync ? zfs_putpage_sync_commit_cb :
3834             zfs_putpage_async_commit_cb, pp);
3835
3836         dmu_tx_commit(tx);
3837
3838         zfs_rangelock_exit(lr);
3839
3840         if (wbc->sync_mode != WB_SYNC_NONE) {
3841                 /*
3842                  * Note that this is rarely called under writepages(), because
3843                  * writepages() normally handles the entire commit for
3844                  * performance reasons.
3845                  */
3846                 zil_commit(zfsvfs->z_log, zp->z_id);
3847         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
3848                 /*
3849                  * If the caller does not intend to wait synchronously
3850                  * for this page writeback to complete and there are active
3851                  * synchronous calls on this file, do a commit so that
3852                  * the latter don't accidentally end up waiting for
3853                  * our writeback to complete. Refer to the comment in
3854                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
3855                  */
3856                 zil_commit(zfsvfs->z_log, zp->z_id);
3857         }
3858
3859         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3860
3861         zfs_exit(zfsvfs, FTAG);
3862         return (err);
3863 }
3864
3865 /*
3866  * Update the system attributes when the inode has been dirtied.  For the
3867  * moment we only update the mode, atime, mtime, and ctime.
3868  */
3869 int
3870 zfs_dirty_inode(struct inode *ip, int flags)
3871 {
3872         znode_t         *zp = ITOZ(ip);
3873         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3874         dmu_tx_t        *tx;
3875         uint64_t        mode, atime[2], mtime[2], ctime[2];
3876         inode_timespec_t tmp_ctime;
3877         sa_bulk_attr_t  bulk[4];
3878         int             error = 0;
3879         int             cnt = 0;
3880
3881         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3882                 return (0);
3883
3884         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3885                 return (error);
3886
3887 #ifdef I_DIRTY_TIME
3888         /*
3889          * This is the lazytime semantic introduced in Linux 4.0
3890          * This flag will only be called from update_time when lazytime is set.
3891          * (Note, I_DIRTY_SYNC will also set if not lazytime)
3892          * Fortunately mtime and ctime are managed within ZFS itself, so we
3893          * only need to dirty atime.
3894          */
3895         if (flags == I_DIRTY_TIME) {
3896                 zp->z_atime_dirty = B_TRUE;
3897                 goto out;
3898         }
3899 #endif
3900
3901         tx = dmu_tx_create(zfsvfs->z_os);
3902
3903         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3904         zfs_sa_upgrade_txholds(tx, zp);
3905
3906         error = dmu_tx_assign(tx, TXG_WAIT);
3907         if (error) {
3908                 dmu_tx_abort(tx);
3909                 goto out;
3910         }
3911
3912         mutex_enter(&zp->z_lock);
3913         zp->z_atime_dirty = B_FALSE;
3914
3915         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3916         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3917         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3918         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3919
3920         /* Preserve the mode, mtime and ctime provided by the inode */
3921         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3922         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3923         tmp_ctime = zpl_inode_get_ctime(ip);
3924         ZFS_TIME_ENCODE(&tmp_ctime, ctime);
3925         mode = ip->i_mode;
3926
3927         zp->z_mode = mode;
3928
3929         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3930         mutex_exit(&zp->z_lock);
3931
3932         dmu_tx_commit(tx);
3933 out:
3934         zfs_exit(zfsvfs, FTAG);
3935         return (error);
3936 }
3937
3938 void
3939 zfs_inactive(struct inode *ip)
3940 {
3941         znode_t *zp = ITOZ(ip);
3942         zfsvfs_t *zfsvfs = ITOZSB(ip);
3943         uint64_t atime[2];
3944         int error;
3945         int need_unlock = 0;
3946
3947         /* Only read lock if we haven't already write locked, e.g. rollback */
3948         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3949                 need_unlock = 1;
3950                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3951         }
3952         if (zp->z_sa_hdl == NULL) {
3953                 if (need_unlock)
3954                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3955                 return;
3956         }
3957
3958         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3959                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3960
3961                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3962                 zfs_sa_upgrade_txholds(tx, zp);
3963                 error = dmu_tx_assign(tx, TXG_WAIT);
3964                 if (error) {
3965                         dmu_tx_abort(tx);
3966                 } else {
3967                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3968                         mutex_enter(&zp->z_lock);
3969                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3970                             (void *)&atime, sizeof (atime), tx);
3971                         zp->z_atime_dirty = B_FALSE;
3972                         mutex_exit(&zp->z_lock);
3973                         dmu_tx_commit(tx);
3974                 }
3975         }
3976
3977         zfs_zinactive(zp);
3978         if (need_unlock)
3979                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3980 }
3981
3982 /*
3983  * Fill pages with data from the disk.
3984  */
3985 static int
3986 zfs_fillpage(struct inode *ip, struct page *pp)
3987 {
3988         zfsvfs_t *zfsvfs = ITOZSB(ip);
3989         loff_t i_size = i_size_read(ip);
3990         u_offset_t io_off = page_offset(pp);
3991         size_t io_len = PAGE_SIZE;
3992
3993         ASSERT3U(io_off, <, i_size);
3994
3995         if (io_off + io_len > i_size)
3996                 io_len = i_size - io_off;
3997
3998         void *va = kmap(pp);
3999         int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
4000             io_len, va, DMU_READ_PREFETCH);
4001         if (io_len != PAGE_SIZE)
4002                 memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
4003         kunmap(pp);
4004
4005         if (error) {
4006                 /* convert checksum errors into IO errors */
4007                 if (error == ECKSUM)
4008                         error = SET_ERROR(EIO);
4009
4010                 SetPageError(pp);
4011                 ClearPageUptodate(pp);
4012         } else {
4013                 ClearPageError(pp);
4014                 SetPageUptodate(pp);
4015         }
4016
4017         return (error);
4018 }
4019
4020 /*
4021  * Uses zfs_fillpage to read data from the file and fill the page.
4022  *
4023  *      IN:     ip       - inode of file to get data from.
4024  *              pp       - page to read
4025  *
4026  *      RETURN: 0 on success, error code on failure.
4027  *
4028  * Timestamps:
4029  *      vp - atime updated
4030  */
4031 int
4032 zfs_getpage(struct inode *ip, struct page *pp)
4033 {
4034         zfsvfs_t *zfsvfs = ITOZSB(ip);
4035         znode_t *zp = ITOZ(ip);
4036         int error;
4037
4038         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4039                 return (error);
4040
4041         error = zfs_fillpage(ip, pp);
4042         if (error == 0)
4043                 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
4044
4045         zfs_exit(zfsvfs, FTAG);
4046
4047         return (error);
4048 }
4049
4050 /*
4051  * Check ZFS specific permissions to memory map a section of a file.
4052  *
4053  *      IN:     ip      - inode of the file to mmap
4054  *              off     - file offset
4055  *              addrp   - start address in memory region
4056  *              len     - length of memory region
4057  *              vm_flags- address flags
4058  *
4059  *      RETURN: 0 if success
4060  *              error code if failure
4061  */
4062 int
4063 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
4064     unsigned long vm_flags)
4065 {
4066         (void) addrp;
4067         znode_t  *zp = ITOZ(ip);
4068         zfsvfs_t *zfsvfs = ITOZSB(ip);
4069         int error;
4070
4071         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4072                 return (error);
4073
4074         if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
4075             (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4076                 zfs_exit(zfsvfs, FTAG);
4077                 return (SET_ERROR(EPERM));
4078         }
4079
4080         if ((vm_flags & (VM_READ | VM_EXEC)) &&
4081             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4082                 zfs_exit(zfsvfs, FTAG);
4083                 return (SET_ERROR(EACCES));
4084         }
4085
4086         if (off < 0 || len > MAXOFFSET_T - off) {
4087                 zfs_exit(zfsvfs, FTAG);
4088                 return (SET_ERROR(ENXIO));
4089         }
4090
4091         zfs_exit(zfsvfs, FTAG);
4092         return (0);
4093 }
4094
4095 /*
4096  * Free or allocate space in a file.  Currently, this function only
4097  * supports the `F_FREESP' command.  However, this command is somewhat
4098  * misnamed, as its functionality includes the ability to allocate as
4099  * well as free space.
4100  *
4101  *      IN:     zp      - znode of file to free data in.
4102  *              cmd     - action to take (only F_FREESP supported).
4103  *              bfp     - section of file to free/alloc.
4104  *              flag    - current file open mode flags.
4105  *              offset  - current file offset.
4106  *              cr      - credentials of caller.
4107  *
4108  *      RETURN: 0 on success, error code on failure.
4109  *
4110  * Timestamps:
4111  *      zp - ctime|mtime updated
4112  */
4113 int
4114 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
4115     offset_t offset, cred_t *cr)
4116 {
4117         (void) offset;
4118         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
4119         uint64_t        off, len;
4120         int             error;
4121
4122         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4123                 return (error);
4124
4125         if (cmd != F_FREESP) {
4126                 zfs_exit(zfsvfs, FTAG);
4127                 return (SET_ERROR(EINVAL));
4128         }
4129
4130         /*
4131          * Callers might not be able to detect properly that we are read-only,
4132          * so check it explicitly here.
4133          */
4134         if (zfs_is_readonly(zfsvfs)) {
4135                 zfs_exit(zfsvfs, FTAG);
4136                 return (SET_ERROR(EROFS));
4137         }
4138
4139         if (bfp->l_len < 0) {
4140                 zfs_exit(zfsvfs, FTAG);
4141                 return (SET_ERROR(EINVAL));
4142         }
4143
4144         /*
4145          * Permissions aren't checked on Solaris because on this OS
4146          * zfs_space() can only be called with an opened file handle.
4147          * On Linux we can get here through truncate_range() which
4148          * operates directly on inodes, so we need to check access rights.
4149          */
4150         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
4151             zfs_init_idmap))) {
4152                 zfs_exit(zfsvfs, FTAG);
4153                 return (error);
4154         }
4155
4156         off = bfp->l_start;
4157         len = bfp->l_len; /* 0 means from off to end of file */
4158
4159         error = zfs_freesp(zp, off, len, flag, TRUE);
4160
4161         zfs_exit(zfsvfs, FTAG);
4162         return (error);
4163 }
4164
4165 int
4166 zfs_fid(struct inode *ip, fid_t *fidp)
4167 {
4168         znode_t         *zp = ITOZ(ip);
4169         zfsvfs_t        *zfsvfs = ITOZSB(ip);
4170         uint32_t        gen;
4171         uint64_t        gen64;
4172         uint64_t        object = zp->z_id;
4173         zfid_short_t    *zfid;
4174         int             size, i, error;
4175
4176         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
4177                 return (error);
4178
4179         if (fidp->fid_len < SHORT_FID_LEN) {
4180                 fidp->fid_len = SHORT_FID_LEN;
4181                 zfs_exit(zfsvfs, FTAG);
4182                 return (SET_ERROR(ENOSPC));
4183         }
4184
4185         if ((error = zfs_verify_zp(zp)) != 0) {
4186                 zfs_exit(zfsvfs, FTAG);
4187                 return (error);
4188         }
4189
4190         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4191             &gen64, sizeof (uint64_t))) != 0) {
4192                 zfs_exit(zfsvfs, FTAG);
4193                 return (error);
4194         }
4195
4196         gen = (uint32_t)gen64;
4197
4198         size = SHORT_FID_LEN;
4199
4200         zfid = (zfid_short_t *)fidp;
4201
4202         zfid->zf_len = size;
4203
4204         for (i = 0; i < sizeof (zfid->zf_object); i++)
4205                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4206
4207         /* Must have a non-zero generation number to distinguish from .zfs */
4208         if (gen == 0)
4209                 gen = 1;
4210         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4211                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4212
4213         zfs_exit(zfsvfs, FTAG);
4214         return (0);
4215 }
4216
#ifdef _KERNEL
/* Entry points exported to other kernel modules (alphabetical). */
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_map);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_tmpfile);

/* Module parameters, registered with mode 0644. */

/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

/* CSTYLED */
module_param(zfs_bclone_enabled, uint, 0644);
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");

#endif /* _KERNEL */