module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using zfs_enter(zfsvfs).
  86  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with zfs_verify_zp(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      zfs_enter(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              zfs_exit(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      zfs_exit(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177 int
 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 179 {
 180         (void) cr;
 181         znode_t *zp = ITOZ(ip);
 182         zfsvfs_t *zfsvfs = ITOZSB(ip);
 183         int error;
 184
 185         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 186                 return (error);
 187
 188         /* Honor ZFS_APPENDONLY file attribute */
 189         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 190             ((flag & O_APPEND) == 0)) {
 191                 zfs_exit(zfsvfs, FTAG);
 192                 return (SET_ERROR(EPERM));
 193         }
 194
 195         /* Keep a count of the synchronous opens in the znode */
 196         if (flag & O_SYNC)
 197                 atomic_inc_32(&zp->z_sync_cnt);
 198
 199         zfs_exit(zfsvfs, FTAG);
 200         return (0);
 201 }
 202
 203 int
 204 zfs_close(struct inode *ip, int flag, cred_t *cr)
 205 {
 206         (void) cr;
 207         znode_t *zp = ITOZ(ip);
 208         zfsvfs_t *zfsvfs = ITOZSB(ip);
 209         int error;
 210
 211         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 212                 return (error);
 213
 214         /* Decrement the synchronous opens in the znode */
 215         if (flag & O_SYNC)
 216                 atomic_dec_32(&zp->z_sync_cnt);
 217
 218         zfs_exit(zfsvfs, FTAG);
 219         return (0);
 220 }
 221
 222 #if defined(_KERNEL)
 223 /*
 224  * When a file is memory mapped, we must keep the IO data synchronized
 225  * between the DMU cache and the memory mapped pages.  What this means:
 226  *
 227  * On Write:    If we find a memory mapped page, we write to *both*
 228  *              the page and the dmu buffer.
 229  */
 230 void
 231 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 232 {
 233         struct inode *ip = ZTOI(zp);
 234         struct address_space *mp = ip->i_mapping;
 235         struct page *pp;
 236         uint64_t nbytes;
 237         int64_t off;
 238         void *pb;
 239
 240         off = start & (PAGE_SIZE-1);
 241         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 242                 nbytes = MIN(PAGE_SIZE - off, len);
 243
 244                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 245                 if (pp) {
 246                         if (mapping_writably_mapped(mp))
 247                                 flush_dcache_page(pp);
 248
 249                         pb = kmap(pp);
 250                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 251                             pb + off, DMU_READ_PREFETCH);
 252                         kunmap(pp);
 253
 254                         if (mapping_writably_mapped(mp))
 255                                 flush_dcache_page(pp);
 256
 257                         mark_page_accessed(pp);
 258                         SetPageUptodate(pp);
 259                         ClearPageError(pp);
 260                         unlock_page(pp);
 261                         put_page(pp);
 262                 }
 263
 264                 len -= nbytes;
 265                 off = 0;
 266         }
 267 }
 268
 269 /*
 270  * When a file is memory mapped, we must keep the IO data synchronized
 271  * between the DMU cache and the memory mapped pages.  What this means:
 272  *
 273  * On Read:     We "read" preferentially from memory mapped pages,
 274  *              else we default from the dmu buffer.
 275  *
 276  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 277  *       the file is memory mapped.
 278  */
 279 int
 280 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 281 {
 282         struct inode *ip = ZTOI(zp);
 283         struct address_space *mp = ip->i_mapping;
 284         struct page *pp;
 285         int64_t start, off;
 286         uint64_t bytes;
 287         int len = nbytes;
 288         int error = 0;
 289         void *pb;
 290
 291         start = uio->uio_loffset;
 292         off = start & (PAGE_SIZE-1);
 293         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 294                 bytes = MIN(PAGE_SIZE - off, len);
 295
 296                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 297                 if (pp) {
 298                         ASSERT(PageUptodate(pp));
 299                         unlock_page(pp);
 300
 301                         pb = kmap(pp);
 302                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 303                         kunmap(pp);
 304
 305                         if (mapping_writably_mapped(mp))
 306                                 flush_dcache_page(pp);
 307
 308                         mark_page_accessed(pp);
 309                         put_page(pp);
 310                 } else {
 311                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 312                             uio, bytes);
 313                 }
 314
 315                 len -= bytes;
 316                 off = 0;
 317                 if (error)
 318                         break;
 319         }
 320         return (error);
 321 }
 322 #endif /* _KERNEL */
 323
 324 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 325
 326 /*
 327  * Write the bytes to a file.
 328  *
 329  *      IN:     zp      - znode of file to be written to
 330  *              data    - bytes to write
 331  *              len     - number of bytes to write
 332  *              pos     - offset to start writing at
 333  *
 334  *      OUT:    resid   - remaining bytes to write
 335  *
 336  *      RETURN: 0 if success
 337  *              positive error code if failure.  EIO is returned
 338  *              for a short write when residp isn't provided.
 339  *
 340  * Timestamps:
 341  *      zp - ctime|mtime updated if byte count > 0
 342  */
 343 int
 344 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 345     loff_t pos, size_t *residp)
 346 {
 347         fstrans_cookie_t cookie;
 348         int error;
 349
 350         struct iovec iov;
 351         iov.iov_base = (void *)data;
 352         iov.iov_len = len;
 353
 354         zfs_uio_t uio;
 355         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 356
 357         cookie = spl_fstrans_mark();
 358         error = zfs_write(zp, &uio, 0, kcred);
 359         spl_fstrans_unmark(cookie);
 360
 361         if (error == 0) {
 362                 if (residp != NULL)
 363                         *residp = zfs_uio_resid(&uio);
 364                 else if (zfs_uio_resid(&uio) != 0)
 365                         error = SET_ERROR(EIO);
 366         }
 367
 368         return (error);
 369 }
 370
 371 static void
 372 zfs_rele_async_task(void *arg)
 373 {
 374         iput(arg);
 375 }
 376
 377 void
 378 zfs_zrele_async(znode_t *zp)
 379 {
 380         struct inode *ip = ZTOI(zp);
 381         objset_t *os = ITOZSB(ip)->z_os;
 382
 383         ASSERT(atomic_read(&ip->i_count) > 0);
 384         ASSERT(os != NULL);
 385
 386         /*
 387          * If decrementing the count would put us at 0, we can't do it inline
 388          * here, because that would be synchronous. Instead, dispatch an iput
 389          * to run later.
 390          *
 391          * For more information on the dangers of a synchronous iput, see the
 392          * header comment of this file.
 393          */
 394         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 395                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 396                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 397         }
 398 }
 399
 400
 401 /*
 402  * Lookup an entry in a directory, or an extended attribute directory.
 403  * If it exists, return a held inode reference for it.
 404  *
 405  *      IN:     zdp     - znode of directory to search.
 406  *              nm      - name of entry to lookup.
 407  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 408  *              cr      - credentials of caller.
 409  *              direntflags - directory lookup flags
 410  *              realpnp - returned pathname.
 411  *
 412  *      OUT:    zpp     - znode of located entry, NULL if not found.
 413  *
 414  *      RETURN: 0 on success, error code on failure.
 415  *
 416  * Timestamps:
 417  *      NA
 418  */
 419 int
 420 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 421     int *direntflags, pathname_t *realpnp)
 422 {
 423         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 424         int error = 0;
 425
 426         /*
 427          * Fast path lookup, however we must skip DNLC lookup
 428          * for case folding or normalizing lookups because the
 429          * DNLC code only stores the passed in name.  This means
 430          * creating 'a' and removing 'A' on a case insensitive
 431          * file system would work, but DNLC still thinks 'a'
 432          * exists and won't let you create it again on the next
 433          * pass through fast path.
 434          */
 435         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 436
 437                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 438                         return (SET_ERROR(ENOTDIR));
 439                 } else if (zdp->z_sa_hdl == NULL) {
 440                         return (SET_ERROR(EIO));
 441                 }
 442
 443                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 444                         error = zfs_fastaccesschk_execute(zdp, cr);
 445                         if (!error) {
 446                                 *zpp = zdp;
 447                                 zhold(*zpp);
 448                                 return (0);
 449                         }
 450                         return (error);
 451                 }
 452         }
 453
 454         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 455                 return (error);
 456
 457         *zpp = NULL;
 458
 459         if (flags & LOOKUP_XATTR) {
 460                 /*
 461                  * We don't allow recursive attributes..
 462                  * Maybe someday we will.
 463                  */
 464                 if (zdp->z_pflags & ZFS_XATTR) {
 465                         zfs_exit(zfsvfs, FTAG);
 466                         return (SET_ERROR(EINVAL));
 467                 }
 468
 469                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 470                         zfs_exit(zfsvfs, FTAG);
 471                         return (error);
 472                 }
 473
 474                 /*
 475                  * Do we have permission to get into attribute directory?
 476                  */
 477
 478                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 479                     B_TRUE, cr))) {
 480                         zrele(*zpp);
 481                         *zpp = NULL;
 482                 }
 483
 484                 zfs_exit(zfsvfs, FTAG);
 485                 return (error);
 486         }
 487
 488         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 489                 zfs_exit(zfsvfs, FTAG);
 490                 return (SET_ERROR(ENOTDIR));
 491         }
 492
 493         /*
 494          * Check accessibility of directory.
 495          */
 496
 497         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
 498                 zfs_exit(zfsvfs, FTAG);
 499                 return (error);
 500         }
 501
 502         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 503             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 504                 zfs_exit(zfsvfs, FTAG);
 505                 return (SET_ERROR(EILSEQ));
 506         }
 507
 508         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 509         if ((error == 0) && (*zpp))
 510                 zfs_znode_update_vfs(*zpp);
 511
 512         zfs_exit(zfsvfs, FTAG);
 513         return (error);
 514 }
 515
 516 /*
 517  * Attempt to create a new entry in a directory.  If the entry
 518  * already exists, truncate the file if permissible, else return
 519  * an error.  Return the ip of the created or trunc'd file.
 520  *
 521  *      IN:     dzp     - znode of directory to put new file entry in.
 522  *              name    - name of new file entry.
 523  *              vap     - attributes of new file.
 524  *              excl    - flag indicating exclusive or non-exclusive mode.
 525  *              mode    - mode to open file with.
 526  *              cr      - credentials of caller.
 527  *              flag    - file flag.
 528  *              vsecp   - ACL to be set
 529  *
 530  *      OUT:    zpp     - znode of created or trunc'd entry.
 531  *
 532  *      RETURN: 0 on success, error code on failure.
 533  *
 534  * Timestamps:
 535  *      dzp - ctime|mtime updated if new entry created
 536  *       zp - ctime|mtime always, atime if new
 537  */
 538 int
 539 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
 540     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
 541 {
 542         znode_t         *zp;
 543         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 544         zilog_t         *zilog;
 545         objset_t        *os;
 546         zfs_dirlock_t   *dl;
 547         dmu_tx_t        *tx;
 548         int             error;
 549         uid_t           uid;
 550         gid_t           gid;
 551         zfs_acl_ids_t   acl_ids;
 552         boolean_t       fuid_dirtied;
 553         boolean_t       have_acl = B_FALSE;
 554         boolean_t       waited = B_FALSE;
 555
 556         /*
 557          * If we have an ephemeral id, ACL, or XVATTR then
 558          * make sure file system is at proper version
 559          */
 560
 561         gid = crgetgid(cr);
 562         uid = crgetuid(cr);
 563
 564         if (zfsvfs->z_use_fuids == B_FALSE &&
 565             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 566                 return (SET_ERROR(EINVAL));
 567
 568         if (name == NULL)
 569                 return (SET_ERROR(EINVAL));
 570
 571         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 572                 return (error);
 573         os = zfsvfs->z_os;
 574         zilog = zfsvfs->z_log;
 575
 576         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 577             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 578                 zfs_exit(zfsvfs, FTAG);
 579                 return (SET_ERROR(EILSEQ));
 580         }
 581
 582         if (vap->va_mask & ATTR_XVATTR) {
 583                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 584                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 585                         zfs_exit(zfsvfs, FTAG);
 586                         return (error);
 587                 }
 588         }
 589
 590 top:
 591         *zpp = NULL;
 592         if (*name == '\0') {
 593                 /*
 594                  * Null component name refers to the directory itself.
 595                  */
 596                 zhold(dzp);
 597                 zp = dzp;
 598                 dl = NULL;
 599                 error = 0;
 600         } else {
 601                 /* possible igrab(zp) */
 602                 int zflg = 0;
 603
 604                 if (flag & FIGNORECASE)
 605                         zflg |= ZCILOOK;
 606
 607                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 608                     NULL, NULL);
 609                 if (error) {
 610                         if (have_acl)
 611                                 zfs_acl_ids_free(&acl_ids);
 612                         if (strcmp(name, "..") == 0)
 613                                 error = SET_ERROR(EISDIR);
 614                         zfs_exit(zfsvfs, FTAG);
 615                         return (error);
 616                 }
 617         }
 618
 619         if (zp == NULL) {
 620                 uint64_t txtype;
 621                 uint64_t projid = ZFS_DEFAULT_PROJID;
 622
 623                 /*
 624                  * Create a new file object and update the directory
 625                  * to reference it.
 626                  */
 627                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 628                         if (have_acl)
 629                                 zfs_acl_ids_free(&acl_ids);
 630                         goto out;
 631                 }
 632
 633                 /*
 634                  * We only support the creation of regular files in
 635                  * extended attribute directories.
 636                  */
 637
 638                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
 639                         if (have_acl)
 640                                 zfs_acl_ids_free(&acl_ids);
 641                         error = SET_ERROR(EINVAL);
 642                         goto out;
 643                 }
 644
 645                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 646                     cr, vsecp, &acl_ids)) != 0)
 647                         goto out;
 648                 have_acl = B_TRUE;
 649
 650                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 651                         projid = zfs_inherit_projid(dzp);
 652                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 653                         zfs_acl_ids_free(&acl_ids);
 654                         error = SET_ERROR(EDQUOT);
 655                         goto out;
 656                 }
 657
 658                 tx = dmu_tx_create(os);
 659
 660                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 661                     ZFS_SA_BASE_ATTR_SIZE);
 662
 663                 fuid_dirtied = zfsvfs->z_fuid_dirty;
 664                 if (fuid_dirtied)
 665                         zfs_fuid_txhold(zfsvfs, tx);
 666                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 667                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 668                 if (!zfsvfs->z_use_sa &&
 669                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 670                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 671                             0, acl_ids.z_aclp->z_acl_bytes);
 672                 }
 673
 674                 error = dmu_tx_assign(tx,
 675                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 676                 if (error) {
 677                         zfs_dirent_unlock(dl);
 678                         if (error == ERESTART) {
 679                                 waited = B_TRUE;
 680                                 dmu_tx_wait(tx);
 681                                 dmu_tx_abort(tx);
 682                                 goto top;
 683                         }
 684                         zfs_acl_ids_free(&acl_ids);
 685                         dmu_tx_abort(tx);
 686                         zfs_exit(zfsvfs, FTAG);
 687                         return (error);
 688                 }
 689                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 690
 691                 error = zfs_link_create(dl, zp, tx, ZNEW);
 692                 if (error != 0) {
 693                         /*
 694                          * Since, we failed to add the directory entry for it,
 695                          * delete the newly created dnode.
 696                          */
 697                         zfs_znode_delete(zp, tx);
 698                         remove_inode_hash(ZTOI(zp));
 699                         zfs_acl_ids_free(&acl_ids);
 700                         dmu_tx_commit(tx);
 701                         goto out;
 702                 }
 703
 704                 if (fuid_dirtied)
 705                         zfs_fuid_sync(zfsvfs, tx);
 706
 707                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 708                 if (flag & FIGNORECASE)
 709                         txtype |= TX_CI;
 710                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 711                     vsecp, acl_ids.z_fuidp, vap);
 712                 zfs_acl_ids_free(&acl_ids);
 713                 dmu_tx_commit(tx);
 714         } else {
 715                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
 716
 717                 if (have_acl)
 718                         zfs_acl_ids_free(&acl_ids);
 719                 have_acl = B_FALSE;
 720
 721                 /*
 722                  * A directory entry already exists for this name.
 723                  */
 724                 /*
 725                  * Can't truncate an existing file if in exclusive mode.
 726                  */
 727                 if (excl) {
 728                         error = SET_ERROR(EEXIST);
 729                         goto out;
 730                 }
 731                 /*
 732                  * Can't open a directory for writing.
 733                  */
 734                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
 735                         error = SET_ERROR(EISDIR);
 736                         goto out;
 737                 }
 738                 /*
 739                  * Verify requested access to file.
 740                  */
 741                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
 742                         goto out;
 743                 }
 744
 745                 mutex_enter(&dzp->z_lock);
 746                 dzp->z_seq++;
 747                 mutex_exit(&dzp->z_lock);
 748
 749                 /*
 750                  * Truncate regular files if requested.
 751                  */
 752                 if (S_ISREG(ZTOI(zp)->i_mode) &&
 753                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
 754                         /* we can't hold any locks when calling zfs_freesp() */
 755                         if (dl) {
 756                                 zfs_dirent_unlock(dl);
 757                                 dl = NULL;
 758                         }
 759                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
 760                 }
 761         }
 762 out:
 763
 764         if (dl)
 765                 zfs_dirent_unlock(dl);
 766
 767         if (error) {
 768                 if (zp)
 769                         zrele(zp);
 770         } else {
 771                 zfs_znode_update_vfs(dzp);
 772                 zfs_znode_update_vfs(zp);
 773                 *zpp = zp;
 774         }
 775
 776         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 777                 zil_commit(zilog, 0);
 778
 779         zfs_exit(zfsvfs, FTAG);
 780         return (error);
 781 }
 782
 783 int
 784 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 785     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
 786 {
 787         (void) excl, (void) mode, (void) flag;
 788         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 789         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 790         objset_t        *os;
 791         dmu_tx_t        *tx;
 792         int             error;
 793         uid_t           uid;
 794         gid_t           gid;
 795         zfs_acl_ids_t   acl_ids;
 796         uint64_t        projid = ZFS_DEFAULT_PROJID;
 797         boolean_t       fuid_dirtied;
 798         boolean_t       have_acl = B_FALSE;
 799         boolean_t       waited = B_FALSE;
 800
 801         /*
 802          * If we have an ephemeral id, ACL, or XVATTR then
 803          * make sure file system is at proper version
 804          */
 805
 806         gid = crgetgid(cr);
 807         uid = crgetuid(cr);
 808
 809         if (zfsvfs->z_use_fuids == B_FALSE &&
 810             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 811                 return (SET_ERROR(EINVAL));
 812
 813         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 814                 return (error);
 815         os = zfsvfs->z_os;
 816
 817         if (vap->va_mask & ATTR_XVATTR) {
 818                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 819                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 820                         zfs_exit(zfsvfs, FTAG);
 821                         return (error);
 822                 }
 823         }
 824
 825 top:
 826         *ipp = NULL;
 827
 828         /*
 829          * Create a new file object and update the directory
 830          * to reference it.
 831          */
 832         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 833                 if (have_acl)
 834                         zfs_acl_ids_free(&acl_ids);
 835                 goto out;
 836         }
 837
 838         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 839             cr, vsecp, &acl_ids)) != 0)
 840                 goto out;
 841         have_acl = B_TRUE;
 842
 843         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 844                 projid = zfs_inherit_projid(dzp);
 845         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 846                 zfs_acl_ids_free(&acl_ids);
 847                 error = SET_ERROR(EDQUOT);
 848                 goto out;
 849         }
 850
 851         tx = dmu_tx_create(os);
 852
 853         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 854             ZFS_SA_BASE_ATTR_SIZE);
 855         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 856
 857         fuid_dirtied = zfsvfs->z_fuid_dirty;
 858         if (fuid_dirtied)
 859                 zfs_fuid_txhold(zfsvfs, tx);
 860         if (!zfsvfs->z_use_sa &&
 861             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 862                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 863                     0, acl_ids.z_aclp->z_acl_bytes);
 864         }
 865         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 866         if (error) {
 867                 if (error == ERESTART) {
 868                         waited = B_TRUE;
 869                         dmu_tx_wait(tx);
 870                         dmu_tx_abort(tx);
 871                         goto top;
 872                 }
 873                 zfs_acl_ids_free(&acl_ids);
 874                 dmu_tx_abort(tx);
 875                 zfs_exit(zfsvfs, FTAG);
 876                 return (error);
 877         }
 878         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 879
 880         if (fuid_dirtied)
 881                 zfs_fuid_sync(zfsvfs, tx);
 882
 883         /* Add to unlinked set */
 884         zp->z_unlinked = B_TRUE;
 885         zfs_unlinked_add(zp, tx);
 886         zfs_acl_ids_free(&acl_ids);
 887         dmu_tx_commit(tx);
 888 out:
 889
 890         if (error) {
 891                 if (zp)
 892                         zrele(zp);
 893         } else {
 894                 zfs_znode_update_vfs(dzp);
 895                 zfs_znode_update_vfs(zp);
 896                 *ipp = ZTOI(zp);
 897         }
 898
 899         zfs_exit(zfsvfs, FTAG);
 900         return (error);
 901 }
 902
 903 /*
 904  * Remove an entry from a directory.
 905  *
 906  *      IN:     dzp     - znode of directory to remove entry from.
 907  *              name    - name of entry to remove.
 908  *              cr      - credentials of caller.
 909  *              flags   - case flags.
 910  *
 911  *      RETURN: 0 if success
 912  *              error code if failure
 913  *
 914  * Timestamps:
 915  *      dzp - ctime|mtime
 916  *       ip - ctime (if nlink > 0)
 917  */
 918
 919 static uint64_t null_xattr = 0;
 920
 921 int
 922 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 923 {
 924         znode_t         *zp;
 925         znode_t         *xzp;
 926         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 927         zilog_t         *zilog;
 928         uint64_t        acl_obj, xattr_obj;
 929         uint64_t        xattr_obj_unlinked = 0;
 930         uint64_t        obj = 0;
 931         uint64_t        links;
 932         zfs_dirlock_t   *dl;
 933         dmu_tx_t        *tx;
 934         boolean_t       may_delete_now, delete_now = FALSE;
 935         boolean_t       unlinked, toobig = FALSE;
 936         uint64_t        txtype;
 937         pathname_t      *realnmp = NULL;
 938         pathname_t      realnm;
 939         int             error;
 940         int             zflg = ZEXISTS;
 941         boolean_t       waited = B_FALSE;
 942
 943         if (name == NULL)
 944                 return (SET_ERROR(EINVAL));
 945
 946         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 947                 return (error);
 948         zilog = zfsvfs->z_log;
 949
 950         if (flags & FIGNORECASE) {
 951                 zflg |= ZCILOOK;
 952                 pn_alloc(&realnm);
 953                 realnmp = &realnm;
 954         }
 955
 956 top:
 957         xattr_obj = 0;
 958         xzp = NULL;
 959         /*
 960          * Attempt to lock directory; fail if entry doesn't exist.
 961          */
 962         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 963             NULL, realnmp))) {
 964                 if (realnmp)
 965                         pn_free(realnmp);
 966                 zfs_exit(zfsvfs, FTAG);
 967                 return (error);
 968         }
 969
 970         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
 971                 goto out;
 972         }
 973
 974         /*
 975          * Need to use rmdir for removing directories.
 976          */
 977         if (S_ISDIR(ZTOI(zp)->i_mode)) {
 978                 error = SET_ERROR(EPERM);
 979                 goto out;
 980         }
 981
 982         mutex_enter(&zp->z_lock);
 983         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
 984             !(zp->z_is_mapped);
 985         mutex_exit(&zp->z_lock);
 986
 987         /*
 988          * We may delete the znode now, or we may put it in the unlinked set;
 989          * it depends on whether we're the last link, and on whether there are
 990          * other holds on the inode.  So we dmu_tx_hold() the right things to
 991          * allow for either case.
 992          */
 993         obj = zp->z_id;
 994         tx = dmu_tx_create(zfsvfs->z_os);
 995         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 996         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 997         zfs_sa_upgrade_txholds(tx, zp);
 998         zfs_sa_upgrade_txholds(tx, dzp);
 999         if (may_delete_now) {
1000                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
1001                 /* if the file is too big, only hold_free a token amount */
1002                 dmu_tx_hold_free(tx, zp->z_id, 0,
1003                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1004         }
1005
1006         /* are there any extended attributes? */
1007         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1008             &xattr_obj, sizeof (xattr_obj));
1009         if (error == 0 && xattr_obj) {
1010                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1011                 ASSERT0(error);
1012                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1013                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1014         }
1015
1016         mutex_enter(&zp->z_lock);
1017         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1018                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1019         mutex_exit(&zp->z_lock);
1020
1021         /* charge as an update -- would be nice not to charge at all */
1022         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1023
1024         /*
1025          * Mark this transaction as typically resulting in a net free of space
1026          */
1027         dmu_tx_mark_netfree(tx);
1028
1029         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1030         if (error) {
1031                 zfs_dirent_unlock(dl);
1032                 if (error == ERESTART) {
1033                         waited = B_TRUE;
1034                         dmu_tx_wait(tx);
1035                         dmu_tx_abort(tx);
1036                         zrele(zp);
1037                         if (xzp)
1038                                 zrele(xzp);
1039                         goto top;
1040                 }
1041                 if (realnmp)
1042                         pn_free(realnmp);
1043                 dmu_tx_abort(tx);
1044                 zrele(zp);
1045                 if (xzp)
1046                         zrele(xzp);
1047                 zfs_exit(zfsvfs, FTAG);
1048                 return (error);
1049         }
1050
1051         /*
1052          * Remove the directory entry.
1053          */
1054         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1055
1056         if (error) {
1057                 dmu_tx_commit(tx);
1058                 goto out;
1059         }
1060
1061         if (unlinked) {
1062                 /*
1063                  * Hold z_lock so that we can make sure that the ACL obj
1064                  * hasn't changed.  Could have been deleted due to
1065                  * zfs_sa_upgrade().
1066                  */
1067                 mutex_enter(&zp->z_lock);
1068                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1069                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1070                 delete_now = may_delete_now && !toobig &&
1071                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1072                     !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
1073                     zfs_external_acl(zp) == acl_obj;
1074         }
1075
1076         if (delete_now) {
1077                 if (xattr_obj_unlinked) {
1078                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1079                         mutex_enter(&xzp->z_lock);
1080                         xzp->z_unlinked = B_TRUE;
1081                         clear_nlink(ZTOI(xzp));
1082                         links = 0;
1083                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1084                             &links, sizeof (links), tx);
1085                         ASSERT3U(error,  ==,  0);
1086                         mutex_exit(&xzp->z_lock);
1087                         zfs_unlinked_add(xzp, tx);
1088
1089                         if (zp->z_is_sa)
1090                                 error = sa_remove(zp->z_sa_hdl,
1091                                     SA_ZPL_XATTR(zfsvfs), tx);
1092                         else
1093                                 error = sa_update(zp->z_sa_hdl,
1094                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1095                                     sizeof (uint64_t), tx);
1096                         ASSERT0(error);
1097                 }
1098                 /*
1099                  * Add to the unlinked set because a new reference could be
1100                  * taken concurrently resulting in a deferred destruction.
1101                  */
1102                 zfs_unlinked_add(zp, tx);
1103                 mutex_exit(&zp->z_lock);
1104         } else if (unlinked) {
1105                 mutex_exit(&zp->z_lock);
1106                 zfs_unlinked_add(zp, tx);
1107         }
1108
1109         txtype = TX_REMOVE;
1110         if (flags & FIGNORECASE)
1111                 txtype |= TX_CI;
1112         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1113
1114         dmu_tx_commit(tx);
1115 out:
1116         if (realnmp)
1117                 pn_free(realnmp);
1118
1119         zfs_dirent_unlock(dl);
1120         zfs_znode_update_vfs(dzp);
1121         zfs_znode_update_vfs(zp);
1122
1123         if (delete_now)
1124                 zrele(zp);
1125         else
1126                 zfs_zrele_async(zp);
1127
1128         if (xzp) {
1129                 zfs_znode_update_vfs(xzp);
1130                 zfs_zrele_async(xzp);
1131         }
1132
1133         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1134                 zil_commit(zilog, 0);
1135
1136         zfs_exit(zfsvfs, FTAG);
1137         return (error);
1138 }
1139
1140 /*
1141  * Create a new directory and insert it into dzp using the name
1142  * provided.  Return a pointer to the inserted directory.
1143  *
1144  *      IN:     dzp     - znode of directory to add subdir to.
1145  *              dirname - name of new directory.
1146  *              vap     - attributes of new directory.
1147  *              cr      - credentials of caller.
1148  *              flags   - case flags.
1149  *              vsecp   - ACL to be set
1150  *
1151  *      OUT:    zpp     - znode of created directory.
1152  *
1153  *      RETURN: 0 if success
1154  *              error code if failure
1155  *
1156  * Timestamps:
1157  *      dzp - ctime|mtime updated
1158  *      zpp - ctime|mtime|atime updated
1159  */
1160 int
1161 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1162     cred_t *cr, int flags, vsecattr_t *vsecp)
1163 {
1164         znode_t         *zp;
1165         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1166         zilog_t         *zilog;
1167         zfs_dirlock_t   *dl;
1168         uint64_t        txtype;
1169         dmu_tx_t        *tx;
1170         int             error;
1171         int             zf = ZNEW;
1172         uid_t           uid;
1173         gid_t           gid = crgetgid(cr);
1174         zfs_acl_ids_t   acl_ids;
1175         boolean_t       fuid_dirtied;
1176         boolean_t       waited = B_FALSE;
1177
1178         ASSERT(S_ISDIR(vap->va_mode));
1179
1180         /*
1181          * If we have an ephemeral id, ACL, or XVATTR then
1182          * make sure file system is at proper version
1183          */
1184
1185         uid = crgetuid(cr);
1186         if (zfsvfs->z_use_fuids == B_FALSE &&
1187             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1188                 return (SET_ERROR(EINVAL));
1189
1190         if (dirname == NULL)
1191                 return (SET_ERROR(EINVAL));
1192
1193         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1194                 return (error);
1195         zilog = zfsvfs->z_log;
1196
1197         if (dzp->z_pflags & ZFS_XATTR) {
1198                 zfs_exit(zfsvfs, FTAG);
1199                 return (SET_ERROR(EINVAL));
1200         }
1201
1202         if (zfsvfs->z_utf8 && u8_validate(dirname,
1203             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1204                 zfs_exit(zfsvfs, FTAG);
1205                 return (SET_ERROR(EILSEQ));
1206         }
1207         if (flags & FIGNORECASE)
1208                 zf |= ZCILOOK;
1209
1210         if (vap->va_mask & ATTR_XVATTR) {
1211                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1212                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1213                         zfs_exit(zfsvfs, FTAG);
1214                         return (error);
1215                 }
1216         }
1217
1218         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1219             vsecp, &acl_ids)) != 0) {
1220                 zfs_exit(zfsvfs, FTAG);
1221                 return (error);
1222         }
1223         /*
1224          * First make sure the new directory doesn't exist.
1225          *
1226          * Existence is checked first to make sure we don't return
1227          * EACCES instead of EEXIST which can cause some applications
1228          * to fail.
1229          */
1230 top:
1231         *zpp = NULL;
1232
1233         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1234             NULL, NULL))) {
1235                 zfs_acl_ids_free(&acl_ids);
1236                 zfs_exit(zfsvfs, FTAG);
1237                 return (error);
1238         }
1239
1240         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1241                 zfs_acl_ids_free(&acl_ids);
1242                 zfs_dirent_unlock(dl);
1243                 zfs_exit(zfsvfs, FTAG);
1244                 return (error);
1245         }
1246
1247         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1248                 zfs_acl_ids_free(&acl_ids);
1249                 zfs_dirent_unlock(dl);
1250                 zfs_exit(zfsvfs, FTAG);
1251                 return (SET_ERROR(EDQUOT));
1252         }
1253
1254         /*
1255          * Add a new entry to the directory.
1256          */
1257         tx = dmu_tx_create(zfsvfs->z_os);
1258         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1259         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1260         fuid_dirtied = zfsvfs->z_fuid_dirty;
1261         if (fuid_dirtied)
1262                 zfs_fuid_txhold(zfsvfs, tx);
1263         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1264                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1265                     acl_ids.z_aclp->z_acl_bytes);
1266         }
1267
1268         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1269             ZFS_SA_BASE_ATTR_SIZE);
1270
1271         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1272         if (error) {
1273                 zfs_dirent_unlock(dl);
1274                 if (error == ERESTART) {
1275                         waited = B_TRUE;
1276                         dmu_tx_wait(tx);
1277                         dmu_tx_abort(tx);
1278                         goto top;
1279                 }
1280                 zfs_acl_ids_free(&acl_ids);
1281                 dmu_tx_abort(tx);
1282                 zfs_exit(zfsvfs, FTAG);
1283                 return (error);
1284         }
1285
1286         /*
1287          * Create new node.
1288          */
1289         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1290
1291         /*
1292          * Now put new name in parent dir.
1293          */
1294         error = zfs_link_create(dl, zp, tx, ZNEW);
1295         if (error != 0) {
1296                 zfs_znode_delete(zp, tx);
1297                 remove_inode_hash(ZTOI(zp));
1298                 goto out;
1299         }
1300
1301         if (fuid_dirtied)
1302                 zfs_fuid_sync(zfsvfs, tx);
1303
1304         *zpp = zp;
1305
1306         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1307         if (flags & FIGNORECASE)
1308                 txtype |= TX_CI;
1309         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1310             acl_ids.z_fuidp, vap);
1311
1312 out:
1313         zfs_acl_ids_free(&acl_ids);
1314
1315         dmu_tx_commit(tx);
1316
1317         zfs_dirent_unlock(dl);
1318
1319         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1320                 zil_commit(zilog, 0);
1321
1322         if (error != 0) {
1323                 zrele(zp);
1324         } else {
1325                 zfs_znode_update_vfs(dzp);
1326                 zfs_znode_update_vfs(zp);
1327         }
1328         zfs_exit(zfsvfs, FTAG);
1329         return (error);
1330 }
1331
1332 /*
1333  * Remove a directory subdir entry.  If the current working
1334  * directory is the same as the subdir to be removed, the
1335  * remove will fail.
1336  *
1337  *      IN:     dzp     - znode of directory to remove from.
1338  *              name    - name of directory to be removed.
1339  *              cwd     - inode of current working directory.
1340  *              cr      - credentials of caller.
1341  *              flags   - case flags
1342  *
1343  *      RETURN: 0 on success, error code on failure.
1344  *
1345  * Timestamps:
1346  *      dzp - ctime|mtime updated
1347  */
1348 int
1349 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1350     int flags)
1351 {
1352         znode_t         *zp;
1353         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1354         zilog_t         *zilog;
1355         zfs_dirlock_t   *dl;
1356         dmu_tx_t        *tx;
1357         int             error;
1358         int             zflg = ZEXISTS;
1359         boolean_t       waited = B_FALSE;
1360
1361         if (name == NULL)
1362                 return (SET_ERROR(EINVAL));
1363
1364         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1365                 return (error);
1366         zilog = zfsvfs->z_log;
1367
1368         if (flags & FIGNORECASE)
1369                 zflg |= ZCILOOK;
1370 top:
1371         zp = NULL;
1372
1373         /*
1374          * Attempt to lock directory; fail if entry doesn't exist.
1375          */
1376         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1377             NULL, NULL))) {
1378                 zfs_exit(zfsvfs, FTAG);
1379                 return (error);
1380         }
1381
1382         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1383                 goto out;
1384         }
1385
1386         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1387                 error = SET_ERROR(ENOTDIR);
1388                 goto out;
1389         }
1390
1391         if (zp == cwd) {
1392                 error = SET_ERROR(EINVAL);
1393                 goto out;
1394         }
1395
1396         /*
1397          * Grab a lock on the directory to make sure that no one is
1398          * trying to add (or lookup) entries while we are removing it.
1399          */
1400         rw_enter(&zp->z_name_lock, RW_WRITER);
1401
1402         /*
1403          * Grab a lock on the parent pointer to make sure we play well
1404          * with the treewalk and directory rename code.
1405          */
1406         rw_enter(&zp->z_parent_lock, RW_WRITER);
1407
1408         tx = dmu_tx_create(zfsvfs->z_os);
1409         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1410         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1411         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1412         zfs_sa_upgrade_txholds(tx, zp);
1413         zfs_sa_upgrade_txholds(tx, dzp);
1414         dmu_tx_mark_netfree(tx);
1415         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1416         if (error) {
1417                 rw_exit(&zp->z_parent_lock);
1418                 rw_exit(&zp->z_name_lock);
1419                 zfs_dirent_unlock(dl);
1420                 if (error == ERESTART) {
1421                         waited = B_TRUE;
1422                         dmu_tx_wait(tx);
1423                         dmu_tx_abort(tx);
1424                         zrele(zp);
1425                         goto top;
1426                 }
1427                 dmu_tx_abort(tx);
1428                 zrele(zp);
1429                 zfs_exit(zfsvfs, FTAG);
1430                 return (error);
1431         }
1432
1433         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1434
1435         if (error == 0) {
1436                 uint64_t txtype = TX_RMDIR;
1437                 if (flags & FIGNORECASE)
1438                         txtype |= TX_CI;
1439                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1440                     B_FALSE);
1441         }
1442
1443         dmu_tx_commit(tx);
1444
1445         rw_exit(&zp->z_parent_lock);
1446         rw_exit(&zp->z_name_lock);
1447 out:
1448         zfs_dirent_unlock(dl);
1449
1450         zfs_znode_update_vfs(dzp);
1451         zfs_znode_update_vfs(zp);
1452         zrele(zp);
1453
1454         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1455                 zil_commit(zilog, 0);
1456
1457         zfs_exit(zfsvfs, FTAG);
1458         return (error);
1459 }
1460
1461 /*
1462  * Read directory entries from the given directory cursor position and emit
1463  * name and position for each entry.
1464  *
1465  *      IN:     ip      - inode of directory to read.
1466  *              ctx     - directory entry context.
1467  *              cr      - credentials of caller.
1468  *
1469  *      RETURN: 0 if success
1470  *              error code if failure
1471  *
1472  * Timestamps:
1473  *      ip - atime updated
1474  *
1475  * Note that the low 4 bits of the cookie returned by zap is always zero.
1476  * This allows us to use the low range for "special" directory entries:
1477  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1478  * we use the offset 2 for the '.zfs' directory.
1479  */
1480 int
1481 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1482 {
1483         (void) cr;
1484         znode_t         *zp = ITOZ(ip);
1485         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1486         objset_t        *os;
1487         zap_cursor_t    zc;
1488         zap_attribute_t zap;
1489         int             error;
1490         uint8_t         prefetch;
1491         uint8_t         type;
1492         int             done = 0;
1493         uint64_t        parent;
1494         uint64_t        offset; /* must be unsigned; checks for < 1 */
1495
1496         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1497                 return (error);
1498
1499         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1500             &parent, sizeof (parent))) != 0)
1501                 goto out;
1502
1503         /*
1504          * Quit if directory has been removed (posix)
1505          */
1506         if (zp->z_unlinked)
1507                 goto out;
1508
1509         error = 0;
1510         os = zfsvfs->z_os;
1511         offset = ctx->pos;
1512         prefetch = zp->z_zn_prefetch;
1513
1514         /*
1515          * Initialize the iterator cursor.
1516          */
1517         if (offset <= 3) {
1518                 /*
1519                  * Start iteration from the beginning of the directory.
1520                  */
1521                 zap_cursor_init(&zc, os, zp->z_id);
1522         } else {
1523                 /*
1524                  * The offset is a serialized cursor.
1525                  */
1526                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1527         }
1528
1529         /*
1530          * Transform to file-system independent format
1531          */
1532         while (!done) {
1533                 uint64_t objnum;
1534                 /*
1535                  * Special case `.', `..', and `.zfs'.
1536                  */
1537                 if (offset == 0) {
1538                         (void) strcpy(zap.za_name, ".");
1539                         zap.za_normalization_conflict = 0;
1540                         objnum = zp->z_id;
1541                         type = DT_DIR;
1542                 } else if (offset == 1) {
1543                         (void) strcpy(zap.za_name, "..");
1544                         zap.za_normalization_conflict = 0;
1545                         objnum = parent;
1546                         type = DT_DIR;
1547                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1548                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1549                         zap.za_normalization_conflict = 0;
1550                         objnum = ZFSCTL_INO_ROOT;
1551                         type = DT_DIR;
1552                 } else {
1553                         /*
1554                          * Grab next entry.
1555                          */
1556                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1557                                 if (error == ENOENT)
1558                                         break;
1559                                 else
1560                                         goto update;
1561                         }
1562
1563                         /*
1564                          * Allow multiple entries provided the first entry is
1565                          * the object id.  Non-zpl consumers may safely make
1566                          * use of the additional space.
1567                          *
1568                          * XXX: This should be a feature flag for compatibility
1569                          */
1570                         if (zap.za_integer_length != 8 ||
1571                             zap.za_num_integers == 0) {
1572                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1573                                     "entry, obj = %lld, offset = %lld, "
1574                                     "length = %d, num = %lld\n",
1575                                     (u_longlong_t)zp->z_id,
1576                                     (u_longlong_t)offset,
1577                                     zap.za_integer_length,
1578                                     (u_longlong_t)zap.za_num_integers);
1579                                 error = SET_ERROR(ENXIO);
1580                                 goto update;
1581                         }
1582
1583                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1584                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1585                 }
1586
1587                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1588                     objnum, type);
1589                 if (done)
1590                         break;
1591
1592                 /* Prefetch znode */
1593                 if (prefetch) {
1594                         dmu_prefetch(os, objnum, 0, 0, 0,
1595                             ZIO_PRIORITY_SYNC_READ);
1596                 }
1597
1598                 /*
1599                  * Move to the next entry, fill in the previous offset.
1600                  */
1601                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1602                         zap_cursor_advance(&zc);
1603                         offset = zap_cursor_serialize(&zc);
1604                 } else {
1605                         offset += 1;
1606                 }
1607                 ctx->pos = offset;
1608         }
1609         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1610
1611 update:
1612         zap_cursor_fini(&zc);
1613         if (error == ENOENT)
1614                 error = 0;
1615 out:
1616         zfs_exit(zfsvfs, FTAG);
1617
1618         return (error);
1619 }
1620
1621 /*
1622  * Get the basic file attributes and place them in the provided kstat
1623  * structure.  The inode is assumed to be the authoritative source
1624  * for most of the attributes.  However, the znode currently has the
1625  * authoritative atime, blksize, and block count.
1626  *
1627  *      IN:     ip      - inode of file.
1628  *
1629  *      OUT:    sp      - kstat values.
1630  *
1631  *      RETURN: 0 (always succeeds)
1632  */
1633 int
1634 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
1635     struct kstat *sp)
1636 {
1637         znode_t *zp = ITOZ(ip);
1638         zfsvfs_t *zfsvfs = ITOZSB(ip);
1639         uint32_t blksize;
1640         u_longlong_t nblocks;
1641         int error;
1642
1643         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1644                 return (error);
1645
1646         mutex_enter(&zp->z_lock);
1647
1648         zpl_generic_fillattr(user_ns, ip, sp);
1649         /*
1650          * +1 link count for root inode with visible '.zfs' directory.
1651          */
1652         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1653                 if (sp->nlink < ZFS_LINK_MAX)
1654                         sp->nlink++;
1655
1656         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1657         sp->blksize = blksize;
1658         sp->blocks = nblocks;
1659
1660         if (unlikely(zp->z_blksz == 0)) {
1661                 /*
1662                  * Block size hasn't been set; suggest maximal I/O transfers.
1663                  */
1664                 sp->blksize = zfsvfs->z_max_blksz;
1665         }
1666
1667         mutex_exit(&zp->z_lock);
1668
1669         /*
1670          * Required to prevent NFS client from detecting different inode
1671          * numbers of snapshot root dentry before and after snapshot mount.
1672          */
1673         if (zfsvfs->z_issnap) {
1674                 if (ip->i_sb->s_root->d_inode == ip)
1675                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1676                             dmu_objset_id(zfsvfs->z_os);
1677         }
1678
1679         zfs_exit(zfsvfs, FTAG);
1680
1681         return (0);
1682 }
1683
1684 /*
1685  * For the operation of changing file's user/group/project, we need to
1686  * handle not only the main object that is assigned to the file directly,
1687  * but also the ones that are used by the file via hidden xattr directory.
1688  *
1689  * Because the xattr directory may contains many EA entries, as to it may
1690  * be impossible to change all of them via the transaction of changing the
1691  * main object's user/group/project attributes. Then we have to change them
1692  * via other multiple independent transactions one by one. It may be not good
1693  * solution, but we have no better idea yet.
1694  */
1695 static int
1696 zfs_setattr_dir(znode_t *dzp)
1697 {
1698         struct inode    *dxip = ZTOI(dzp);
1699         struct inode    *xip = NULL;
1700         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1701         objset_t        *os = zfsvfs->z_os;
1702         zap_cursor_t    zc;
1703         zap_attribute_t zap;
1704         zfs_dirlock_t   *dl;
1705         znode_t         *zp = NULL;
1706         dmu_tx_t        *tx = NULL;
1707         uint64_t        uid, gid;
1708         sa_bulk_attr_t  bulk[4];
1709         int             count;
1710         int             err;
1711
1712         zap_cursor_init(&zc, os, dzp->z_id);
1713         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
1714                 count = 0;
1715                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1716                         err = ENXIO;
1717                         break;
1718                 }
1719
1720                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1721                     ZEXISTS, NULL, NULL);
1722                 if (err == ENOENT)
1723                         goto next;
1724                 if (err)
1725                         break;
1726
1727                 xip = ZTOI(zp);
1728                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1729                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1730                     zp->z_projid == dzp->z_projid)
1731                         goto next;
1732
1733                 tx = dmu_tx_create(os);
1734                 if (!(zp->z_pflags & ZFS_PROJID))
1735                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1736                 else
1737                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1738
1739                 err = dmu_tx_assign(tx, TXG_WAIT);
1740                 if (err)
1741                         break;
1742
1743                 mutex_enter(&dzp->z_lock);
1744
1745                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1746                         xip->i_uid = dxip->i_uid;
1747                         uid = zfs_uid_read(dxip);
1748                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1749                             &uid, sizeof (uid));
1750                 }
1751
1752                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1753                         xip->i_gid = dxip->i_gid;
1754                         gid = zfs_gid_read(dxip);
1755                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1756                             &gid, sizeof (gid));
1757                 }
1758
1759                 if (zp->z_projid != dzp->z_projid) {
1760                         if (!(zp->z_pflags & ZFS_PROJID)) {
1761                                 zp->z_pflags |= ZFS_PROJID;
1762                                 SA_ADD_BULK_ATTR(bulk, count,
1763                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1764                                     sizeof (zp->z_pflags));
1765                         }
1766
1767                         zp->z_projid = dzp->z_projid;
1768                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1769                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1770                 }
1771
1772                 mutex_exit(&dzp->z_lock);
1773
1774                 if (likely(count > 0)) {
1775                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1776                         dmu_tx_commit(tx);
1777                 } else {
1778                         dmu_tx_abort(tx);
1779                 }
1780                 tx = NULL;
1781                 if (err != 0 && err != ENOENT)
1782                         break;
1783
1784 next:
1785                 if (zp) {
1786                         zrele(zp);
1787                         zp = NULL;
1788                         zfs_dirent_unlock(dl);
1789                 }
1790                 zap_cursor_advance(&zc);
1791         }
1792
1793         if (tx)
1794                 dmu_tx_abort(tx);
1795         if (zp) {
1796                 zrele(zp);
1797                 zfs_dirent_unlock(dl);
1798         }
1799         zap_cursor_fini(&zc);
1800
1801         return (err == ENOENT ? 0 : err);
1802 }
1803
1804 /*
1805  * Set the file attributes to the values contained in the
1806  * vattr structure.
1807  *
1808  *      IN:     zp      - znode of file to be modified.
1809  *              vap     - new attribute values.
1810  *                        If ATTR_XVATTR set, then optional attrs are being set
1811  *              flags   - ATTR_UTIME set if non-default time values provided.
1812  *                      - ATTR_NOACLCHECK (CIFS context only).
1813  *              cr      - credentials of caller.
1814  *
1815  *      RETURN: 0 if success
1816  *              error code if failure
1817  *
1818  * Timestamps:
1819  *      ip - ctime updated, mtime updated if size changed.
1820  */
1821 int
1822 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
1823 {
1824         struct inode    *ip;
1825         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1826         objset_t        *os = zfsvfs->z_os;
1827         zilog_t         *zilog;
1828         dmu_tx_t        *tx;
1829         vattr_t         oldva;
1830         xvattr_t        *tmpxvattr;
1831         uint_t          mask = vap->va_mask;
1832         uint_t          saved_mask = 0;
1833         int             trim_mask = 0;
1834         uint64_t        new_mode;
1835         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1836         uint64_t        xattr_obj;
1837         uint64_t        mtime[2], ctime[2], atime[2];
1838         uint64_t        projid = ZFS_INVALID_PROJID;
1839         znode_t         *attrzp;
1840         int             need_policy = FALSE;
1841         int             err, err2 = 0;
1842         zfs_fuid_info_t *fuidp = NULL;
1843         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1844         xoptattr_t      *xoap;
1845         zfs_acl_t       *aclp;
1846         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1847         boolean_t       fuid_dirtied = B_FALSE;
1848         boolean_t       handle_eadir = B_FALSE;
1849         sa_bulk_attr_t  *bulk, *xattr_bulk;
1850         int             count = 0, xattr_count = 0, bulks = 8;
1851
1852         if (mask == 0)
1853                 return (0);
1854
1855         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1856                 return (err);
1857         ip = ZTOI(zp);
1858
1859         /*
1860          * If this is a xvattr_t, then get a pointer to the structure of
1861          * optional attributes.  If this is NULL, then we have a vattr_t.
1862          */
1863         xoap = xva_getxoptattr(xvap);
1864         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1865                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1866                         if (!dmu_objset_projectquota_enabled(os) ||
1867                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1868                                 zfs_exit(zfsvfs, FTAG);
1869                                 return (SET_ERROR(ENOTSUP));
1870                         }
1871
1872                         projid = xoap->xoa_projid;
1873                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1874                                 zfs_exit(zfsvfs, FTAG);
1875                                 return (SET_ERROR(EINVAL));
1876                         }
1877
1878                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1879                                 projid = ZFS_INVALID_PROJID;
1880                         else
1881                                 need_policy = TRUE;
1882                 }
1883
1884                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1885                     (xoap->xoa_projinherit !=
1886                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1887                     (!dmu_objset_projectquota_enabled(os) ||
1888                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1889                         zfs_exit(zfsvfs, FTAG);
1890                         return (SET_ERROR(ENOTSUP));
1891                 }
1892         }
1893
1894         zilog = zfsvfs->z_log;
1895
1896         /*
1897          * Make sure that if we have ephemeral uid/gid or xvattr specified
1898          * that file system is at proper version level
1899          */
1900
1901         if (zfsvfs->z_use_fuids == B_FALSE &&
1902             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1903             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1904             (mask & ATTR_XVATTR))) {
1905                 zfs_exit(zfsvfs, FTAG);
1906                 return (SET_ERROR(EINVAL));
1907         }
1908
1909         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1910                 zfs_exit(zfsvfs, FTAG);
1911                 return (SET_ERROR(EISDIR));
1912         }
1913
1914         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1915                 zfs_exit(zfsvfs, FTAG);
1916                 return (SET_ERROR(EINVAL));
1917         }
1918
1919         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1920         xva_init(tmpxvattr);
1921
1922         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1923         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1924
1925         /*
1926          * Immutable files can only alter immutable bit and atime
1927          */
1928         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1929             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1930             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1931                 err = SET_ERROR(EPERM);
1932                 goto out3;
1933         }
1934
1935         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1936                 err = SET_ERROR(EPERM);
1937                 goto out3;
1938         }
1939
1940         /*
1941          * Verify timestamps doesn't overflow 32 bits.
1942          * ZFS can handle large timestamps, but 32bit syscalls can't
1943          * handle times greater than 2039.  This check should be removed
1944          * once large timestamps are fully supported.
1945          */
1946         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1947                 if (((mask & ATTR_ATIME) &&
1948                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1949                     ((mask & ATTR_MTIME) &&
1950                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1951                         err = SET_ERROR(EOVERFLOW);
1952                         goto out3;
1953                 }
1954         }
1955
1956 top:
1957         attrzp = NULL;
1958         aclp = NULL;
1959
1960         /* Can this be moved to before the top label? */
1961         if (zfs_is_readonly(zfsvfs)) {
1962                 err = SET_ERROR(EROFS);
1963                 goto out3;
1964         }
1965
1966         /*
1967          * First validate permissions
1968          */
1969
1970         if (mask & ATTR_SIZE) {
1971                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
1972                 if (err)
1973                         goto out3;
1974
1975                 /*
1976                  * XXX - Note, we are not providing any open
1977                  * mode flags here (like FNDELAY), so we may
1978                  * block if there are locks present... this
1979                  * should be addressed in openat().
1980                  */
1981                 /* XXX - would it be OK to generate a log record here? */
1982                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
1983                 if (err)
1984                         goto out3;
1985         }
1986
1987         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
1988             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
1989             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
1990             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
1991             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
1992             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
1993             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
1994             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
1995                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
1996                     skipaclchk, cr);
1997         }
1998
1999         if (mask & (ATTR_UID|ATTR_GID)) {
2000                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
2001                 int     take_owner;
2002                 int     take_group;
2003
2004                 /*
2005                  * NOTE: even if a new mode is being set,
2006                  * we may clear S_ISUID/S_ISGID bits.
2007                  */
2008
2009                 if (!(mask & ATTR_MODE))
2010                         vap->va_mode = zp->z_mode;
2011
2012                 /*
2013                  * Take ownership or chgrp to group we are a member of
2014                  */
2015
2016                 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2017                 take_group = (mask & ATTR_GID) &&
2018                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2019
2020                 /*
2021                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2022                  * take_group must both be set in order to allow taking
2023                  * ownership.
2024                  *
2025                  * Otherwise, send the check through secpolicy_vnode_setattr()
2026                  *
2027                  */
2028
2029                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2030                     take_owner && take_group) ||
2031                     ((idmask == ATTR_UID) && take_owner) ||
2032                     ((idmask == ATTR_GID) && take_group)) {
2033                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2034                             skipaclchk, cr) == 0) {
2035                                 /*
2036                                  * Remove setuid/setgid for non-privileged users
2037                                  */
2038                                 (void) secpolicy_setid_clear(vap, cr);
2039                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2040                         } else {
2041                                 need_policy =  TRUE;
2042                         }
2043                 } else {
2044                         need_policy =  TRUE;
2045                 }
2046         }
2047
2048         mutex_enter(&zp->z_lock);
2049         oldva.va_mode = zp->z_mode;
2050         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2051         if (mask & ATTR_XVATTR) {
2052                 /*
2053                  * Update xvattr mask to include only those attributes
2054                  * that are actually changing.
2055                  *
2056                  * the bits will be restored prior to actually setting
2057                  * the attributes so the caller thinks they were set.
2058                  */
2059                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2060                         if (xoap->xoa_appendonly !=
2061                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2062                                 need_policy = TRUE;
2063                         } else {
2064                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2065                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2066                         }
2067                 }
2068
2069                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2070                         if (xoap->xoa_projinherit !=
2071                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2072                                 need_policy = TRUE;
2073                         } else {
2074                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2075                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2076                         }
2077                 }
2078
2079                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2080                         if (xoap->xoa_nounlink !=
2081                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2082                                 need_policy = TRUE;
2083                         } else {
2084                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2085                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2086                         }
2087                 }
2088
2089                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2090                         if (xoap->xoa_immutable !=
2091                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2092                                 need_policy = TRUE;
2093                         } else {
2094                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2095                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2096                         }
2097                 }
2098
2099                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2100                         if (xoap->xoa_nodump !=
2101                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2102                                 need_policy = TRUE;
2103                         } else {
2104                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2105                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2106                         }
2107                 }
2108
2109                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2110                         if (xoap->xoa_av_modified !=
2111                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2112                                 need_policy = TRUE;
2113                         } else {
2114                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2115                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2116                         }
2117                 }
2118
2119                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2120                         if ((!S_ISREG(ip->i_mode) &&
2121                             xoap->xoa_av_quarantined) ||
2122                             xoap->xoa_av_quarantined !=
2123                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2124                                 need_policy = TRUE;
2125                         } else {
2126                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2127                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2128                         }
2129                 }
2130
2131                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2132                         mutex_exit(&zp->z_lock);
2133                         err = SET_ERROR(EPERM);
2134                         goto out3;
2135                 }
2136
2137                 if (need_policy == FALSE &&
2138                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2139                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2140                         need_policy = TRUE;
2141                 }
2142         }
2143
2144         mutex_exit(&zp->z_lock);
2145
2146         if (mask & ATTR_MODE) {
2147                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2148                         err = secpolicy_setid_setsticky_clear(ip, vap,
2149                             &oldva, cr);
2150                         if (err)
2151                                 goto out3;
2152
2153                         trim_mask |= ATTR_MODE;
2154                 } else {
2155                         need_policy = TRUE;
2156                 }
2157         }
2158
2159         if (need_policy) {
2160                 /*
2161                  * If trim_mask is set then take ownership
2162                  * has been granted or write_acl is present and user
2163                  * has the ability to modify mode.  In that case remove
2164                  * UID|GID and or MODE from mask so that
2165                  * secpolicy_vnode_setattr() doesn't revoke it.
2166                  */
2167
2168                 if (trim_mask) {
2169                         saved_mask = vap->va_mask;
2170                         vap->va_mask &= ~trim_mask;
2171                 }
2172                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2173                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2174                 if (err)
2175                         goto out3;
2176
2177                 if (trim_mask)
2178                         vap->va_mask |= saved_mask;
2179         }
2180
2181         /*
2182          * secpolicy_vnode_setattr, or take ownership may have
2183          * changed va_mask
2184          */
2185         mask = vap->va_mask;
2186
2187         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2188                 handle_eadir = B_TRUE;
2189                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2190                     &xattr_obj, sizeof (xattr_obj));
2191
2192                 if (err == 0 && xattr_obj) {
2193                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2194                         if (err)
2195                                 goto out2;
2196                 }
2197                 if (mask & ATTR_UID) {
2198                         new_kuid = zfs_fuid_create(zfsvfs,
2199                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2200                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2201                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2202                             new_kuid)) {
2203                                 if (attrzp)
2204                                         zrele(attrzp);
2205                                 err = SET_ERROR(EDQUOT);
2206                                 goto out2;
2207                         }
2208                 }
2209
2210                 if (mask & ATTR_GID) {
2211                         new_kgid = zfs_fuid_create(zfsvfs,
2212                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2213                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2214                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2215                             new_kgid)) {
2216                                 if (attrzp)
2217                                         zrele(attrzp);
2218                                 err = SET_ERROR(EDQUOT);
2219                                 goto out2;
2220                         }
2221                 }
2222
2223                 if (projid != ZFS_INVALID_PROJID &&
2224                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2225                         if (attrzp)
2226                                 zrele(attrzp);
2227                         err = EDQUOT;
2228                         goto out2;
2229                 }
2230         }
2231         tx = dmu_tx_create(os);
2232
2233         if (mask & ATTR_MODE) {
2234                 uint64_t pmode = zp->z_mode;
2235                 uint64_t acl_obj;
2236                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2237
2238                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2239                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2240                         err = EPERM;
2241                         goto out;
2242                 }
2243
2244                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2245                         goto out;
2246
2247                 mutex_enter(&zp->z_lock);
2248                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2249                         /*
2250                          * Are we upgrading ACL from old V0 format
2251                          * to V1 format?
2252                          */
2253                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2254                             zfs_znode_acl_version(zp) ==
2255                             ZFS_ACL_VERSION_INITIAL) {
2256                                 dmu_tx_hold_free(tx, acl_obj, 0,
2257                                     DMU_OBJECT_END);
2258                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2259                                     0, aclp->z_acl_bytes);
2260                         } else {
2261                                 dmu_tx_hold_write(tx, acl_obj, 0,
2262                                     aclp->z_acl_bytes);
2263                         }
2264                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2265                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2266                             0, aclp->z_acl_bytes);
2267                 }
2268                 mutex_exit(&zp->z_lock);
2269                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2270         } else {
2271                 if (((mask & ATTR_XVATTR) &&
2272                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2273                     (projid != ZFS_INVALID_PROJID &&
2274                     !(zp->z_pflags & ZFS_PROJID)))
2275                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2276                 else
2277                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2278         }
2279
2280         if (attrzp) {
2281                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2282         }
2283
2284         fuid_dirtied = zfsvfs->z_fuid_dirty;
2285         if (fuid_dirtied)
2286                 zfs_fuid_txhold(zfsvfs, tx);
2287
2288         zfs_sa_upgrade_txholds(tx, zp);
2289
2290         err = dmu_tx_assign(tx, TXG_WAIT);
2291         if (err)
2292                 goto out;
2293
2294         count = 0;
2295         /*
2296          * Set each attribute requested.
2297          * We group settings according to the locks they need to acquire.
2298          *
2299          * Note: you cannot set ctime directly, although it will be
2300          * updated as a side-effect of calling this function.
2301          */
2302
2303         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2304                 /*
2305                  * For the existed object that is upgraded from old system,
2306                  * its on-disk layout has no slot for the project ID attribute.
2307                  * But quota accounting logic needs to access related slots by
2308                  * offset directly. So we need to adjust old objects' layout
2309                  * to make the project ID to some unified and fixed offset.
2310                  */
2311                 if (attrzp)
2312                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2313                 if (err == 0)
2314                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2315
2316                 if (unlikely(err == EEXIST))
2317                         err = 0;
2318                 else if (err != 0)
2319                         goto out;
2320                 else
2321                         projid = ZFS_INVALID_PROJID;
2322         }
2323
2324         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2325                 mutex_enter(&zp->z_acl_lock);
2326         mutex_enter(&zp->z_lock);
2327
2328         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2329             &zp->z_pflags, sizeof (zp->z_pflags));
2330
2331         if (attrzp) {
2332                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2333                         mutex_enter(&attrzp->z_acl_lock);
2334                 mutex_enter(&attrzp->z_lock);
2335                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2336                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2337                     sizeof (attrzp->z_pflags));
2338                 if (projid != ZFS_INVALID_PROJID) {
2339                         attrzp->z_projid = projid;
2340                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2341                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2342                             sizeof (attrzp->z_projid));
2343                 }
2344         }
2345
2346         if (mask & (ATTR_UID|ATTR_GID)) {
2347
2348                 if (mask & ATTR_UID) {
2349                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2350                         new_uid = zfs_uid_read(ZTOI(zp));
2351                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2352                             &new_uid, sizeof (new_uid));
2353                         if (attrzp) {
2354                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2355                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2356                                     sizeof (new_uid));
2357                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2358                         }
2359                 }
2360
2361                 if (mask & ATTR_GID) {
2362                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2363                         new_gid = zfs_gid_read(ZTOI(zp));
2364                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2365                             NULL, &new_gid, sizeof (new_gid));
2366                         if (attrzp) {
2367                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2368                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2369                                     sizeof (new_gid));
2370                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2371                         }
2372                 }
2373                 if (!(mask & ATTR_MODE)) {
2374                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2375                             NULL, &new_mode, sizeof (new_mode));
2376                         new_mode = zp->z_mode;
2377                 }
2378                 err = zfs_acl_chown_setattr(zp);
2379                 ASSERT(err == 0);
2380                 if (attrzp) {
2381                         err = zfs_acl_chown_setattr(attrzp);
2382                         ASSERT(err == 0);
2383                 }
2384         }
2385
2386         if (mask & ATTR_MODE) {
2387                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2388                     &new_mode, sizeof (new_mode));
2389                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2390                 ASSERT3P(aclp, !=, NULL);
2391                 err = zfs_aclset_common(zp, aclp, cr, tx);
2392                 ASSERT0(err);
2393                 if (zp->z_acl_cached)
2394                         zfs_acl_free(zp->z_acl_cached);
2395                 zp->z_acl_cached = aclp;
2396                 aclp = NULL;
2397         }
2398
2399         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2400                 zp->z_atime_dirty = B_FALSE;
2401                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
2402                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2403                     &atime, sizeof (atime));
2404         }
2405
2406         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2407                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2408                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
2409                     vap->va_mtime, ZTOI(zp));
2410
2411                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2412                     mtime, sizeof (mtime));
2413         }
2414
2415         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2416                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2417                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
2418                     ZTOI(zp));
2419                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2420                     ctime, sizeof (ctime));
2421         }
2422
2423         if (projid != ZFS_INVALID_PROJID) {
2424                 zp->z_projid = projid;
2425                 SA_ADD_BULK_ATTR(bulk, count,
2426                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2427                     sizeof (zp->z_projid));
2428         }
2429
2430         if (attrzp && mask) {
2431                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2432                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2433                     sizeof (ctime));
2434         }
2435
2436         /*
2437          * Do this after setting timestamps to prevent timestamp
2438          * update from toggling bit
2439          */
2440
2441         if (xoap && (mask & ATTR_XVATTR)) {
2442
2443                 /*
2444                  * restore trimmed off masks
2445                  * so that return masks can be set for caller.
2446                  */
2447
2448                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2449                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2450                 }
2451                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2452                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2453                 }
2454                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2455                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2456                 }
2457                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2458                         XVA_SET_REQ(xvap, XAT_NODUMP);
2459                 }
2460                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2461                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2462                 }
2463                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2464                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2465                 }
2466                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2467                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2468                 }
2469
2470                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2471                         ASSERT(S_ISREG(ip->i_mode));
2472
2473                 zfs_xvattr_set(zp, xvap, tx);
2474         }
2475
2476         if (fuid_dirtied)
2477                 zfs_fuid_sync(zfsvfs, tx);
2478
2479         if (mask != 0)
2480                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2481
2482         mutex_exit(&zp->z_lock);
2483         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2484                 mutex_exit(&zp->z_acl_lock);
2485
2486         if (attrzp) {
2487                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2488                         mutex_exit(&attrzp->z_acl_lock);
2489                 mutex_exit(&attrzp->z_lock);
2490         }
2491 out:
2492         if (err == 0 && xattr_count > 0) {
2493                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2494                     xattr_count, tx);
2495                 ASSERT(err2 == 0);
2496         }
2497
2498         if (aclp)
2499                 zfs_acl_free(aclp);
2500
2501         if (fuidp) {
2502                 zfs_fuid_info_free(fuidp);
2503                 fuidp = NULL;
2504         }
2505
2506         if (err) {
2507                 dmu_tx_abort(tx);
2508                 if (attrzp)
2509                         zrele(attrzp);
2510                 if (err == ERESTART)
2511                         goto top;
2512         } else {
2513                 if (count > 0)
2514                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2515                 dmu_tx_commit(tx);
2516                 if (attrzp) {
2517                         if (err2 == 0 && handle_eadir)
2518                                 err2 = zfs_setattr_dir(attrzp);
2519                         zrele(attrzp);
2520                 }
2521                 zfs_znode_update_vfs(zp);
2522         }
2523
2524 out2:
2525         if (os->os_sync == ZFS_SYNC_ALWAYS)
2526                 zil_commit(zilog, 0);
2527
2528 out3:
2529         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2530         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2531         kmem_free(tmpxvattr, sizeof (xvattr_t));
2532         zfs_exit(zfsvfs, FTAG);
2533         return (err);
2534 }
2535
2536 typedef struct zfs_zlock {
2537         krwlock_t       *zl_rwlock;     /* lock we acquired */
2538         znode_t         *zl_znode;      /* znode we held */
2539         struct zfs_zlock *zl_next;      /* next in list */
2540 } zfs_zlock_t;
2541
2542 /*
2543  * Drop locks and release vnodes that were held by zfs_rename_lock().
2544  */
2545 static void
2546 zfs_rename_unlock(zfs_zlock_t **zlpp)
2547 {
2548         zfs_zlock_t *zl;
2549
2550         while ((zl = *zlpp) != NULL) {
2551                 if (zl->zl_znode != NULL)
2552                         zfs_zrele_async(zl->zl_znode);
2553                 rw_exit(zl->zl_rwlock);
2554                 *zlpp = zl->zl_next;
2555                 kmem_free(zl, sizeof (*zl));
2556         }
2557 }
2558
2559 /*
2560  * Search back through the directory tree, using the ".." entries.
2561  * Lock each directory in the chain to prevent concurrent renames.
2562  * Fail any attempt to move a directory into one of its own descendants.
2563  * XXX - z_parent_lock can overlap with map or grow locks
2564  */
2565 static int
2566 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2567 {
2568         zfs_zlock_t     *zl;
2569         znode_t         *zp = tdzp;
2570         uint64_t        rootid = ZTOZSB(zp)->z_root;
2571         uint64_t        oidp = zp->z_id;
2572         krwlock_t       *rwlp = &szp->z_parent_lock;
2573         krw_t           rw = RW_WRITER;
2574
2575         /*
2576          * First pass write-locks szp and compares to zp->z_id.
2577          * Later passes read-lock zp and compare to zp->z_parent.
2578          */
2579         do {
2580                 if (!rw_tryenter(rwlp, rw)) {
2581                         /*
2582                          * Another thread is renaming in this path.
2583                          * Note that if we are a WRITER, we don't have any
2584                          * parent_locks held yet.
2585                          */
2586                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2587                                 /*
2588                                  * Drop our locks and restart
2589                                  */
2590                                 zfs_rename_unlock(&zl);
2591                                 *zlpp = NULL;
2592                                 zp = tdzp;
2593                                 oidp = zp->z_id;
2594                                 rwlp = &szp->z_parent_lock;
2595                                 rw = RW_WRITER;
2596                                 continue;
2597                         } else {
2598                                 /*
2599                                  * Wait for other thread to drop its locks
2600                                  */
2601                                 rw_enter(rwlp, rw);
2602                         }
2603                 }
2604
2605                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2606                 zl->zl_rwlock = rwlp;
2607                 zl->zl_znode = NULL;
2608                 zl->zl_next = *zlpp;
2609                 *zlpp = zl;
2610
2611                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2612                         return (SET_ERROR(EINVAL));
2613
2614                 if (oidp == rootid)             /* We've hit the top */
2615                         return (0);
2616
2617                 if (rw == RW_READER) {          /* i.e. not the first pass */
2618                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2619                         if (error)
2620                                 return (error);
2621                         zl->zl_znode = zp;
2622                 }
2623                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2624                     &oidp, sizeof (oidp));
2625                 rwlp = &zp->z_parent_lock;
2626                 rw = RW_READER;
2627
2628         } while (zp->z_id != sdzp->z_id);
2629
2630         return (0);
2631 }
2632
2633 /*
2634  * Move an entry from the provided source directory to the target
2635  * directory.  Change the entry name as indicated.
2636  *
2637  *      IN:     sdzp    - Source directory containing the "old entry".
2638  *              snm     - Old entry name.
2639  *              tdzp    - Target directory to contain the "new entry".
2640  *              tnm     - New entry name.
2641  *              cr      - credentials of caller.
2642  *              flags   - case flags
2643  *
2644  *      RETURN: 0 on success, error code on failure.
2645  *
2646  * Timestamps:
2647  *      sdzp,tdzp - ctime|mtime updated
2648  */
2649 int
2650 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
2651     cred_t *cr, int flags)
2652 {
2653         znode_t         *szp, *tzp;
2654         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
2655         zilog_t         *zilog;
2656         zfs_dirlock_t   *sdl, *tdl;
2657         dmu_tx_t        *tx;
2658         zfs_zlock_t     *zl;
2659         int             cmp, serr, terr;
2660         int             error = 0;
2661         int             zflg = 0;
2662         boolean_t       waited = B_FALSE;
2663
2664         if (snm == NULL || tnm == NULL)
2665                 return (SET_ERROR(EINVAL));
2666
2667         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2668                 return (error);
2669         zilog = zfsvfs->z_log;
2670
2671         if ((error = zfs_verify_zp(tdzp)) != 0) {
2672                 zfs_exit(zfsvfs, FTAG);
2673                 return (error);
2674         }
2675
2676         /*
2677          * We check i_sb because snapshots and the ctldir must have different
2678          * super blocks.
2679          */
2680         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
2681             zfsctl_is_node(ZTOI(tdzp))) {
2682                 zfs_exit(zfsvfs, FTAG);
2683                 return (SET_ERROR(EXDEV));
2684         }
2685
2686         if (zfsvfs->z_utf8 && u8_validate(tnm,
2687             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2688                 zfs_exit(zfsvfs, FTAG);
2689                 return (SET_ERROR(EILSEQ));
2690         }
2691
2692         if (flags & FIGNORECASE)
2693                 zflg |= ZCILOOK;
2694
2695 top:
2696         szp = NULL;
2697         tzp = NULL;
2698         zl = NULL;
2699
2700         /*
2701          * This is to prevent the creation of links into attribute space
2702          * by renaming a linked file into/outof an attribute directory.
2703          * See the comment in zfs_link() for why this is considered bad.
2704          */
2705         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
2706                 zfs_exit(zfsvfs, FTAG);
2707                 return (SET_ERROR(EINVAL));
2708         }
2709
2710         /*
2711          * Lock source and target directory entries.  To prevent deadlock,
2712          * a lock ordering must be defined.  We lock the directory with
2713          * the smallest object id first, or if it's a tie, the one with
2714          * the lexically first name.
2715          */
2716         if (sdzp->z_id < tdzp->z_id) {
2717                 cmp = -1;
2718         } else if (sdzp->z_id > tdzp->z_id) {
2719                 cmp = 1;
2720         } else {
2721                 /*
2722                  * First compare the two name arguments without
2723                  * considering any case folding.
2724                  */
2725                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
2726
2727                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
2728                 ASSERT(error == 0 || !zfsvfs->z_utf8);
2729                 if (cmp == 0) {
2730                         /*
2731                          * POSIX: "If the old argument and the new argument
2732                          * both refer to links to the same existing file,
2733                          * the rename() function shall return successfully
2734                          * and perform no other action."
2735                          */
2736                         zfs_exit(zfsvfs, FTAG);
2737                         return (0);
2738                 }
2739                 /*
2740                  * If the file system is case-folding, then we may
2741                  * have some more checking to do.  A case-folding file
2742                  * system is either supporting mixed case sensitivity
2743                  * access or is completely case-insensitive.  Note
2744                  * that the file system is always case preserving.
2745                  *
2746                  * In mixed sensitivity mode case sensitive behavior
2747                  * is the default.  FIGNORECASE must be used to
2748                  * explicitly request case insensitive behavior.
2749                  *
2750                  * If the source and target names provided differ only
2751                  * by case (e.g., a request to rename 'tim' to 'Tim'),
2752                  * we will treat this as a special case in the
2753                  * case-insensitive mode: as long as the source name
2754                  * is an exact match, we will allow this to proceed as
2755                  * a name-change request.
2756                  */
2757                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
2758                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
2759                     flags & FIGNORECASE)) &&
2760                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
2761                     &error) == 0) {
2762                         /*
2763                          * case preserving rename request, require exact
2764                          * name matches
2765                          */
2766                         zflg |= ZCIEXACT;
2767                         zflg &= ~ZCILOOK;
2768                 }
2769         }
2770
2771         /*
2772          * If the source and destination directories are the same, we should
2773          * grab the z_name_lock of that directory only once.
2774          */
2775         if (sdzp == tdzp) {
2776                 zflg |= ZHAVELOCK;
2777                 rw_enter(&sdzp->z_name_lock, RW_READER);
2778         }
2779
2780         if (cmp < 0) {
2781                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
2782                     ZEXISTS | zflg, NULL, NULL);
2783                 terr = zfs_dirent_lock(&tdl,
2784                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
2785         } else {
2786                 terr = zfs_dirent_lock(&tdl,
2787                     tdzp, tnm, &tzp, zflg, NULL, NULL);
2788                 serr = zfs_dirent_lock(&sdl,
2789                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
2790                     NULL, NULL);
2791         }
2792
2793         if (serr) {
2794                 /*
2795                  * Source entry invalid or not there.
2796                  */
2797                 if (!terr) {
2798                         zfs_dirent_unlock(tdl);
2799                         if (tzp)
2800                                 zrele(tzp);
2801                 }
2802
2803                 if (sdzp == tdzp)
2804                         rw_exit(&sdzp->z_name_lock);
2805
2806                 if (strcmp(snm, "..") == 0)
2807                         serr = EINVAL;
2808                 zfs_exit(zfsvfs, FTAG);
2809                 return (serr);
2810         }
2811         if (terr) {
2812                 zfs_dirent_unlock(sdl);
2813                 zrele(szp);
2814
2815                 if (sdzp == tdzp)
2816                         rw_exit(&sdzp->z_name_lock);
2817
2818                 if (strcmp(tnm, "..") == 0)
2819                         terr = EINVAL;
2820                 zfs_exit(zfsvfs, FTAG);
2821                 return (terr);
2822         }
2823
2824         /*
2825          * If we are using project inheritance, means if the directory has
2826          * ZFS_PROJINHERIT set, then its descendant directories will inherit
2827          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
2828          * such case, we only allow renames into our tree when the project
2829          * IDs are the same.
2830          */
2831         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
2832             tdzp->z_projid != szp->z_projid) {
2833                 error = SET_ERROR(EXDEV);
2834                 goto out;
2835         }
2836
2837         /*
2838          * Must have write access at the source to remove the old entry
2839          * and write access at the target to create the new entry.
2840          * Note that if target and source are the same, this can be
2841          * done in a single check.
2842          */
2843
2844         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
2845                 goto out;
2846
2847         if (S_ISDIR(ZTOI(szp)->i_mode)) {
2848                 /*
2849                  * Check to make sure rename is valid.
2850                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2851                  */
2852                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
2853                         goto out;
2854         }
2855
2856         /*
2857          * Does target exist?
2858          */
2859         if (tzp) {
2860                 /*
2861                  * Source and target must be the same type.
2862                  */
2863                 if (S_ISDIR(ZTOI(szp)->i_mode)) {
2864                         if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
2865                                 error = SET_ERROR(ENOTDIR);
2866                                 goto out;
2867                         }
2868                 } else {
2869                         if (S_ISDIR(ZTOI(tzp)->i_mode)) {
2870                                 error = SET_ERROR(EISDIR);
2871                                 goto out;
2872                         }
2873                 }
2874                 /*
2875                  * POSIX dictates that when the source and target
2876                  * entries refer to the same file object, rename
2877                  * must do nothing and exit without error.
2878                  */
2879                 if (szp->z_id == tzp->z_id) {
2880                         error = 0;
2881                         goto out;
2882                 }
2883         }
2884
2885         tx = dmu_tx_create(zfsvfs->z_os);
2886         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
2887         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
2888         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2889         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2890         if (sdzp != tdzp) {
2891                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
2892                 zfs_sa_upgrade_txholds(tx, tdzp);
2893         }
2894         if (tzp) {
2895                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
2896                 zfs_sa_upgrade_txholds(tx, tzp);
2897         }
2898
2899         zfs_sa_upgrade_txholds(tx, szp);
2900         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2901         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2902         if (error) {
2903                 if (zl != NULL)
2904                         zfs_rename_unlock(&zl);
2905                 zfs_dirent_unlock(sdl);
2906                 zfs_dirent_unlock(tdl);
2907
2908                 if (sdzp == tdzp)
2909                         rw_exit(&sdzp->z_name_lock);
2910
2911                 if (error == ERESTART) {
2912                         waited = B_TRUE;
2913                         dmu_tx_wait(tx);
2914                         dmu_tx_abort(tx);
2915                         zrele(szp);
2916                         if (tzp)
2917                                 zrele(tzp);
2918                         goto top;
2919                 }
2920                 dmu_tx_abort(tx);
2921                 zrele(szp);
2922                 if (tzp)
2923                         zrele(tzp);
2924                 zfs_exit(zfsvfs, FTAG);
2925                 return (error);
2926         }
2927
2928         if (tzp)        /* Attempt to remove the existing target */
2929                 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
2930
2931         if (error == 0) {
2932                 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2933                 if (error == 0) {
2934                         szp->z_pflags |= ZFS_AV_MODIFIED;
2935                         if (tdzp->z_pflags & ZFS_PROJINHERIT)
2936                                 szp->z_pflags |= ZFS_PROJINHERIT;
2937
2938                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
2939                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
2940                         ASSERT0(error);
2941
2942                         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2943                         if (error == 0) {
2944                                 zfs_log_rename(zilog, tx, TX_RENAME |
2945                                     (flags & FIGNORECASE ? TX_CI : 0), sdzp,
2946                                     sdl->dl_name, tdzp, tdl->dl_name, szp);
2947                         } else {
2948                                 /*
2949                                  * At this point, we have successfully created
2950                                  * the target name, but have failed to remove
2951                                  * the source name.  Since the create was done
2952                                  * with the ZRENAMING flag, there are
2953                                  * complications; for one, the link count is
2954                                  * wrong.  The easiest way to deal with this
2955                                  * is to remove the newly created target, and
2956                                  * return the original error.  This must
2957                                  * succeed; fortunately, it is very unlikely to
2958                                  * fail, since we just created it.
2959                                  */
2960                                 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
2961                                     ZRENAMING, NULL), ==, 0);
2962                         }
2963                 } else {
2964                         /*
2965                          * If we had removed the existing target, subsequent
2966                          * call to zfs_link_create() to add back the same entry
2967                          * but, the new dnode (szp) should not fail.
2968                          */
2969                         ASSERT(tzp == NULL);
2970                 }
2971         }
2972
2973         dmu_tx_commit(tx);
2974 out:
2975         if (zl != NULL)
2976                 zfs_rename_unlock(&zl);
2977
2978         zfs_dirent_unlock(sdl);
2979         zfs_dirent_unlock(tdl);
2980
2981         zfs_znode_update_vfs(sdzp);
2982         if (sdzp == tdzp)
2983                 rw_exit(&sdzp->z_name_lock);
2984
2985         if (sdzp != tdzp)
2986                 zfs_znode_update_vfs(tdzp);
2987
2988         zfs_znode_update_vfs(szp);
2989         zrele(szp);
2990         if (tzp) {
2991                 zfs_znode_update_vfs(tzp);
2992                 zrele(tzp);
2993         }
2994
2995         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2996                 zil_commit(zilog, 0);
2997
2998         zfs_exit(zfsvfs, FTAG);
2999         return (error);
3000 }
3001
3002 /*
3003  * Insert the indicated symbolic reference entry into the directory.
3004  *
3005  *      IN:     dzp     - Directory to contain new symbolic link.
3006  *              name    - Name of directory entry in dip.
3007  *              vap     - Attributes of new entry.
3008  *              link    - Name for new symlink entry.
3009  *              cr      - credentials of caller.
3010  *              flags   - case flags
3011  *
3012  *      OUT:    zpp     - Znode for new symbolic link.
3013  *
3014  *      RETURN: 0 on success, error code on failure.
3015  *
3016  * Timestamps:
3017  *      dip - ctime|mtime updated
3018  */
3019 int
3020 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
3021     znode_t **zpp, cred_t *cr, int flags)
3022 {
3023         znode_t         *zp;
3024         zfs_dirlock_t   *dl;
3025         dmu_tx_t        *tx;
3026         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
3027         zilog_t         *zilog;
3028         uint64_t        len = strlen(link);
3029         int             error;
3030         int             zflg = ZNEW;
3031         zfs_acl_ids_t   acl_ids;
3032         boolean_t       fuid_dirtied;
3033         uint64_t        txtype = TX_SYMLINK;
3034         boolean_t       waited = B_FALSE;
3035
3036         ASSERT(S_ISLNK(vap->va_mode));
3037
3038         if (name == NULL)
3039                 return (SET_ERROR(EINVAL));
3040
3041         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3042                 return (error);
3043         zilog = zfsvfs->z_log;
3044
3045         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3046             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3047                 zfs_exit(zfsvfs, FTAG);
3048                 return (SET_ERROR(EILSEQ));
3049         }
3050         if (flags & FIGNORECASE)
3051                 zflg |= ZCILOOK;
3052
3053         if (len > MAXPATHLEN) {
3054                 zfs_exit(zfsvfs, FTAG);
3055                 return (SET_ERROR(ENAMETOOLONG));
3056         }
3057
3058         if ((error = zfs_acl_ids_create(dzp, 0,
3059             vap, cr, NULL, &acl_ids)) != 0) {
3060                 zfs_exit(zfsvfs, FTAG);
3061                 return (error);
3062         }
3063 top:
3064         *zpp = NULL;
3065
3066         /*
3067          * Attempt to lock directory; fail if entry already exists.
3068          */
3069         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3070         if (error) {
3071                 zfs_acl_ids_free(&acl_ids);
3072                 zfs_exit(zfsvfs, FTAG);
3073                 return (error);
3074         }
3075
3076         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3077                 zfs_acl_ids_free(&acl_ids);
3078                 zfs_dirent_unlock(dl);
3079                 zfs_exit(zfsvfs, FTAG);
3080                 return (error);
3081         }
3082
3083         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3084                 zfs_acl_ids_free(&acl_ids);
3085                 zfs_dirent_unlock(dl);
3086                 zfs_exit(zfsvfs, FTAG);
3087                 return (SET_ERROR(EDQUOT));
3088         }
3089         tx = dmu_tx_create(zfsvfs->z_os);
3090         fuid_dirtied = zfsvfs->z_fuid_dirty;
3091         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3092         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3093         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3094             ZFS_SA_BASE_ATTR_SIZE + len);
3095         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3096         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3097                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3098                     acl_ids.z_aclp->z_acl_bytes);
3099         }
3100         if (fuid_dirtied)
3101                 zfs_fuid_txhold(zfsvfs, tx);
3102         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3103         if (error) {
3104                 zfs_dirent_unlock(dl);
3105                 if (error == ERESTART) {
3106                         waited = B_TRUE;
3107                         dmu_tx_wait(tx);
3108                         dmu_tx_abort(tx);
3109                         goto top;
3110                 }
3111                 zfs_acl_ids_free(&acl_ids);
3112                 dmu_tx_abort(tx);
3113                 zfs_exit(zfsvfs, FTAG);
3114                 return (error);
3115         }
3116
3117         /*
3118          * Create a new object for the symlink.
3119          * for version 4 ZPL datasets the symlink will be an SA attribute
3120          */
3121         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3122
3123         if (fuid_dirtied)
3124                 zfs_fuid_sync(zfsvfs, tx);
3125
3126         mutex_enter(&zp->z_lock);
3127         if (zp->z_is_sa)
3128                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3129                     link, len, tx);
3130         else
3131                 zfs_sa_symlink(zp, link, len, tx);
3132         mutex_exit(&zp->z_lock);
3133
3134         zp->z_size = len;
3135         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3136             &zp->z_size, sizeof (zp->z_size), tx);
3137         /*
3138          * Insert the new object into the directory.
3139          */
3140         error = zfs_link_create(dl, zp, tx, ZNEW);
3141         if (error != 0) {
3142                 zfs_znode_delete(zp, tx);
3143                 remove_inode_hash(ZTOI(zp));
3144         } else {
3145                 if (flags & FIGNORECASE)
3146                         txtype |= TX_CI;
3147                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3148
3149                 zfs_znode_update_vfs(dzp);
3150                 zfs_znode_update_vfs(zp);
3151         }
3152
3153         zfs_acl_ids_free(&acl_ids);
3154
3155         dmu_tx_commit(tx);
3156
3157         zfs_dirent_unlock(dl);
3158
3159         if (error == 0) {
3160                 *zpp = zp;
3161
3162                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3163                         zil_commit(zilog, 0);
3164         } else {
3165                 zrele(zp);
3166         }
3167
3168         zfs_exit(zfsvfs, FTAG);
3169         return (error);
3170 }
3171
3172 /*
3173  * Return, in the buffer contained in the provided uio structure,
3174  * the symbolic path referred to by ip.
3175  *
3176  *      IN:     ip      - inode of symbolic link
3177  *              uio     - structure to contain the link path.
3178  *              cr      - credentials of caller.
3179  *
3180  *      RETURN: 0 if success
3181  *              error code if failure
3182  *
3183  * Timestamps:
3184  *      ip - atime updated
3185  */
3186 int
3187 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3188 {
3189         (void) cr;
3190         znode_t         *zp = ITOZ(ip);
3191         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3192         int             error;
3193
3194         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3195                 return (error);
3196
3197         mutex_enter(&zp->z_lock);
3198         if (zp->z_is_sa)
3199                 error = sa_lookup_uio(zp->z_sa_hdl,
3200                     SA_ZPL_SYMLINK(zfsvfs), uio);
3201         else
3202                 error = zfs_sa_readlink(zp, uio);
3203         mutex_exit(&zp->z_lock);
3204
3205         zfs_exit(zfsvfs, FTAG);
3206         return (error);
3207 }
3208
3209 /*
3210  * Insert a new entry into directory tdzp referencing szp.
3211  *
3212  *      IN:     tdzp    - Directory to contain new entry.
3213  *              szp     - znode of new entry.
3214  *              name    - name of new entry.
3215  *              cr      - credentials of caller.
3216  *              flags   - case flags.
3217  *
3218  *      RETURN: 0 if success
3219  *              error code if failure
3220  *
3221  * Timestamps:
3222  *      tdzp - ctime|mtime updated
3223  *       szp - ctime updated
3224  */
3225 int
3226 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3227     int flags)
3228 {
3229         struct inode *sip = ZTOI(szp);
3230         znode_t         *tzp;
3231         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
3232         zilog_t         *zilog;
3233         zfs_dirlock_t   *dl;
3234         dmu_tx_t        *tx;
3235         int             error;
3236         int             zf = ZNEW;
3237         uint64_t        parent;
3238         uid_t           owner;
3239         boolean_t       waited = B_FALSE;
3240         boolean_t       is_tmpfile = 0;
3241         uint64_t        txg;
3242 #ifdef HAVE_TMPFILE
3243         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3244 #endif
3245         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3246
3247         if (name == NULL)
3248                 return (SET_ERROR(EINVAL));
3249
3250         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3251                 return (error);
3252         zilog = zfsvfs->z_log;
3253
3254         /*
3255          * POSIX dictates that we return EPERM here.
3256          * Better choices include ENOTSUP or EISDIR.
3257          */
3258         if (S_ISDIR(sip->i_mode)) {
3259                 zfs_exit(zfsvfs, FTAG);
3260                 return (SET_ERROR(EPERM));
3261         }
3262
3263         if ((error = zfs_verify_zp(szp)) != 0) {
3264                 zfs_exit(zfsvfs, FTAG);
3265                 return (error);
3266         }
3267
3268         /*
3269          * If we are using project inheritance, means if the directory has
3270          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3271          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3272          * such case, we only allow hard link creation in our tree when the
3273          * project IDs are the same.
3274          */
3275         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3276             tdzp->z_projid != szp->z_projid) {
3277                 zfs_exit(zfsvfs, FTAG);
3278                 return (SET_ERROR(EXDEV));
3279         }
3280
3281         /*
3282          * We check i_sb because snapshots and the ctldir must have different
3283          * super blocks.
3284          */
3285         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3286                 zfs_exit(zfsvfs, FTAG);
3287                 return (SET_ERROR(EXDEV));
3288         }
3289
3290         /* Prevent links to .zfs/shares files */
3291
3292         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3293             &parent, sizeof (uint64_t))) != 0) {
3294                 zfs_exit(zfsvfs, FTAG);
3295                 return (error);
3296         }
3297         if (parent == zfsvfs->z_shares_dir) {
3298                 zfs_exit(zfsvfs, FTAG);
3299                 return (SET_ERROR(EPERM));
3300         }
3301
3302         if (zfsvfs->z_utf8 && u8_validate(name,
3303             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3304                 zfs_exit(zfsvfs, FTAG);
3305                 return (SET_ERROR(EILSEQ));
3306         }
3307         if (flags & FIGNORECASE)
3308                 zf |= ZCILOOK;
3309
3310         /*
3311          * We do not support links between attributes and non-attributes
3312          * because of the potential security risk of creating links
3313          * into "normal" file space in order to circumvent restrictions
3314          * imposed in attribute space.
3315          */
3316         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3317                 zfs_exit(zfsvfs, FTAG);
3318                 return (SET_ERROR(EINVAL));
3319         }
3320
3321         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3322             cr, ZFS_OWNER);
3323         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3324                 zfs_exit(zfsvfs, FTAG);
3325                 return (SET_ERROR(EPERM));
3326         }
3327
3328         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3329                 zfs_exit(zfsvfs, FTAG);
3330                 return (error);
3331         }
3332
3333 top:
3334         /*
3335          * Attempt to lock directory; fail if entry already exists.
3336          */
3337         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3338         if (error) {
3339                 zfs_exit(zfsvfs, FTAG);
3340                 return (error);
3341         }
3342
3343         tx = dmu_tx_create(zfsvfs->z_os);
3344         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3345         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3346         if (is_tmpfile)
3347                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3348
3349         zfs_sa_upgrade_txholds(tx, szp);
3350         zfs_sa_upgrade_txholds(tx, tdzp);
3351         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3352         if (error) {
3353                 zfs_dirent_unlock(dl);
3354                 if (error == ERESTART) {
3355                         waited = B_TRUE;
3356                         dmu_tx_wait(tx);
3357                         dmu_tx_abort(tx);
3358                         goto top;
3359                 }
3360                 dmu_tx_abort(tx);
3361                 zfs_exit(zfsvfs, FTAG);
3362                 return (error);
3363         }
3364         /* unmark z_unlinked so zfs_link_create will not reject */
3365         if (is_tmpfile)
3366                 szp->z_unlinked = B_FALSE;
3367         error = zfs_link_create(dl, szp, tx, 0);
3368
3369         if (error == 0) {
3370                 uint64_t txtype = TX_LINK;
3371                 /*
3372                  * tmpfile is created to be in z_unlinkedobj, so remove it.
3373                  * Also, we don't log in ZIL, because all previous file
3374                  * operation on the tmpfile are ignored by ZIL. Instead we
3375                  * always wait for txg to sync to make sure all previous
3376                  * operation are sync safe.
3377                  */
3378                 if (is_tmpfile) {
3379                         VERIFY(zap_remove_int(zfsvfs->z_os,
3380                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
3381                 } else {
3382                         if (flags & FIGNORECASE)
3383                                 txtype |= TX_CI;
3384                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3385                 }
3386         } else if (is_tmpfile) {
3387                 /* restore z_unlinked since when linking failed */
3388                 szp->z_unlinked = B_TRUE;
3389         }
3390         txg = dmu_tx_get_txg(tx);
3391         dmu_tx_commit(tx);
3392
3393         zfs_dirent_unlock(dl);
3394
3395         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3396                 zil_commit(zilog, 0);
3397
3398         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
3399                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
3400
3401         zfs_znode_update_vfs(tdzp);
3402         zfs_znode_update_vfs(szp);
3403         zfs_exit(zfsvfs, FTAG);
3404         return (error);
3405 }
3406
3407 static void
3408 zfs_putpage_sync_commit_cb(void *arg)
3409 {
3410         struct page *pp = arg;
3411
3412         ClearPageError(pp);
3413         end_page_writeback(pp);
3414 }
3415
3416 static void
3417 zfs_putpage_async_commit_cb(void *arg)
3418 {
3419         struct page *pp = arg;
3420         znode_t *zp = ITOZ(pp->mapping->host);
3421
3422         ClearPageError(pp);
3423         end_page_writeback(pp);
3424         atomic_dec_32(&zp->z_async_writes_cnt);
3425 }
3426
3427 /*
3428  * Push a page out to disk, once the page is on stable storage the
3429  * registered commit callback will be run as notification of completion.
3430  *
3431  *      IN:     ip       - page mapped for inode.
3432  *              pp       - page to push (page is locked)
3433  *              wbc      - writeback control data
3434  *              for_sync - does the caller intend to wait synchronously for the
3435  *                         page writeback to complete?
3436  *
3437  *      RETURN: 0 if success
3438  *              error code if failure
3439  *
3440  * Timestamps:
3441  *      ip - ctime|mtime updated
3442  */
3443 int
3444 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3445     boolean_t for_sync)
3446 {
3447         znode_t         *zp = ITOZ(ip);
3448         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3449         loff_t          offset;
3450         loff_t          pgoff;
3451         unsigned int    pglen;
3452         dmu_tx_t        *tx;
3453         caddr_t         va;
3454         int             err = 0;
3455         uint64_t        mtime[2], ctime[2];
3456         sa_bulk_attr_t  bulk[3];
3457         int             cnt = 0;
3458         struct address_space *mapping;
3459
3460         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3461                 return (err);
3462
3463         ASSERT(PageLocked(pp));
3464
3465         pgoff = page_offset(pp);        /* Page byte-offset in file */
3466         offset = i_size_read(ip);       /* File length in bytes */
3467         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
3468             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3469
3470         /* Page is beyond end of file */
3471         if (pgoff >= offset) {
3472                 unlock_page(pp);
3473                 zfs_exit(zfsvfs, FTAG);
3474                 return (0);
3475         }
3476
3477         /* Truncate page length to end of file */
3478         if (pgoff + pglen > offset)
3479                 pglen = offset - pgoff;
3480
3481 #if 0
3482         /*
3483          * FIXME: Allow mmap writes past its quota.  The correct fix
3484          * is to register a page_mkwrite() handler to count the page
3485          * against its quota when it is about to be dirtied.
3486          */
3487         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3488             KUID_TO_SUID(ip->i_uid)) ||
3489             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3490             KGID_TO_SGID(ip->i_gid)) ||
3491             (zp->z_projid != ZFS_DEFAULT_PROJID &&
3492             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3493             zp->z_projid))) {
3494                 err = EDQUOT;
3495         }
3496 #endif
3497
3498         /*
3499          * The ordering here is critical and must adhere to the following
3500          * rules in order to avoid deadlocking in either zfs_read() or
3501          * zfs_free_range() due to a lock inversion.
3502          *
3503          * 1) The page must be unlocked prior to acquiring the range lock.
3504          *    This is critical because zfs_read() calls find_lock_page()
3505          *    which may block on the page lock while holding the range lock.
3506          *
3507          * 2) Before setting or clearing write back on a page the range lock
3508          *    must be held in order to prevent a lock inversion with the
3509          *    zfs_free_range() function.
3510          *
3511          * This presents a problem because upon entering this function the
3512          * page lock is already held.  To safely acquire the range lock the
3513          * page lock must be dropped.  This creates a window where another
3514          * process could truncate, invalidate, dirty, or write out the page.
3515          *
3516          * Therefore, after successfully reacquiring the range and page locks
3517          * the current page state is checked.  In the common case everything
3518          * will be as is expected and it can be written out.  However, if
3519          * the page state has changed it must be handled accordingly.
3520          */
3521         mapping = pp->mapping;
3522         redirty_page_for_writepage(wbc, pp);
3523         unlock_page(pp);
3524
3525         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3526             pgoff, pglen, RL_WRITER);
3527         lock_page(pp);
3528
3529         /* Page mapping changed or it was no longer dirty, we're done */
3530         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3531                 unlock_page(pp);
3532                 zfs_rangelock_exit(lr);
3533                 zfs_exit(zfsvfs, FTAG);
3534                 return (0);
3535         }
3536
3537         /* Another process started write block if required */
3538         if (PageWriteback(pp)) {
3539                 unlock_page(pp);
3540                 zfs_rangelock_exit(lr);
3541
3542                 if (wbc->sync_mode != WB_SYNC_NONE) {
3543                         /*
3544                          * Speed up any non-sync page writebacks since
3545                          * they may take several seconds to complete.
3546                          * Refer to the comment in zpl_fsync() (when
3547                          * HAVE_FSYNC_RANGE is defined) for details.
3548                          */
3549                         if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
3550                                 zil_commit(zfsvfs->z_log, zp->z_id);
3551                         }
3552
3553                         if (PageWriteback(pp))
3554 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3555                                 folio_wait_bit(page_folio(pp), PG_writeback);
3556 #else
3557                                 wait_on_page_bit(pp, PG_writeback);
3558 #endif
3559                 }
3560
3561                 zfs_exit(zfsvfs, FTAG);
3562                 return (0);
3563         }
3564
3565         /* Clear the dirty flag the required locks are held */
3566         if (!clear_page_dirty_for_io(pp)) {
3567                 unlock_page(pp);
3568                 zfs_rangelock_exit(lr);
3569                 zfs_exit(zfsvfs, FTAG);
3570                 return (0);
3571         }
3572
3573         /*
3574          * Counterpart for redirty_page_for_writepage() above.  This page
3575          * was in fact not skipped and should not be counted as if it were.
3576          */
3577         wbc->pages_skipped--;
3578         if (!for_sync)
3579                 atomic_inc_32(&zp->z_async_writes_cnt);
3580         set_page_writeback(pp);
3581         unlock_page(pp);
3582
3583         tx = dmu_tx_create(zfsvfs->z_os);
3584         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3585         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3586         zfs_sa_upgrade_txholds(tx, zp);
3587
3588         err = dmu_tx_assign(tx, TXG_NOWAIT);
3589         if (err != 0) {
3590                 if (err == ERESTART)
3591                         dmu_tx_wait(tx);
3592
3593                 dmu_tx_abort(tx);
3594 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3595                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3596 #else
3597                 __set_page_dirty_nobuffers(pp);
3598 #endif
3599                 ClearPageError(pp);
3600                 end_page_writeback(pp);
3601                 if (!for_sync)
3602                         atomic_dec_32(&zp->z_async_writes_cnt);
3603                 zfs_rangelock_exit(lr);
3604                 zfs_exit(zfsvfs, FTAG);
3605                 return (err);
3606         }
3607
3608         va = kmap(pp);
3609         ASSERT3U(pglen, <=, PAGE_SIZE);
3610         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3611         kunmap(pp);
3612
3613         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3614         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3615         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3616             &zp->z_pflags, 8);
3617
3618         /* Preserve the mtime and ctime provided by the inode */
3619         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3620         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3621         zp->z_atime_dirty = B_FALSE;
3622         zp->z_seq++;
3623
3624         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3625
3626         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
3627             for_sync ? zfs_putpage_sync_commit_cb :
3628             zfs_putpage_async_commit_cb, pp);
3629
3630         dmu_tx_commit(tx);
3631
3632         zfs_rangelock_exit(lr);
3633
3634         if (wbc->sync_mode != WB_SYNC_NONE) {
3635                 /*
3636                  * Note that this is rarely called under writepages(), because
3637                  * writepages() normally handles the entire commit for
3638                  * performance reasons.
3639                  */
3640                 zil_commit(zfsvfs->z_log, zp->z_id);
3641         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
3642                 /*
3643                  * If the caller does not intend to wait synchronously
3644                  * for this page writeback to complete and there are active
3645                  * synchronous calls on this file, do a commit so that
3646                  * the latter don't accidentally end up waiting for
3647                  * our writeback to complete. Refer to the comment in
3648                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
3649                  */
3650                 zil_commit(zfsvfs->z_log, zp->z_id);
3651         }
3652
3653         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3654
3655         zfs_exit(zfsvfs, FTAG);
3656         return (err);
3657 }
3658
3659 /*
3660  * Update the system attributes when the inode has been dirtied.  For the
3661  * moment we only update the mode, atime, mtime, and ctime.
3662  */
3663 int
3664 zfs_dirty_inode(struct inode *ip, int flags)
3665 {
3666         znode_t         *zp = ITOZ(ip);
3667         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3668         dmu_tx_t        *tx;
3669         uint64_t        mode, atime[2], mtime[2], ctime[2];
3670         sa_bulk_attr_t  bulk[4];
3671         int             error = 0;
3672         int             cnt = 0;
3673
3674         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3675                 return (0);
3676
3677         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3678                 return (error);
3679
3680 #ifdef I_DIRTY_TIME
3681         /*
3682          * This is the lazytime semantic introduced in Linux 4.0
3683          * This flag will only be called from update_time when lazytime is set.
3684          * (Note, I_DIRTY_SYNC will also set if not lazytime)
3685          * Fortunately mtime and ctime are managed within ZFS itself, so we
3686          * only need to dirty atime.
3687          */
3688         if (flags == I_DIRTY_TIME) {
3689                 zp->z_atime_dirty = B_TRUE;
3690                 goto out;
3691         }
3692 #endif
3693
3694         tx = dmu_tx_create(zfsvfs->z_os);
3695
3696         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3697         zfs_sa_upgrade_txholds(tx, zp);
3698
3699         error = dmu_tx_assign(tx, TXG_WAIT);
3700         if (error) {
3701                 dmu_tx_abort(tx);
3702                 goto out;
3703         }
3704
3705         mutex_enter(&zp->z_lock);
3706         zp->z_atime_dirty = B_FALSE;
3707
3708         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3709         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3710         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3711         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3712
3713         /* Preserve the mode, mtime and ctime provided by the inode */
3714         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3715         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3716         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3717         mode = ip->i_mode;
3718
3719         zp->z_mode = mode;
3720
3721         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3722         mutex_exit(&zp->z_lock);
3723
3724         dmu_tx_commit(tx);
3725 out:
3726         zfs_exit(zfsvfs, FTAG);
3727         return (error);
3728 }
3729
3730 void
3731 zfs_inactive(struct inode *ip)
3732 {
3733         znode_t *zp = ITOZ(ip);
3734         zfsvfs_t *zfsvfs = ITOZSB(ip);
3735         uint64_t atime[2];
3736         int error;
3737         int need_unlock = 0;
3738
3739         /* Only read lock if we haven't already write locked, e.g. rollback */
3740         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3741                 need_unlock = 1;
3742                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3743         }
3744         if (zp->z_sa_hdl == NULL) {
3745                 if (need_unlock)
3746                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3747                 return;
3748         }
3749
3750         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3751                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3752
3753                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3754                 zfs_sa_upgrade_txholds(tx, zp);
3755                 error = dmu_tx_assign(tx, TXG_WAIT);
3756                 if (error) {
3757                         dmu_tx_abort(tx);
3758                 } else {
3759                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3760                         mutex_enter(&zp->z_lock);
3761                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3762                             (void *)&atime, sizeof (atime), tx);
3763                         zp->z_atime_dirty = B_FALSE;
3764                         mutex_exit(&zp->z_lock);
3765                         dmu_tx_commit(tx);
3766                 }
3767         }
3768
3769         zfs_zinactive(zp);
3770         if (need_unlock)
3771                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3772 }
3773
3774 /*
3775  * Fill pages with data from the disk.
3776  */
3777 static int
3778 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
3779 {
3780         znode_t *zp = ITOZ(ip);
3781         zfsvfs_t *zfsvfs = ITOZSB(ip);
3782         objset_t *os;
3783         struct page *cur_pp;
3784         u_offset_t io_off, total;
3785         size_t io_len;
3786         loff_t i_size;
3787         unsigned page_idx;
3788         int err;
3789
3790         os = zfsvfs->z_os;
3791         io_len = nr_pages << PAGE_SHIFT;
3792         i_size = i_size_read(ip);
3793         io_off = page_offset(pl[0]);
3794
3795         if (io_off + io_len > i_size)
3796                 io_len = i_size - io_off;
3797
3798         /*
3799          * Iterate over list of pages and read each page individually.
3800          */
3801         page_idx = 0;
3802         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
3803                 caddr_t va;
3804
3805                 cur_pp = pl[page_idx++];
3806                 va = kmap(cur_pp);
3807                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
3808                     DMU_READ_PREFETCH);
3809                 kunmap(cur_pp);
3810                 if (err) {
3811                         /* convert checksum errors into IO errors */
3812                         if (err == ECKSUM)
3813                                 err = SET_ERROR(EIO);
3814                         return (err);
3815                 }
3816         }
3817
3818         return (0);
3819 }
3820
3821 /*
3822  * Uses zfs_fillpage to read data from the file and fill the pages.
3823  *
3824  *      IN:     ip       - inode of file to get data from.
3825  *              pl       - list of pages to read
3826  *              nr_pages - number of pages to read
3827  *
3828  *      RETURN: 0 on success, error code on failure.
3829  *
3830  * Timestamps:
3831  *      vp - atime updated
3832  */
3833 int
3834 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
3835 {
3836         znode_t  *zp  = ITOZ(ip);
3837         zfsvfs_t *zfsvfs = ITOZSB(ip);
3838         int      err;
3839
3840         if (pl == NULL)
3841                 return (0);
3842
3843         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3844                 return (err);
3845
3846         err = zfs_fillpage(ip, pl, nr_pages);
3847
3848         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
3849
3850         zfs_exit(zfsvfs, FTAG);
3851         return (err);
3852 }
3853
3854 /*
3855  * Check ZFS specific permissions to memory map a section of a file.
3856  *
3857  *      IN:     ip      - inode of the file to mmap
3858  *              off     - file offset
3859  *              addrp   - start address in memory region
3860  *              len     - length of memory region
3861  *              vm_flags- address flags
3862  *
3863  *      RETURN: 0 if success
3864  *              error code if failure
3865  */
3866 int
3867 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
3868     unsigned long vm_flags)
3869 {
3870         (void) addrp;
3871         znode_t  *zp = ITOZ(ip);
3872         zfsvfs_t *zfsvfs = ITOZSB(ip);
3873         int error;
3874
3875         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3876                 return (error);
3877
3878         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
3879             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
3880                 zfs_exit(zfsvfs, FTAG);
3881                 return (SET_ERROR(EPERM));
3882         }
3883
3884         if ((vm_flags & (VM_READ | VM_EXEC)) &&
3885             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
3886                 zfs_exit(zfsvfs, FTAG);
3887                 return (SET_ERROR(EACCES));
3888         }
3889
3890         if (off < 0 || len > MAXOFFSET_T - off) {
3891                 zfs_exit(zfsvfs, FTAG);
3892                 return (SET_ERROR(ENXIO));
3893         }
3894
3895         zfs_exit(zfsvfs, FTAG);
3896         return (0);
3897 }
3898
3899 /*
3900  * Free or allocate space in a file.  Currently, this function only
3901  * supports the `F_FREESP' command.  However, this command is somewhat
3902  * misnamed, as its functionality includes the ability to allocate as
3903  * well as free space.
3904  *
3905  *      IN:     zp      - znode of file to free data in.
3906  *              cmd     - action to take (only F_FREESP supported).
3907  *              bfp     - section of file to free/alloc.
3908  *              flag    - current file open mode flags.
3909  *              offset  - current file offset.
3910  *              cr      - credentials of caller.
3911  *
3912  *      RETURN: 0 on success, error code on failure.
3913  *
3914  * Timestamps:
3915  *      zp - ctime|mtime updated
3916  */
3917 int
3918 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3919     offset_t offset, cred_t *cr)
3920 {
3921         (void) offset;
3922         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3923         uint64_t        off, len;
3924         int             error;
3925
3926         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3927                 return (error);
3928
3929         if (cmd != F_FREESP) {
3930                 zfs_exit(zfsvfs, FTAG);
3931                 return (SET_ERROR(EINVAL));
3932         }
3933
3934         /*
3935          * Callers might not be able to detect properly that we are read-only,
3936          * so check it explicitly here.
3937          */
3938         if (zfs_is_readonly(zfsvfs)) {
3939                 zfs_exit(zfsvfs, FTAG);
3940                 return (SET_ERROR(EROFS));
3941         }
3942
3943         if (bfp->l_len < 0) {
3944                 zfs_exit(zfsvfs, FTAG);
3945                 return (SET_ERROR(EINVAL));
3946         }
3947
3948         /*
3949          * Permissions aren't checked on Solaris because on this OS
3950          * zfs_space() can only be called with an opened file handle.
3951          * On Linux we can get here through truncate_range() which
3952          * operates directly on inodes, so we need to check access rights.
3953          */
3954         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
3955                 zfs_exit(zfsvfs, FTAG);
3956                 return (error);
3957         }
3958
3959         off = bfp->l_start;
3960         len = bfp->l_len; /* 0 means from off to end of file */
3961
3962         error = zfs_freesp(zp, off, len, flag, TRUE);
3963
3964         zfs_exit(zfsvfs, FTAG);
3965         return (error);
3966 }
3967
3968 int
3969 zfs_fid(struct inode *ip, fid_t *fidp)
3970 {
3971         znode_t         *zp = ITOZ(ip);
3972         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3973         uint32_t        gen;
3974         uint64_t        gen64;
3975         uint64_t        object = zp->z_id;
3976         zfid_short_t    *zfid;
3977         int             size, i, error;
3978
3979         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
3980                 return (error);
3981
3982         if (fidp->fid_len < SHORT_FID_LEN) {
3983                 fidp->fid_len = SHORT_FID_LEN;
3984                 zfs_exit(zfsvfs, FTAG);
3985                 return (SET_ERROR(ENOSPC));
3986         }
3987
3988         if ((error = zfs_verify_zp(zp)) != 0) {
3989                 zfs_exit(zfsvfs, FTAG);
3990                 return (error);
3991         }
3992
3993         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3994             &gen64, sizeof (uint64_t))) != 0) {
3995                 zfs_exit(zfsvfs, FTAG);
3996                 return (error);
3997         }
3998
3999         gen = (uint32_t)gen64;
4000
4001         size = SHORT_FID_LEN;
4002
4003         zfid = (zfid_short_t *)fidp;
4004
4005         zfid->zf_len = size;
4006
4007         for (i = 0; i < sizeof (zfid->zf_object); i++)
4008                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4009
4010         /* Must have a non-zero generation number to distinguish from .zfs */
4011         if (gen == 0)
4012                 gen = 1;
4013         for (i = 0; i < sizeof (zfid->zf_gen); i++)
4014                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4015
4016         zfs_exit(zfsvfs, FTAG);
4017         return (0);
4018 }
4019
4020 #if defined(_KERNEL)
4021 EXPORT_SYMBOL(zfs_open);
4022 EXPORT_SYMBOL(zfs_close);
4023 EXPORT_SYMBOL(zfs_lookup);
4024 EXPORT_SYMBOL(zfs_create);
4025 EXPORT_SYMBOL(zfs_tmpfile);
4026 EXPORT_SYMBOL(zfs_remove);
4027 EXPORT_SYMBOL(zfs_mkdir);
4028 EXPORT_SYMBOL(zfs_rmdir);
4029 EXPORT_SYMBOL(zfs_readdir);
4030 EXPORT_SYMBOL(zfs_getattr_fast);
4031 EXPORT_SYMBOL(zfs_setattr);
4032 EXPORT_SYMBOL(zfs_rename);
4033 EXPORT_SYMBOL(zfs_symlink);
4034 EXPORT_SYMBOL(zfs_readlink);
4035 EXPORT_SYMBOL(zfs_link);
4036 EXPORT_SYMBOL(zfs_inactive);
4037 EXPORT_SYMBOL(zfs_space);
4038 EXPORT_SYMBOL(zfs_fid);
4039 EXPORT_SYMBOL(zfs_getpage);
4040 EXPORT_SYMBOL(zfs_putpage);
4041 EXPORT_SYMBOL(zfs_dirty_inode);
4042 EXPORT_SYMBOL(zfs_map);
4043
4044 /* CSTYLED */
4045 module_param(zfs_delete_blocks, ulong, 0644);
4046 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
4047
4048 #endif