module/os/linux/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/taskq.h>
  42 #include <sys/uio.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/atomic.h>
  45 #include <sys/pathname.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/errno.h>
  48 #include <sys/zfs_dir.h>
  49 #include <sys/zfs_acl.h>
  50 #include <sys/zfs_ioctl.h>
  51 #include <sys/fs/zfs.h>
  52 #include <sys/dmu.h>
  53 #include <sys/dmu_objset.h>
  54 #include <sys/spa.h>
  55 #include <sys/txg.h>
  56 #include <sys/dbuf.h>
  57 #include <sys/zap.h>
  58 #include <sys/sa.h>
  59 #include <sys/policy.h>
  60 #include <sys/sunddi.h>
  61 #include <sys/sid.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/zfs_quota.h>
  65 #include <sys/zfs_sa.h>
  66 #include <sys/zfs_vnops.h>
  67 #include <sys/zfs_rlock.h>
  68 #include <sys/cred.h>
  69 #include <sys/zpl.h>
  70 #include <sys/zil.h>
  71 #include <sys/sa_impl.h>
  72
  73 /*
  74  * Programming rules.
  75  *
  76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  77  * properly lock its in-core state, create a DMU transaction, do the work,
  78  * record this work in the intent log (ZIL), commit the DMU transaction,
  79  * and wait for the intent log to commit if it is a synchronous operation.
  80  * Moreover, the vnode ops must work in both normal and log replay context.
  81  * The ordering of events is important to avoid deadlocks and references
  82  * to freed memory.  The example below illustrates the following Big Rules:
  83  *
  84  *  (1) A check must be made in each zfs thread for a mounted file system.
  85  *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  86  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  87  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  88  *      can return EIO from the calling function.
  89  *
  90  *  (2) zrele() should always be the last thing except for zil_commit() (if
  91  *      necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
  92  *      last reference, the vnode/znode can be freed, so the zp may point to
  93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
  94  *      which may induce a lot of work -- pushing cached pages (which acquires
  95  *      range locks) and syncing out cached atime changes.  Third,
  96  *      zfs_zinactive() may require a new tx, which could deadlock the system
  97  *      if you were already holding one. This deadlock occurs because the tx
  98  *      currently being operated on prevents a txg from syncing, which
  99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
 100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
 101  *      is a synonym for zrele().
 102  *
 103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 104  *      as they can span dmu_tx_assign() calls.
 105  *
 106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 107  *      dmu_tx_assign().  This is critical because we don't want to block
 108  *      while holding locks.
 109  *
 110  *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 111  *      reduces lock contention and CPU usage when we must wait (note that if
 112  *      throughput is constrained by the storage, nearly every transaction
 113  *      must wait).
 114  *
 115  *      Note, in particular, that if a lock is sometimes acquired before
 116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 117  *      to use a non-blocking assign can deadlock the system.  The scenario:
 118  *
 119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 120  *      Thread B is in an already-assigned tx, and blocks for this lock.
 121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 122  *      forever, because the previous txg can't quiesce until B's tx commits.
 123  *
 124  *      If dmu_tx_assign() was called with TXG_NOWAIT and returns ERESTART,
 125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 127  *      to indicate that this operation has already called dmu_tx_wait().
 128  *      This will ensure that we don't retry forever, waiting a short bit
 129  *      each time.
 130  *
 131  *  (5) If the operation succeeded, generate the intent log entry for it
 132  *      before dropping locks.  This ensures that the ordering of events
 133  *      in the intent log matches the order in which they actually occurred.
 134  *      During ZIL replay the zfs_log_* functions will update the sequence
 135  *      number to indicate the zil transaction has replayed.
 136  *
 137  *  (6) At the end of each vnode op, the DMU tx must always commit,
 138  *      regardless of whether there were any errors.
 139  *
 140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 141  *      to ensure that synchronous semantics are provided when necessary.
 142  *
 143  * In general, this is how things should be ordered in each vnode op:
 144  *
 145  *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 146  * top:
 147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
 148  *      rw_enter(...);                  // grab any other locks you need
 149  *      tx = dmu_tx_create(...);        // get DMU tx
 150  *      dmu_tx_hold_*();                // hold each object you might modify
 151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 152  *      if (error) {
 153  *              rw_exit(...);           // drop locks
 154  *              zfs_dirent_unlock(dl);  // unlock directory entry
 155  *              zrele(...);             // release held znodes
 156  *              if (error == ERESTART) {
 157  *                      waited = B_TRUE;
 158  *                      dmu_tx_wait(tx);
 159  *                      dmu_tx_abort(tx);
 160  *                      goto top;
 161  *              }
 162  *              dmu_tx_abort(tx);       // abort DMU tx
 163  *              ZFS_EXIT(zfsvfs);       // finished in zfs
 164  *              return (error);         // really out of space
 165  *      }
 166  *      error = do_real_work();         // do whatever this VOP does
 167  *      if (error == 0)
 168  *              zfs_log_*(...);         // on success, make ZIL entry
 169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 170  *      rw_exit(...);                   // drop locks
 171  *      zfs_dirent_unlock(dl);          // unlock directory entry
 172  *      zrele(...);                     // release held znodes
 173  *      zil_commit(zilog, foid);        // synchronous when necessary
 174  *      ZFS_EXIT(zfsvfs);               // finished in zfs
 175  *      return (error);                 // done, report error
 176  */
 177 int
 178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 179 {
 180         (void) cr;
 181         znode_t *zp = ITOZ(ip);
 182         zfsvfs_t *zfsvfs = ITOZSB(ip);
 183
 184         ZFS_ENTER(zfsvfs);
 185         ZFS_VERIFY_ZP(zp);
 186
 187         /* Honor ZFS_APPENDONLY file attribute */
 188         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 189             ((flag & O_APPEND) == 0)) {
 190                 ZFS_EXIT(zfsvfs);
 191                 return (SET_ERROR(EPERM));
 192         }
 193
 194         /* Keep a count of the synchronous opens in the znode */
 195         if (flag & O_SYNC)
 196                 atomic_inc_32(&zp->z_sync_cnt);
 197
 198         ZFS_EXIT(zfsvfs);
 199         return (0);
 200 }
 201
 202 int
 203 zfs_close(struct inode *ip, int flag, cred_t *cr)
 204 {
 205         (void) cr;
 206         znode_t *zp = ITOZ(ip);
 207         zfsvfs_t *zfsvfs = ITOZSB(ip);
 208
 209         ZFS_ENTER(zfsvfs);
 210         ZFS_VERIFY_ZP(zp);
 211
 212         /* Decrement the synchronous opens in the znode */
 213         if (flag & O_SYNC)
 214                 atomic_dec_32(&zp->z_sync_cnt);
 215
 216         ZFS_EXIT(zfsvfs);
 217         return (0);
 218 }
 219
 220 #if defined(_KERNEL)
 221 /*
 222  * When a file is memory mapped, we must keep the IO data synchronized
 223  * between the DMU cache and the memory mapped pages.  What this means:
 224  *
 225  * On Write:    If we find a memory mapped page, we write to *both*
 226  *              the page and the dmu buffer.
 227  */
 228 void
 229 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 230 {
 231         struct inode *ip = ZTOI(zp);
 232         struct address_space *mp = ip->i_mapping;
 233         struct page *pp;
 234         uint64_t nbytes;
 235         int64_t off;
 236         void *pb;
 237
 238         off = start & (PAGE_SIZE-1);
 239         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 240                 nbytes = MIN(PAGE_SIZE - off, len);
 241
 242                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 243                 if (pp) {
 244                         if (mapping_writably_mapped(mp))
 245                                 flush_dcache_page(pp);
 246
 247                         pb = kmap(pp);
 248                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 249                             pb + off, DMU_READ_PREFETCH);
 250                         kunmap(pp);
 251
 252                         if (mapping_writably_mapped(mp))
 253                                 flush_dcache_page(pp);
 254
 255                         mark_page_accessed(pp);
 256                         SetPageUptodate(pp);
 257                         ClearPageError(pp);
 258                         unlock_page(pp);
 259                         put_page(pp);
 260                 }
 261
 262                 len -= nbytes;
 263                 off = 0;
 264         }
 265 }
 266
 267 /*
 268  * When a file is memory mapped, we must keep the IO data synchronized
 269  * between the DMU cache and the memory mapped pages.  What this means:
 270  *
 271  * On Read:     We "read" preferentially from memory mapped pages,
 272  *              else we default from the dmu buffer.
 273  *
 274  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 275  *       the file is memory mapped.
 276  */
 277 int
 278 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 279 {
 280         struct inode *ip = ZTOI(zp);
 281         struct address_space *mp = ip->i_mapping;
 282         struct page *pp;
 283         int64_t start, off;
 284         uint64_t bytes;
 285         int len = nbytes;
 286         int error = 0;
 287         void *pb;
 288
 289         start = uio->uio_loffset;
 290         off = start & (PAGE_SIZE-1);
 291         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 292                 bytes = MIN(PAGE_SIZE - off, len);
 293
 294                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
 295                 if (pp) {
 296                         ASSERT(PageUptodate(pp));
 297                         unlock_page(pp);
 298
 299                         pb = kmap(pp);
 300                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 301                         kunmap(pp);
 302
 303                         if (mapping_writably_mapped(mp))
 304                                 flush_dcache_page(pp);
 305
 306                         mark_page_accessed(pp);
 307                         put_page(pp);
 308                 } else {
 309                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 310                             uio, bytes);
 311                 }
 312
 313                 len -= bytes;
 314                 off = 0;
 315                 if (error)
 316                         break;
 317         }
 318         return (error);
 319 }
 320 #endif /* _KERNEL */
 321
     /*
      * File-scope setting, initialized to DMU_MAX_DELETEBLKCNT.
      * NOTE(review): presumably the block-count threshold above which a
      * file's removal is handled asynchronously -- its users are outside
      * this part of the file; confirm against them.
      */
 322 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 323
 324 /*
 325  * Write the bytes to a file.
 326  *
 327  *      IN:     zp      - znode of file to be written to
 328  *              data    - bytes to write
 329  *              len     - number of bytes to write
 330  *              pos     - offset to start writing at
 331  *
 332  *      OUT:    resid   - remaining bytes to write
 333  *
 334  *      RETURN: 0 if success
 335  *              positive error code if failure.  EIO is returned
 336  *              for a short write when residp isn't provided.
 337  *
 338  * Timestamps:
 339  *      zp - ctime|mtime updated if byte count > 0
 340  */
 341 int
 342 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 343     loff_t pos, size_t *residp)
 344 {
 345         fstrans_cookie_t cookie;
 346         int error;
 347
 348         struct iovec iov;
 349         iov.iov_base = (void *)data;
 350         iov.iov_len = len;
 351
 352         zfs_uio_t uio;
 353         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 354
 355         cookie = spl_fstrans_mark();
 356         error = zfs_write(zp, &uio, 0, kcred);
 357         spl_fstrans_unmark(cookie);
 358
 359         if (error == 0) {
 360                 if (residp != NULL)
 361                         *residp = zfs_uio_resid(&uio);
 362                 else if (zfs_uio_resid(&uio) != 0)
 363                         error = SET_ERROR(EIO);
 364         }
 365
 366         return (error);
 367 }
 368
 369 static void
 370 zfs_rele_async_task(void *arg)
 371 {
 372         iput(arg);
 373 }
 374
 375 void
 376 zfs_zrele_async(znode_t *zp)
 377 {
 378         struct inode *ip = ZTOI(zp);
 379         objset_t *os = ITOZSB(ip)->z_os;
 380
 381         ASSERT(atomic_read(&ip->i_count) > 0);
 382         ASSERT(os != NULL);
 383
 384         /*
 385          * If decrementing the count would put us at 0, we can't do it inline
 386          * here, because that would be synchronous. Instead, dispatch an iput
 387          * to run later.
 388          *
 389          * For more information on the dangers of a synchronous iput, see the
 390          * header comment of this file.
 391          */
 392         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 393                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 394                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 395         }
 396 }
 397
 398
 399 /*
 400  * Lookup an entry in a directory, or an extended attribute directory.
 401  * If it exists, return a held inode reference for it.
 402  *
 403  *      IN:     zdp     - znode of directory to search.
 404  *              nm      - name of entry to lookup.
 405  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 406  *              cr      - credentials of caller.
 407  *              direntflags - directory lookup flags
 408  *              realpnp - returned pathname.
 409  *
 410  *      OUT:    zpp     - znode of located entry, NULL if not found.
 411  *
 412  *      RETURN: 0 on success, error code on failure.
 413  *
 414  * Timestamps:
 415  *      NA
 416  */
 417 int
 418 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
 419     int *direntflags, pathname_t *realpnp)
 420 {
 421         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 422         int error = 0;
 423
 424         /*
 425          * Fast path lookup, however we must skip DNLC lookup
 426          * for case folding or normalizing lookups because the
 427          * DNLC code only stores the passed in name.  This means
 428          * creating 'a' and removing 'A' on a case insensitive
 429          * file system would work, but DNLC still thinks 'a'
 430          * exists and won't let you create it again on the next
 431          * pass through fast path.
 432          */
 433         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 434
 435                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 436                         return (SET_ERROR(ENOTDIR));
 437                 } else if (zdp->z_sa_hdl == NULL) {
 438                         return (SET_ERROR(EIO));
 439                 }
 440
 441                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 442                         error = zfs_fastaccesschk_execute(zdp, cr);
 443                         if (!error) {
 444                                 *zpp = zdp;
 445                                 zhold(*zpp);
 446                                 return (0);
 447                         }
 448                         return (error);
 449                 }
 450         }
 451
 452         ZFS_ENTER(zfsvfs);
 453         ZFS_VERIFY_ZP(zdp);
 454
 455         *zpp = NULL;
 456
 457         if (flags & LOOKUP_XATTR) {
 458                 /*
 459                  * We don't allow recursive attributes..
 460                  * Maybe someday we will.
 461                  */
 462                 if (zdp->z_pflags & ZFS_XATTR) {
 463                         ZFS_EXIT(zfsvfs);
 464                         return (SET_ERROR(EINVAL));
 465                 }
 466
 467                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 468                         ZFS_EXIT(zfsvfs);
 469                         return (error);
 470                 }
 471
 472                 /*
 473                  * Do we have permission to get into attribute directory?
 474                  */
 475
 476                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 477                     B_TRUE, cr))) {
 478                         zrele(*zpp);
 479                         *zpp = NULL;
 480                 }
 481
 482                 ZFS_EXIT(zfsvfs);
 483                 return (error);
 484         }
 485
 486         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 487                 ZFS_EXIT(zfsvfs);
 488                 return (SET_ERROR(ENOTDIR));
 489         }
 490
 491         /*
 492          * Check accessibility of directory.
 493          */
 494
 495         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
 496                 ZFS_EXIT(zfsvfs);
 497                 return (error);
 498         }
 499
 500         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 501             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 502                 ZFS_EXIT(zfsvfs);
 503                 return (SET_ERROR(EILSEQ));
 504         }
 505
 506         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 507         if ((error == 0) && (*zpp))
 508                 zfs_znode_update_vfs(*zpp);
 509
 510         ZFS_EXIT(zfsvfs);
 511         return (error);
 512 }
 513
 514 /*
 515  * Attempt to create a new entry in a directory.  If the entry
 516  * already exists, truncate the file if permissible, else return
 517  * an error.  Return the znode of the created or trunc'd file.
      *
      * An empty name refers to the directory itself.  When the DMU tx
      * cannot be assigned without waiting (ERESTART), the directory entry
      * lock is dropped, dmu_tx_wait() is called and the whole operation
      * restarts from "top", reusing the ACL ids generated on the earlier
      * pass.
 518  *
 519  *      IN:     dzp     - znode of directory to put new file entry in.
 520  *              name    - name of new file entry.
 521  *              vap     - attributes of new file.
 522  *              excl    - flag indicating exclusive or non-exclusive mode.
      *                        If set and the entry exists, EEXIST is returned.
 523  *              mode    - mode to open file with.
 524  *              cr      - credentials of caller.
 525  *              flag    - file flag (FIGNORECASE and O_APPEND are examined).
 526  *              vsecp   - ACL to be set
 527  *
 528  *      OUT:    zpp     - znode of created or trunc'd entry.
      *                        Set to NULL at "top"; only filled in on success.
 529  *
 530  *      RETURN: 0 on success, error code on failure.
 531  *
 532  * Timestamps:
 533  *      dzp - ctime|mtime updated if new entry created
 534  *       zp - ctime|mtime always, atime if new
 535  */
 536 int
 537 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
 538     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
 539 {
 540         znode_t         *zp;
 541         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 542         zilog_t         *zilog;
 543         objset_t        *os;
 544         zfs_dirlock_t   *dl;
 545         dmu_tx_t        *tx;
 546         int             error;
 547         uid_t           uid;
 548         gid_t           gid;
 549         zfs_acl_ids_t   acl_ids;
 550         boolean_t       fuid_dirtied;
             /* B_TRUE while acl_ids holds ids that must be freed or reused */
 551         boolean_t       have_acl = B_FALSE;
             /* B_TRUE once dmu_tx_wait() has been called for this operation */
 552         boolean_t       waited = B_FALSE;
 553
 554         /*
 555          * If we have an ephemeral id, ACL, or XVATTR then
 556          * make sure file system is at proper version
 557          */
 558
 559         gid = crgetgid(cr);
 560         uid = crgetuid(cr);
 561
 562         if (zfsvfs->z_use_fuids == B_FALSE &&
 563             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 564                 return (SET_ERROR(EINVAL));
 565
 566         if (name == NULL)
 567                 return (SET_ERROR(EINVAL));
 568
 569         ZFS_ENTER(zfsvfs);
 570         ZFS_VERIFY_ZP(dzp);
 571         os = zfsvfs->z_os;
 572         zilog = zfsvfs->z_log;
 573
 574         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 575             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 576                 ZFS_EXIT(zfsvfs);
 577                 return (SET_ERROR(EILSEQ));
 578         }
 579
 580         if (vap->va_mask & ATTR_XVATTR) {
 581                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 582                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 583                         ZFS_EXIT(zfsvfs);
 584                         return (error);
 585                 }
 586         }
 587
             /*
              * Retry point: re-entered from the ERESTART handling below after
              * the directory entry lock has been dropped and dmu_tx_wait()
              * has returned.
              */
 588 top:
 589         *zpp = NULL;
 590         if (*name == '\0') {
 591                 /*
 592                  * Null component name refers to the directory itself.
 593                  */
 594                 zhold(dzp);
 595                 zp = dzp;
 596                 dl = NULL;
 597                 error = 0;
 598         } else {
 599                 /* possible igrab(zp) */
 600                 int zflg = 0;
 601
 602                 if (flag & FIGNORECASE)
 603                         zflg |= ZCILOOK;
 604
 605                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 606                     NULL, NULL);
 607                 if (error) {
 608                         if (have_acl)
 609                                 zfs_acl_ids_free(&acl_ids);
                             /* Any failure to lock ".." is reported as EISDIR */
 610                         if (strcmp(name, "..") == 0)
 611                                 error = SET_ERROR(EISDIR);
 612                         ZFS_EXIT(zfsvfs);
 613                         return (error);
 614                 }
 615         }
 616
 617         if (zp == NULL) {
 618                 uint64_t txtype;
 619                 uint64_t projid = ZFS_DEFAULT_PROJID;
 620
 621                 /*
 622                  * Create a new file object and update the directory
 623                  * to reference it.
 624                  */
 625                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 626                         if (have_acl)
 627                                 zfs_acl_ids_free(&acl_ids);
 628                         goto out;
 629                 }
 630
 631                 /*
 632                  * We only support the creation of regular files in
 633                  * extended attribute directories.
 634                  */
 635
 636                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
 637                         if (have_acl)
 638                                 zfs_acl_ids_free(&acl_ids);
 639                         error = SET_ERROR(EINVAL);
 640                         goto out;
 641                 }
 642
                     /*
                      * The ACL ids are generated only once; on a retry from
                      * "top" the ids from the earlier pass are reused.
                      */
 643                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 644                     cr, vsecp, &acl_ids)) != 0)
 645                         goto out;
 646                 have_acl = B_TRUE;
 647
 648                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 649                         projid = zfs_inherit_projid(dzp);
 650                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 651                         zfs_acl_ids_free(&acl_ids);
 652                         error = SET_ERROR(EDQUOT);
 653                         goto out;
 654                 }
 655
 656                 tx = dmu_tx_create(os);
 657
 658                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 659                     ZFS_SA_BASE_ATTR_SIZE);
 660
                     /*
                      * Sample z_fuid_dirty once so that the tx hold taken here
                      * and the zfs_fuid_sync() call after the node is created
                      * act on the same decision.
                      */
 661                 fuid_dirtied = zfsvfs->z_fuid_dirty;
 662                 if (fuid_dirtied)
 663                         zfs_fuid_txhold(zfsvfs, tx);
 664                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 665                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 666                 if (!zfsvfs->z_use_sa &&
 667                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 668                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 669                             0, acl_ids.z_aclp->z_acl_bytes);
 670                 }
 671
 672                 error = dmu_tx_assign(tx,
 673                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 674                 if (error) {
 675                         zfs_dirent_unlock(dl);
 676                         if (error == ERESTART) {
                                     /*
                                      * acl_ids is kept (have_acl stays
                                      * B_TRUE) so the retry can reuse it.
                                      */
 677                                 waited = B_TRUE;
 678                                 dmu_tx_wait(tx);
 679                                 dmu_tx_abort(tx);
 680                                 goto top;
 681                         }
 682                         zfs_acl_ids_free(&acl_ids);
 683                         dmu_tx_abort(tx);
 684                         ZFS_EXIT(zfsvfs);
 685                         return (error);
 686                 }
 687                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 688
 689                 error = zfs_link_create(dl, zp, tx, ZNEW);
 690                 if (error != 0) {
 691                         /*
 692                          * Since, we failed to add the directory entry for it,
 693                          * delete the newly created dnode.
 694                          */
 695                         zfs_znode_delete(zp, tx);
 696                         remove_inode_hash(ZTOI(zp));
 697                         zfs_acl_ids_free(&acl_ids);
 698                         dmu_tx_commit(tx);
 699                         goto out;
 700                 }
 701
 702                 if (fuid_dirtied)
 703                         zfs_fuid_sync(zfsvfs, tx);
 704
                     /* Log the create in the ZIL before committing the tx */
 705                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 706                 if (flag & FIGNORECASE)
 707                         txtype |= TX_CI;
 708                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 709                     vsecp, acl_ids.z_fuidp, vap);
 710                 zfs_acl_ids_free(&acl_ids);
 711                 dmu_tx_commit(tx);
 712         } else {
 713                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
 714
                     /*
                      * ACL ids left over from an earlier pass are not needed
                      * when the entry already exists.
                      */
 715                 if (have_acl)
 716                         zfs_acl_ids_free(&acl_ids);
 717                 have_acl = B_FALSE;
 718
 719                 /*
 720                  * A directory entry already exists for this name.
 721                  */
 722                 /*
 723                  * Can't truncate an existing file if in exclusive mode.
 724                  */
 725                 if (excl) {
 726                         error = SET_ERROR(EEXIST);
 727                         goto out;
 728                 }
 729                 /*
 730                  * Can't open a directory for writing.
 731                  */
 732                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
 733                         error = SET_ERROR(EISDIR);
 734                         goto out;
 735                 }
 736                 /*
 737                  * Verify requested access to file.
 738                  */
 739                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
 740                         goto out;
 741                 }
 742
                     /* Bump the directory's z_seq while holding its z_lock */
 743                 mutex_enter(&dzp->z_lock);
 744                 dzp->z_seq++;
 745                 mutex_exit(&dzp->z_lock);
 746
 747                 /*
 748                  * Truncate regular files if requested.
 749                  */
 750                 if (S_ISREG(ZTOI(zp)->i_mode) &&
 751                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
 752                         /* we can't hold any locks when calling zfs_freesp() */
 753                         if (dl) {
 754                                 zfs_dirent_unlock(dl);
 755                                 dl = NULL;
 756                         }
 757                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
 758                 }
 759         }
 760 out:
 761
             /*
              * dl is NULL here when the name was empty or when the lock was
              * already dropped ahead of zfs_freesp().
              */
 762         if (dl)
 763                 zfs_dirent_unlock(dl);
 764
             /*
              * On failure release the hold on zp, if there is one; *zpp keeps
              * the NULL stored at "top".  On success the hold is passed to
              * the caller through *zpp.
              */
 765         if (error) {
 766                 if (zp)
 767                         zrele(zp);
 768         } else {
 769                 zfs_znode_update_vfs(dzp);
 770                 zfs_znode_update_vfs(zp);
 771                 *zpp = zp;
 772         }
 773
             /* With sync=always, commit the intent log before returning */
 774         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 775                 zil_commit(zilog, 0);
 776
 777         ZFS_EXIT(zfsvfs);
 778         return (error);
 779 }
 780
 781 int
 782 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
 783     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
 784 {
 785         (void) excl, (void) mode, (void) flag;
 786         znode_t         *zp = NULL, *dzp = ITOZ(dip);
 787         zfsvfs_t        *zfsvfs = ITOZSB(dip);
 788         objset_t        *os;
 789         dmu_tx_t        *tx;
 790         int             error;
 791         uid_t           uid;
 792         gid_t           gid;
 793         zfs_acl_ids_t   acl_ids;
 794         uint64_t        projid = ZFS_DEFAULT_PROJID;
 795         boolean_t       fuid_dirtied;
 796         boolean_t       have_acl = B_FALSE;
 797         boolean_t       waited = B_FALSE;
 798
 799         /*
 800          * If we have an ephemeral id, ACL, or XVATTR then
 801          * make sure file system is at proper version
 802          */
 803
 804         gid = crgetgid(cr);
 805         uid = crgetuid(cr);
 806
 807         if (zfsvfs->z_use_fuids == B_FALSE &&
 808             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 809                 return (SET_ERROR(EINVAL));
 810
 811         ZFS_ENTER(zfsvfs);
 812         ZFS_VERIFY_ZP(dzp);
 813         os = zfsvfs->z_os;
 814
 815         if (vap->va_mask & ATTR_XVATTR) {
 816                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 817                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 818                         ZFS_EXIT(zfsvfs);
 819                         return (error);
 820                 }
 821         }
 822
 823 top:
 824         *ipp = NULL;
 825
 826         /*
 827          * Create a new file object and update the directory
 828          * to reference it.
 829          */
 830         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
 831                 if (have_acl)
 832                         zfs_acl_ids_free(&acl_ids);
 833                 goto out;
 834         }
 835
 836         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 837             cr, vsecp, &acl_ids)) != 0)
 838                 goto out;
 839         have_acl = B_TRUE;
 840
 841         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 842                 projid = zfs_inherit_projid(dzp);
 843         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 844                 zfs_acl_ids_free(&acl_ids);
 845                 error = SET_ERROR(EDQUOT);
 846                 goto out;
 847         }
 848
 849         tx = dmu_tx_create(os);
 850
 851         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 852             ZFS_SA_BASE_ATTR_SIZE);
 853         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 854
 855         fuid_dirtied = zfsvfs->z_fuid_dirty;
 856         if (fuid_dirtied)
 857                 zfs_fuid_txhold(zfsvfs, tx);
 858         if (!zfsvfs->z_use_sa &&
 859             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 860                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 861                     0, acl_ids.z_aclp->z_acl_bytes);
 862         }
 863         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 864         if (error) {
 865                 if (error == ERESTART) {
 866                         waited = B_TRUE;
 867                         dmu_tx_wait(tx);
 868                         dmu_tx_abort(tx);
 869                         goto top;
 870                 }
 871                 zfs_acl_ids_free(&acl_ids);
 872                 dmu_tx_abort(tx);
 873                 ZFS_EXIT(zfsvfs);
 874                 return (error);
 875         }
 876         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 877
 878         if (fuid_dirtied)
 879                 zfs_fuid_sync(zfsvfs, tx);
 880
 881         /* Add to unlinked set */
 882         zp->z_unlinked = B_TRUE;
 883         zfs_unlinked_add(zp, tx);
 884         zfs_acl_ids_free(&acl_ids);
 885         dmu_tx_commit(tx);
 886 out:
 887
 888         if (error) {
 889                 if (zp)
 890                         zrele(zp);
 891         } else {
 892                 zfs_znode_update_vfs(dzp);
 893                 zfs_znode_update_vfs(zp);
 894                 *ipp = ZTOI(zp);
 895         }
 896
 897         ZFS_EXIT(zfsvfs);
 898         return (error);
 899 }
 900
 901 /*
 902  * Remove an entry from a directory.
 903  *
 904  *      IN:     dzp     - znode of directory to remove entry from.
 905  *              name    - name of entry to remove.
 906  *              cr      - credentials of caller.
 907  *              flags   - case flags.
 908  *
 909  *      RETURN: 0 if success
 910  *              error code if failure
 911  *
 912  * Timestamps:
 913  *      dzp - ctime|mtime
 914  *       ip - ctime (if nlink > 0)
 915  */
 916
 917 static uint64_t null_xattr = 0;
 918
 919 int
 920 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 921 {
 922         znode_t         *zp;
 923         znode_t         *xzp;
 924         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 925         zilog_t         *zilog;
 926         uint64_t        acl_obj, xattr_obj;
 927         uint64_t        xattr_obj_unlinked = 0;
 928         uint64_t        obj = 0;
 929         uint64_t        links;
 930         zfs_dirlock_t   *dl;
 931         dmu_tx_t        *tx;
 932         boolean_t       may_delete_now, delete_now = FALSE;
 933         boolean_t       unlinked, toobig = FALSE;
 934         uint64_t        txtype;
 935         pathname_t      *realnmp = NULL;
 936         pathname_t      realnm;
 937         int             error;
 938         int             zflg = ZEXISTS;
 939         boolean_t       waited = B_FALSE;
 940
 941         if (name == NULL)
 942                 return (SET_ERROR(EINVAL));
 943
 944         ZFS_ENTER(zfsvfs);
 945         ZFS_VERIFY_ZP(dzp);
 946         zilog = zfsvfs->z_log;
 947
 948         if (flags & FIGNORECASE) {
 949                 zflg |= ZCILOOK;
 950                 pn_alloc(&realnm);
 951                 realnmp = &realnm;
 952         }
 953
 954 top:
 955         xattr_obj = 0;
 956         xzp = NULL;
 957         /*
 958          * Attempt to lock directory; fail if entry doesn't exist.
 959          */
 960         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 961             NULL, realnmp))) {
 962                 if (realnmp)
 963                         pn_free(realnmp);
 964                 ZFS_EXIT(zfsvfs);
 965                 return (error);
 966         }
 967
 968         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
 969                 goto out;
 970         }
 971
 972         /*
 973          * Need to use rmdir for removing directories.
 974          */
 975         if (S_ISDIR(ZTOI(zp)->i_mode)) {
 976                 error = SET_ERROR(EPERM);
 977                 goto out;
 978         }
 979
 980         mutex_enter(&zp->z_lock);
 981         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
 982             !(zp->z_is_mapped);
 983         mutex_exit(&zp->z_lock);
 984
 985         /*
 986          * We may delete the znode now, or we may put it in the unlinked set;
 987          * it depends on whether we're the last link, and on whether there are
 988          * other holds on the inode.  So we dmu_tx_hold() the right things to
 989          * allow for either case.
 990          */
 991         obj = zp->z_id;
 992         tx = dmu_tx_create(zfsvfs->z_os);
 993         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 994         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 995         zfs_sa_upgrade_txholds(tx, zp);
 996         zfs_sa_upgrade_txholds(tx, dzp);
 997         if (may_delete_now) {
 998                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
 999                 /* if the file is too big, only hold_free a token amount */
1000                 dmu_tx_hold_free(tx, zp->z_id, 0,
1001                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1002         }
1003
1004         /* are there any extended attributes? */
1005         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1006             &xattr_obj, sizeof (xattr_obj));
1007         if (error == 0 && xattr_obj) {
1008                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1009                 ASSERT0(error);
1010                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1011                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1012         }
1013
1014         mutex_enter(&zp->z_lock);
1015         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1016                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1017         mutex_exit(&zp->z_lock);
1018
1019         /* charge as an update -- would be nice not to charge at all */
1020         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1021
1022         /*
1023          * Mark this transaction as typically resulting in a net free of space
1024          */
1025         dmu_tx_mark_netfree(tx);
1026
1027         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1028         if (error) {
1029                 zfs_dirent_unlock(dl);
1030                 if (error == ERESTART) {
1031                         waited = B_TRUE;
1032                         dmu_tx_wait(tx);
1033                         dmu_tx_abort(tx);
1034                         zrele(zp);
1035                         if (xzp)
1036                                 zrele(xzp);
1037                         goto top;
1038                 }
1039                 if (realnmp)
1040                         pn_free(realnmp);
1041                 dmu_tx_abort(tx);
1042                 zrele(zp);
1043                 if (xzp)
1044                         zrele(xzp);
1045                 ZFS_EXIT(zfsvfs);
1046                 return (error);
1047         }
1048
1049         /*
1050          * Remove the directory entry.
1051          */
1052         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1053
1054         if (error) {
1055                 dmu_tx_commit(tx);
1056                 goto out;
1057         }
1058
1059         if (unlinked) {
1060                 /*
1061                  * Hold z_lock so that we can make sure that the ACL obj
1062                  * hasn't changed.  Could have been deleted due to
1063                  * zfs_sa_upgrade().
1064                  */
1065                 mutex_enter(&zp->z_lock);
1066                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1067                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1068                 delete_now = may_delete_now && !toobig &&
1069                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
1070                     !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
1071                     zfs_external_acl(zp) == acl_obj;
1072         }
1073
1074         if (delete_now) {
1075                 if (xattr_obj_unlinked) {
1076                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
1077                         mutex_enter(&xzp->z_lock);
1078                         xzp->z_unlinked = B_TRUE;
1079                         clear_nlink(ZTOI(xzp));
1080                         links = 0;
1081                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1082                             &links, sizeof (links), tx);
1083                         ASSERT3U(error,  ==,  0);
1084                         mutex_exit(&xzp->z_lock);
1085                         zfs_unlinked_add(xzp, tx);
1086
1087                         if (zp->z_is_sa)
1088                                 error = sa_remove(zp->z_sa_hdl,
1089                                     SA_ZPL_XATTR(zfsvfs), tx);
1090                         else
1091                                 error = sa_update(zp->z_sa_hdl,
1092                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
1093                                     sizeof (uint64_t), tx);
1094                         ASSERT0(error);
1095                 }
1096                 /*
1097                  * Add to the unlinked set because a new reference could be
1098                  * taken concurrently resulting in a deferred destruction.
1099                  */
1100                 zfs_unlinked_add(zp, tx);
1101                 mutex_exit(&zp->z_lock);
1102         } else if (unlinked) {
1103                 mutex_exit(&zp->z_lock);
1104                 zfs_unlinked_add(zp, tx);
1105         }
1106
1107         txtype = TX_REMOVE;
1108         if (flags & FIGNORECASE)
1109                 txtype |= TX_CI;
1110         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1111
1112         dmu_tx_commit(tx);
1113 out:
1114         if (realnmp)
1115                 pn_free(realnmp);
1116
1117         zfs_dirent_unlock(dl);
1118         zfs_znode_update_vfs(dzp);
1119         zfs_znode_update_vfs(zp);
1120
1121         if (delete_now)
1122                 zrele(zp);
1123         else
1124                 zfs_zrele_async(zp);
1125
1126         if (xzp) {
1127                 zfs_znode_update_vfs(xzp);
1128                 zfs_zrele_async(xzp);
1129         }
1130
1131         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1132                 zil_commit(zilog, 0);
1133
1134         ZFS_EXIT(zfsvfs);
1135         return (error);
1136 }
1137
1138 /*
1139  * Create a new directory and insert it into dzp using the name
1140  * provided.  Return a pointer to the inserted directory.
1141  *
1142  *      IN:     dzp     - znode of directory to add subdir to.
1143  *              dirname - name of new directory.
1144  *              vap     - attributes of new directory.
1145  *              cr      - credentials of caller.
1146  *              flags   - case flags.
1147  *              vsecp   - ACL to be set
1148  *
1149  *      OUT:    zpp     - znode of created directory.
1150  *
1151  *      RETURN: 0 if success
1152  *              error code if failure
1153  *
1154  * Timestamps:
1155  *      dzp - ctime|mtime updated
1156  *      zpp - ctime|mtime|atime updated
1157  */
1158 int
1159 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
1160     cred_t *cr, int flags, vsecattr_t *vsecp)
1161 {
1162         znode_t         *zp;
1163         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1164         zilog_t         *zilog;
1165         zfs_dirlock_t   *dl;
1166         uint64_t        txtype;
1167         dmu_tx_t        *tx;
1168         int             error;
1169         int             zf = ZNEW;
1170         uid_t           uid;
1171         gid_t           gid = crgetgid(cr);
1172         zfs_acl_ids_t   acl_ids;
1173         boolean_t       fuid_dirtied;
1174         boolean_t       waited = B_FALSE;
1175
1176         ASSERT(S_ISDIR(vap->va_mode));
1177
1178         /*
1179          * If we have an ephemeral id, ACL, or XVATTR then
1180          * make sure file system is at proper version
1181          */
1182
1183         uid = crgetuid(cr);
1184         if (zfsvfs->z_use_fuids == B_FALSE &&
1185             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1186                 return (SET_ERROR(EINVAL));
1187
1188         if (dirname == NULL)
1189                 return (SET_ERROR(EINVAL));
1190
1191         ZFS_ENTER(zfsvfs);
1192         ZFS_VERIFY_ZP(dzp);
1193         zilog = zfsvfs->z_log;
1194
1195         if (dzp->z_pflags & ZFS_XATTR) {
1196                 ZFS_EXIT(zfsvfs);
1197                 return (SET_ERROR(EINVAL));
1198         }
1199
1200         if (zfsvfs->z_utf8 && u8_validate(dirname,
1201             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1202                 ZFS_EXIT(zfsvfs);
1203                 return (SET_ERROR(EILSEQ));
1204         }
1205         if (flags & FIGNORECASE)
1206                 zf |= ZCILOOK;
1207
1208         if (vap->va_mask & ATTR_XVATTR) {
1209                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1210                     crgetuid(cr), cr, vap->va_mode)) != 0) {
1211                         ZFS_EXIT(zfsvfs);
1212                         return (error);
1213                 }
1214         }
1215
1216         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1217             vsecp, &acl_ids)) != 0) {
1218                 ZFS_EXIT(zfsvfs);
1219                 return (error);
1220         }
1221         /*
1222          * First make sure the new directory doesn't exist.
1223          *
1224          * Existence is checked first to make sure we don't return
1225          * EACCES instead of EEXIST which can cause some applications
1226          * to fail.
1227          */
1228 top:
1229         *zpp = NULL;
1230
1231         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1232             NULL, NULL))) {
1233                 zfs_acl_ids_free(&acl_ids);
1234                 ZFS_EXIT(zfsvfs);
1235                 return (error);
1236         }
1237
1238         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
1239                 zfs_acl_ids_free(&acl_ids);
1240                 zfs_dirent_unlock(dl);
1241                 ZFS_EXIT(zfsvfs);
1242                 return (error);
1243         }
1244
1245         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1246                 zfs_acl_ids_free(&acl_ids);
1247                 zfs_dirent_unlock(dl);
1248                 ZFS_EXIT(zfsvfs);
1249                 return (SET_ERROR(EDQUOT));
1250         }
1251
1252         /*
1253          * Add a new entry to the directory.
1254          */
1255         tx = dmu_tx_create(zfsvfs->z_os);
1256         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1257         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1258         fuid_dirtied = zfsvfs->z_fuid_dirty;
1259         if (fuid_dirtied)
1260                 zfs_fuid_txhold(zfsvfs, tx);
1261         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1262                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1263                     acl_ids.z_aclp->z_acl_bytes);
1264         }
1265
1266         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1267             ZFS_SA_BASE_ATTR_SIZE);
1268
1269         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1270         if (error) {
1271                 zfs_dirent_unlock(dl);
1272                 if (error == ERESTART) {
1273                         waited = B_TRUE;
1274                         dmu_tx_wait(tx);
1275                         dmu_tx_abort(tx);
1276                         goto top;
1277                 }
1278                 zfs_acl_ids_free(&acl_ids);
1279                 dmu_tx_abort(tx);
1280                 ZFS_EXIT(zfsvfs);
1281                 return (error);
1282         }
1283
1284         /*
1285          * Create new node.
1286          */
1287         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1288
1289         /*
1290          * Now put new name in parent dir.
1291          */
1292         error = zfs_link_create(dl, zp, tx, ZNEW);
1293         if (error != 0) {
1294                 zfs_znode_delete(zp, tx);
1295                 remove_inode_hash(ZTOI(zp));
1296                 goto out;
1297         }
1298
1299         if (fuid_dirtied)
1300                 zfs_fuid_sync(zfsvfs, tx);
1301
1302         *zpp = zp;
1303
1304         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1305         if (flags & FIGNORECASE)
1306                 txtype |= TX_CI;
1307         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1308             acl_ids.z_fuidp, vap);
1309
1310 out:
1311         zfs_acl_ids_free(&acl_ids);
1312
1313         dmu_tx_commit(tx);
1314
1315         zfs_dirent_unlock(dl);
1316
1317         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1318                 zil_commit(zilog, 0);
1319
1320         if (error != 0) {
1321                 zrele(zp);
1322         } else {
1323                 zfs_znode_update_vfs(dzp);
1324                 zfs_znode_update_vfs(zp);
1325         }
1326         ZFS_EXIT(zfsvfs);
1327         return (error);
1328 }
1329
1330 /*
1331  * Remove a directory subdir entry.  If the current working
1332  * directory is the same as the subdir to be removed, the
1333  * remove will fail.
1334  *
1335  *      IN:     dzp     - znode of directory to remove from.
1336  *              name    - name of directory to be removed.
1337  *              cwd     - inode of current working directory.
1338  *              cr      - credentials of caller.
1339  *              flags   - case flags
1340  *
1341  *      RETURN: 0 on success, error code on failure.
1342  *
1343  * Timestamps:
1344  *      dzp - ctime|mtime updated
1345  */
1346 int
1347 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
1348     int flags)
1349 {
1350         znode_t         *zp;
1351         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1352         zilog_t         *zilog;
1353         zfs_dirlock_t   *dl;
1354         dmu_tx_t        *tx;
1355         int             error;
1356         int             zflg = ZEXISTS;
1357         boolean_t       waited = B_FALSE;
1358
1359         if (name == NULL)
1360                 return (SET_ERROR(EINVAL));
1361
1362         ZFS_ENTER(zfsvfs);
1363         ZFS_VERIFY_ZP(dzp);
1364         zilog = zfsvfs->z_log;
1365
1366         if (flags & FIGNORECASE)
1367                 zflg |= ZCILOOK;
1368 top:
1369         zp = NULL;
1370
1371         /*
1372          * Attempt to lock directory; fail if entry doesn't exist.
1373          */
1374         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1375             NULL, NULL))) {
1376                 ZFS_EXIT(zfsvfs);
1377                 return (error);
1378         }
1379
1380         if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
1381                 goto out;
1382         }
1383
1384         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
1385                 error = SET_ERROR(ENOTDIR);
1386                 goto out;
1387         }
1388
1389         if (zp == cwd) {
1390                 error = SET_ERROR(EINVAL);
1391                 goto out;
1392         }
1393
1394         /*
1395          * Grab a lock on the directory to make sure that no one is
1396          * trying to add (or lookup) entries while we are removing it.
1397          */
1398         rw_enter(&zp->z_name_lock, RW_WRITER);
1399
1400         /*
1401          * Grab a lock on the parent pointer to make sure we play well
1402          * with the treewalk and directory rename code.
1403          */
1404         rw_enter(&zp->z_parent_lock, RW_WRITER);
1405
1406         tx = dmu_tx_create(zfsvfs->z_os);
1407         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1408         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1409         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1410         zfs_sa_upgrade_txholds(tx, zp);
1411         zfs_sa_upgrade_txholds(tx, dzp);
1412         dmu_tx_mark_netfree(tx);
1413         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1414         if (error) {
1415                 rw_exit(&zp->z_parent_lock);
1416                 rw_exit(&zp->z_name_lock);
1417                 zfs_dirent_unlock(dl);
1418                 if (error == ERESTART) {
1419                         waited = B_TRUE;
1420                         dmu_tx_wait(tx);
1421                         dmu_tx_abort(tx);
1422                         zrele(zp);
1423                         goto top;
1424                 }
1425                 dmu_tx_abort(tx);
1426                 zrele(zp);
1427                 ZFS_EXIT(zfsvfs);
1428                 return (error);
1429         }
1430
1431         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1432
1433         if (error == 0) {
1434                 uint64_t txtype = TX_RMDIR;
1435                 if (flags & FIGNORECASE)
1436                         txtype |= TX_CI;
1437                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
1438                     B_FALSE);
1439         }
1440
1441         dmu_tx_commit(tx);
1442
1443         rw_exit(&zp->z_parent_lock);
1444         rw_exit(&zp->z_name_lock);
1445 out:
1446         zfs_dirent_unlock(dl);
1447
1448         zfs_znode_update_vfs(dzp);
1449         zfs_znode_update_vfs(zp);
1450         zrele(zp);
1451
1452         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1453                 zil_commit(zilog, 0);
1454
1455         ZFS_EXIT(zfsvfs);
1456         return (error);
1457 }
1458
1459 /*
1460  * Read directory entries from the given directory cursor position and emit
1461  * name and position for each entry.
1462  *
1463  *      IN:     ip      - inode of directory to read.
1464  *              ctx     - directory entry context.
1465  *              cr      - credentials of caller.
1466  *
1467  *      RETURN: 0 if success
1468  *              error code if failure
1469  *
1470  * Timestamps:
1471  *      ip - atime updated
1472  *
1473  * Note that the low 4 bits of the cookie returned by zap is always zero.
1474  * This allows us to use the low range for "special" directory entries:
1475  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1476  * we use the offset 2 for the '.zfs' directory.
1477  */
1478 int
1479 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
1480 {
1481         (void) cr;
1482         znode_t         *zp = ITOZ(ip);
1483         zfsvfs_t        *zfsvfs = ITOZSB(ip);
1484         objset_t        *os;
1485         zap_cursor_t    zc;
1486         zap_attribute_t zap;
1487         int             error;
1488         uint8_t         prefetch;
1489         uint8_t         type;
1490         int             done = 0;
1491         uint64_t        parent;
1492         uint64_t        offset; /* must be unsigned; checks for < 1 */
1493
1494         ZFS_ENTER(zfsvfs);
1495         ZFS_VERIFY_ZP(zp);
1496
1497         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1498             &parent, sizeof (parent))) != 0)
1499                 goto out;
1500
1501         /*
1502          * Quit if directory has been removed (posix)
1503          */
1504         if (zp->z_unlinked)
1505                 goto out;
1506
1507         error = 0;
1508         os = zfsvfs->z_os;
1509         offset = ctx->pos;
1510         prefetch = zp->z_zn_prefetch;
1511
1512         /*
1513          * Initialize the iterator cursor.
1514          */
1515         if (offset <= 3) {
1516                 /*
1517                  * Start iteration from the beginning of the directory.
1518                  */
1519                 zap_cursor_init(&zc, os, zp->z_id);
1520         } else {
1521                 /*
1522                  * The offset is a serialized cursor.
1523                  */
1524                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1525         }
1526
1527         /*
1528          * Transform to file-system independent format
1529          */
1530         while (!done) {
1531                 uint64_t objnum;
1532                 /*
1533                  * Special case `.', `..', and `.zfs'.
1534                  */
1535                 if (offset == 0) {
1536                         (void) strcpy(zap.za_name, ".");
1537                         zap.za_normalization_conflict = 0;
1538                         objnum = zp->z_id;
1539                         type = DT_DIR;
1540                 } else if (offset == 1) {
1541                         (void) strcpy(zap.za_name, "..");
1542                         zap.za_normalization_conflict = 0;
1543                         objnum = parent;
1544                         type = DT_DIR;
1545                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1546                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1547                         zap.za_normalization_conflict = 0;
1548                         objnum = ZFSCTL_INO_ROOT;
1549                         type = DT_DIR;
1550                 } else {
1551                         /*
1552                          * Grab next entry.
1553                          */
1554                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
1555                                 if (error == ENOENT)
1556                                         break;
1557                                 else
1558                                         goto update;
1559                         }
1560
1561                         /*
1562                          * Allow multiple entries provided the first entry is
1563                          * the object id.  Non-zpl consumers may safely make
1564                          * use of the additional space.
1565                          *
1566                          * XXX: This should be a feature flag for compatibility
1567                          */
1568                         if (zap.za_integer_length != 8 ||
1569                             zap.za_num_integers == 0) {
1570                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1571                                     "entry, obj = %lld, offset = %lld, "
1572                                     "length = %d, num = %lld\n",
1573                                     (u_longlong_t)zp->z_id,
1574                                     (u_longlong_t)offset,
1575                                     zap.za_integer_length,
1576                                     (u_longlong_t)zap.za_num_integers);
1577                                 error = SET_ERROR(ENXIO);
1578                                 goto update;
1579                         }
1580
1581                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1582                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1583                 }
1584
1585                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
1586                     objnum, type);
1587                 if (done)
1588                         break;
1589
1590                 /* Prefetch znode */
1591                 if (prefetch) {
1592                         dmu_prefetch(os, objnum, 0, 0, 0,
1593                             ZIO_PRIORITY_SYNC_READ);
1594                 }
1595
1596                 /*
1597                  * Move to the next entry, fill in the previous offset.
1598                  */
1599                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1600                         zap_cursor_advance(&zc);
1601                         offset = zap_cursor_serialize(&zc);
1602                 } else {
1603                         offset += 1;
1604                 }
1605                 ctx->pos = offset;
1606         }
1607         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1608
1609 update:
1610         zap_cursor_fini(&zc);
1611         if (error == ENOENT)
1612                 error = 0;
1613 out:
1614         ZFS_EXIT(zfsvfs);
1615
1616         return (error);
1617 }
1618
1619 /*
1620  * Get the basic file attributes and place them in the provided kstat
1621  * structure.  The inode is assumed to be the authoritative source
1622  * for most of the attributes.  However, the znode currently has the
1623  * authoritative atime, blksize, and block count.
1624  *
1625  *      IN:     ip      - inode of file.
1626  *
1627  *      OUT:    sp      - kstat values.
1628  *
1629  *      RETURN: 0 (always succeeds)
1630  */
1631 int
1632 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
1633     struct kstat *sp)
1634 {
1635         znode_t *zp = ITOZ(ip);
1636         zfsvfs_t *zfsvfs = ITOZSB(ip);
1637         uint32_t blksize;
1638         u_longlong_t nblocks;
1639
1640         ZFS_ENTER(zfsvfs);
1641         ZFS_VERIFY_ZP(zp);
1642
1643         mutex_enter(&zp->z_lock);
1644
1645         zpl_generic_fillattr(user_ns, ip, sp);
1646         /*
1647          * +1 link count for root inode with visible '.zfs' directory.
1648          */
1649         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
1650                 if (sp->nlink < ZFS_LINK_MAX)
1651                         sp->nlink++;
1652
1653         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1654         sp->blksize = blksize;
1655         sp->blocks = nblocks;
1656
1657         if (unlikely(zp->z_blksz == 0)) {
1658                 /*
1659                  * Block size hasn't been set; suggest maximal I/O transfers.
1660                  */
1661                 sp->blksize = zfsvfs->z_max_blksz;
1662         }
1663
1664         mutex_exit(&zp->z_lock);
1665
1666         /*
1667          * Required to prevent NFS client from detecting different inode
1668          * numbers of snapshot root dentry before and after snapshot mount.
1669          */
1670         if (zfsvfs->z_issnap) {
1671                 if (ip->i_sb->s_root->d_inode == ip)
1672                         sp->ino = ZFSCTL_INO_SNAPDIRS -
1673                             dmu_objset_id(zfsvfs->z_os);
1674         }
1675
1676         ZFS_EXIT(zfsvfs);
1677
1678         return (0);
1679 }
1680
/*
 * When changing a file's user/group/project, we need to handle not only
 * the main object that is assigned to the file directly, but also the
 * objects that the file uses via its hidden xattr directory.
 *
 * Because the xattr directory may contain many EA entries, it may be
 * impossible to change all of them in the same transaction that changes
 * the main object's user/group/project attributes.  Instead, we have to
 * change them one by one in multiple independent transactions.  This may
 * not be a good solution, but we have no better idea yet.
 */
1692 static int
1693 zfs_setattr_dir(znode_t *dzp)
1694 {
1695         struct inode    *dxip = ZTOI(dzp);
1696         struct inode    *xip = NULL;
1697         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
1698         objset_t        *os = zfsvfs->z_os;
1699         zap_cursor_t    zc;
1700         zap_attribute_t zap;
1701         zfs_dirlock_t   *dl;
1702         znode_t         *zp = NULL;
1703         dmu_tx_t        *tx = NULL;
1704         uint64_t        uid, gid;
1705         sa_bulk_attr_t  bulk[4];
1706         int             count;
1707         int             err;
1708
1709         zap_cursor_init(&zc, os, dzp->z_id);
1710         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
1711                 count = 0;
1712                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
1713                         err = ENXIO;
1714                         break;
1715                 }
1716
1717                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
1718                     ZEXISTS, NULL, NULL);
1719                 if (err == ENOENT)
1720                         goto next;
1721                 if (err)
1722                         break;
1723
1724                 xip = ZTOI(zp);
1725                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
1726                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
1727                     zp->z_projid == dzp->z_projid)
1728                         goto next;
1729
1730                 tx = dmu_tx_create(os);
1731                 if (!(zp->z_pflags & ZFS_PROJID))
1732                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1733                 else
1734                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1735
1736                 err = dmu_tx_assign(tx, TXG_WAIT);
1737                 if (err)
1738                         break;
1739
1740                 mutex_enter(&dzp->z_lock);
1741
1742                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
1743                         xip->i_uid = dxip->i_uid;
1744                         uid = zfs_uid_read(dxip);
1745                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1746                             &uid, sizeof (uid));
1747                 }
1748
1749                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
1750                         xip->i_gid = dxip->i_gid;
1751                         gid = zfs_gid_read(dxip);
1752                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1753                             &gid, sizeof (gid));
1754                 }
1755
1756                 if (zp->z_projid != dzp->z_projid) {
1757                         if (!(zp->z_pflags & ZFS_PROJID)) {
1758                                 zp->z_pflags |= ZFS_PROJID;
1759                                 SA_ADD_BULK_ATTR(bulk, count,
1760                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
1761                                     sizeof (zp->z_pflags));
1762                         }
1763
1764                         zp->z_projid = dzp->z_projid;
1765                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
1766                             NULL, &zp->z_projid, sizeof (zp->z_projid));
1767                 }
1768
1769                 mutex_exit(&dzp->z_lock);
1770
1771                 if (likely(count > 0)) {
1772                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1773                         dmu_tx_commit(tx);
1774                 } else {
1775                         dmu_tx_abort(tx);
1776                 }
1777                 tx = NULL;
1778                 if (err != 0 && err != ENOENT)
1779                         break;
1780
1781 next:
1782                 if (zp) {
1783                         zrele(zp);
1784                         zp = NULL;
1785                         zfs_dirent_unlock(dl);
1786                 }
1787                 zap_cursor_advance(&zc);
1788         }
1789
1790         if (tx)
1791                 dmu_tx_abort(tx);
1792         if (zp) {
1793                 zrele(zp);
1794                 zfs_dirent_unlock(dl);
1795         }
1796         zap_cursor_fini(&zc);
1797
1798         return (err == ENOENT ? 0 : err);
1799 }
1800
1801 /*
1802  * Set the file attributes to the values contained in the
1803  * vattr structure.
1804  *
1805  *      IN:     zp      - znode of file to be modified.
1806  *              vap     - new attribute values.
1807  *                        If ATTR_XVATTR set, then optional attrs are being set
1808  *              flags   - ATTR_UTIME set if non-default time values provided.
1809  *                      - ATTR_NOACLCHECK (CIFS context only).
1810  *              cr      - credentials of caller.
1811  *
1812  *      RETURN: 0 if success
1813  *              error code if failure
1814  *
1815  * Timestamps:
1816  *      ip - ctime updated, mtime updated if size changed.
1817  */
1818 int
1819 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
1820 {
1821         struct inode    *ip;
1822         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
1823         objset_t        *os = zfsvfs->z_os;
1824         zilog_t         *zilog;
1825         dmu_tx_t        *tx;
1826         vattr_t         oldva;
1827         xvattr_t        *tmpxvattr;
1828         uint_t          mask = vap->va_mask;
1829         uint_t          saved_mask = 0;
1830         int             trim_mask = 0;
1831         uint64_t        new_mode;
1832         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
1833         uint64_t        xattr_obj;
1834         uint64_t        mtime[2], ctime[2], atime[2];
1835         uint64_t        projid = ZFS_INVALID_PROJID;
1836         znode_t         *attrzp;
1837         int             need_policy = FALSE;
1838         int             err, err2 = 0;
1839         zfs_fuid_info_t *fuidp = NULL;
1840         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1841         xoptattr_t      *xoap;
1842         zfs_acl_t       *aclp;
1843         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1844         boolean_t       fuid_dirtied = B_FALSE;
1845         boolean_t       handle_eadir = B_FALSE;
1846         sa_bulk_attr_t  *bulk, *xattr_bulk;
1847         int             count = 0, xattr_count = 0, bulks = 8;
1848
1849         if (mask == 0)
1850                 return (0);
1851
1852         ZFS_ENTER(zfsvfs);
1853         ZFS_VERIFY_ZP(zp);
1854         ip = ZTOI(zp);
1855
1856         /*
1857          * If this is a xvattr_t, then get a pointer to the structure of
1858          * optional attributes.  If this is NULL, then we have a vattr_t.
1859          */
1860         xoap = xva_getxoptattr(xvap);
1861         if (xoap != NULL && (mask & ATTR_XVATTR)) {
1862                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
1863                         if (!dmu_objset_projectquota_enabled(os) ||
1864                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
1865                                 ZFS_EXIT(zfsvfs);
1866                                 return (SET_ERROR(ENOTSUP));
1867                         }
1868
1869                         projid = xoap->xoa_projid;
1870                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
1871                                 ZFS_EXIT(zfsvfs);
1872                                 return (SET_ERROR(EINVAL));
1873                         }
1874
1875                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
1876                                 projid = ZFS_INVALID_PROJID;
1877                         else
1878                                 need_policy = TRUE;
1879                 }
1880
1881                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
1882                     (xoap->xoa_projinherit !=
1883                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
1884                     (!dmu_objset_projectquota_enabled(os) ||
1885                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
1886                         ZFS_EXIT(zfsvfs);
1887                         return (SET_ERROR(ENOTSUP));
1888                 }
1889         }
1890
1891         zilog = zfsvfs->z_log;
1892
1893         /*
1894          * Make sure that if we have ephemeral uid/gid or xvattr specified
1895          * that file system is at proper version level
1896          */
1897
1898         if (zfsvfs->z_use_fuids == B_FALSE &&
1899             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
1900             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
1901             (mask & ATTR_XVATTR))) {
1902                 ZFS_EXIT(zfsvfs);
1903                 return (SET_ERROR(EINVAL));
1904         }
1905
1906         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
1907                 ZFS_EXIT(zfsvfs);
1908                 return (SET_ERROR(EISDIR));
1909         }
1910
1911         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
1912                 ZFS_EXIT(zfsvfs);
1913                 return (SET_ERROR(EINVAL));
1914         }
1915
1916         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
1917         xva_init(tmpxvattr);
1918
1919         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1920         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
1921
1922         /*
1923          * Immutable files can only alter immutable bit and atime
1924          */
1925         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
1926             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
1927             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
1928                 err = SET_ERROR(EPERM);
1929                 goto out3;
1930         }
1931
1932         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
1933                 err = SET_ERROR(EPERM);
1934                 goto out3;
1935         }
1936
1937         /*
1938          * Verify timestamps doesn't overflow 32 bits.
1939          * ZFS can handle large timestamps, but 32bit syscalls can't
1940          * handle times greater than 2039.  This check should be removed
1941          * once large timestamps are fully supported.
1942          */
1943         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
1944                 if (((mask & ATTR_ATIME) &&
1945                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
1946                     ((mask & ATTR_MTIME) &&
1947                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
1948                         err = SET_ERROR(EOVERFLOW);
1949                         goto out3;
1950                 }
1951         }
1952
1953 top:
1954         attrzp = NULL;
1955         aclp = NULL;
1956
1957         /* Can this be moved to before the top label? */
1958         if (zfs_is_readonly(zfsvfs)) {
1959                 err = SET_ERROR(EROFS);
1960                 goto out3;
1961         }
1962
1963         /*
1964          * First validate permissions
1965          */
1966
1967         if (mask & ATTR_SIZE) {
1968                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
1969                 if (err)
1970                         goto out3;
1971
1972                 /*
1973                  * XXX - Note, we are not providing any open
1974                  * mode flags here (like FNDELAY), so we may
1975                  * block if there are locks present... this
1976                  * should be addressed in openat().
1977                  */
1978                 /* XXX - would it be OK to generate a log record here? */
1979                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
1980                 if (err)
1981                         goto out3;
1982         }
1983
1984         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
1985             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
1986             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
1987             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
1988             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
1989             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
1990             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
1991             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
1992                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
1993                     skipaclchk, cr);
1994         }
1995
1996         if (mask & (ATTR_UID|ATTR_GID)) {
1997                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
1998                 int     take_owner;
1999                 int     take_group;
2000
2001                 /*
2002                  * NOTE: even if a new mode is being set,
2003                  * we may clear S_ISUID/S_ISGID bits.
2004                  */
2005
2006                 if (!(mask & ATTR_MODE))
2007                         vap->va_mode = zp->z_mode;
2008
2009                 /*
2010                  * Take ownership or chgrp to group we are a member of
2011                  */
2012
2013                 take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
2014                 take_group = (mask & ATTR_GID) &&
2015                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2016
2017                 /*
2018                  * If both ATTR_UID and ATTR_GID are set then take_owner and
2019                  * take_group must both be set in order to allow taking
2020                  * ownership.
2021                  *
2022                  * Otherwise, send the check through secpolicy_vnode_setattr()
2023                  *
2024                  */
2025
2026                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
2027                     take_owner && take_group) ||
2028                     ((idmask == ATTR_UID) && take_owner) ||
2029                     ((idmask == ATTR_GID) && take_group)) {
2030                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2031                             skipaclchk, cr) == 0) {
2032                                 /*
2033                                  * Remove setuid/setgid for non-privileged users
2034                                  */
2035                                 (void) secpolicy_setid_clear(vap, cr);
2036                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
2037                         } else {
2038                                 need_policy =  TRUE;
2039                         }
2040                 } else {
2041                         need_policy =  TRUE;
2042                 }
2043         }
2044
2045         mutex_enter(&zp->z_lock);
2046         oldva.va_mode = zp->z_mode;
2047         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2048         if (mask & ATTR_XVATTR) {
2049                 /*
2050                  * Update xvattr mask to include only those attributes
2051                  * that are actually changing.
2052                  *
2053                  * the bits will be restored prior to actually setting
2054                  * the attributes so the caller thinks they were set.
2055                  */
2056                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2057                         if (xoap->xoa_appendonly !=
2058                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2059                                 need_policy = TRUE;
2060                         } else {
2061                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2062                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
2063                         }
2064                 }
2065
2066                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2067                         if (xoap->xoa_projinherit !=
2068                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2069                                 need_policy = TRUE;
2070                         } else {
2071                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2072                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
2073                         }
2074                 }
2075
2076                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2077                         if (xoap->xoa_nounlink !=
2078                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2079                                 need_policy = TRUE;
2080                         } else {
2081                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2082                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
2083                         }
2084                 }
2085
2086                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2087                         if (xoap->xoa_immutable !=
2088                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2089                                 need_policy = TRUE;
2090                         } else {
2091                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2092                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
2093                         }
2094                 }
2095
2096                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2097                         if (xoap->xoa_nodump !=
2098                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2099                                 need_policy = TRUE;
2100                         } else {
2101                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2102                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
2103                         }
2104                 }
2105
2106                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2107                         if (xoap->xoa_av_modified !=
2108                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2109                                 need_policy = TRUE;
2110                         } else {
2111                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2112                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
2113                         }
2114                 }
2115
2116                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2117                         if ((!S_ISREG(ip->i_mode) &&
2118                             xoap->xoa_av_quarantined) ||
2119                             xoap->xoa_av_quarantined !=
2120                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2121                                 need_policy = TRUE;
2122                         } else {
2123                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2124                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
2125                         }
2126                 }
2127
2128                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2129                         mutex_exit(&zp->z_lock);
2130                         err = SET_ERROR(EPERM);
2131                         goto out3;
2132                 }
2133
2134                 if (need_policy == FALSE &&
2135                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2136                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2137                         need_policy = TRUE;
2138                 }
2139         }
2140
2141         mutex_exit(&zp->z_lock);
2142
2143         if (mask & ATTR_MODE) {
2144                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2145                         err = secpolicy_setid_setsticky_clear(ip, vap,
2146                             &oldva, cr);
2147                         if (err)
2148                                 goto out3;
2149
2150                         trim_mask |= ATTR_MODE;
2151                 } else {
2152                         need_policy = TRUE;
2153                 }
2154         }
2155
2156         if (need_policy) {
2157                 /*
2158                  * If trim_mask is set then take ownership
2159                  * has been granted or write_acl is present and user
2160                  * has the ability to modify mode.  In that case remove
2161                  * UID|GID and or MODE from mask so that
2162                  * secpolicy_vnode_setattr() doesn't revoke it.
2163                  */
2164
2165                 if (trim_mask) {
2166                         saved_mask = vap->va_mask;
2167                         vap->va_mask &= ~trim_mask;
2168                 }
2169                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
2170                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2171                 if (err)
2172                         goto out3;
2173
2174                 if (trim_mask)
2175                         vap->va_mask |= saved_mask;
2176         }
2177
2178         /*
2179          * secpolicy_vnode_setattr, or take ownership may have
2180          * changed va_mask
2181          */
2182         mask = vap->va_mask;
2183
2184         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
2185                 handle_eadir = B_TRUE;
2186                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2187                     &xattr_obj, sizeof (xattr_obj));
2188
2189                 if (err == 0 && xattr_obj) {
2190                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
2191                         if (err)
2192                                 goto out2;
2193                 }
2194                 if (mask & ATTR_UID) {
2195                         new_kuid = zfs_fuid_create(zfsvfs,
2196                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2197                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
2198                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2199                             new_kuid)) {
2200                                 if (attrzp)
2201                                         zrele(attrzp);
2202                                 err = SET_ERROR(EDQUOT);
2203                                 goto out2;
2204                         }
2205                 }
2206
2207                 if (mask & ATTR_GID) {
2208                         new_kgid = zfs_fuid_create(zfsvfs,
2209                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
2210                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
2211                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2212                             new_kgid)) {
2213                                 if (attrzp)
2214                                         zrele(attrzp);
2215                                 err = SET_ERROR(EDQUOT);
2216                                 goto out2;
2217                         }
2218                 }
2219
2220                 if (projid != ZFS_INVALID_PROJID &&
2221                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2222                         if (attrzp)
2223                                 zrele(attrzp);
2224                         err = EDQUOT;
2225                         goto out2;
2226                 }
2227         }
2228         tx = dmu_tx_create(os);
2229
2230         if (mask & ATTR_MODE) {
2231                 uint64_t pmode = zp->z_mode;
2232                 uint64_t acl_obj;
2233                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2234
2235                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
2236                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2237                         err = EPERM;
2238                         goto out;
2239                 }
2240
2241                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2242                         goto out;
2243
2244                 mutex_enter(&zp->z_lock);
2245                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2246                         /*
2247                          * Are we upgrading ACL from old V0 format
2248                          * to V1 format?
2249                          */
2250                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2251                             zfs_znode_acl_version(zp) ==
2252                             ZFS_ACL_VERSION_INITIAL) {
2253                                 dmu_tx_hold_free(tx, acl_obj, 0,
2254                                     DMU_OBJECT_END);
2255                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2256                                     0, aclp->z_acl_bytes);
2257                         } else {
2258                                 dmu_tx_hold_write(tx, acl_obj, 0,
2259                                     aclp->z_acl_bytes);
2260                         }
2261                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2262                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2263                             0, aclp->z_acl_bytes);
2264                 }
2265                 mutex_exit(&zp->z_lock);
2266                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2267         } else {
2268                 if (((mask & ATTR_XVATTR) &&
2269                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2270                     (projid != ZFS_INVALID_PROJID &&
2271                     !(zp->z_pflags & ZFS_PROJID)))
2272                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2273                 else
2274                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2275         }
2276
2277         if (attrzp) {
2278                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2279         }
2280
2281         fuid_dirtied = zfsvfs->z_fuid_dirty;
2282         if (fuid_dirtied)
2283                 zfs_fuid_txhold(zfsvfs, tx);
2284
2285         zfs_sa_upgrade_txholds(tx, zp);
2286
2287         err = dmu_tx_assign(tx, TXG_WAIT);
2288         if (err)
2289                 goto out;
2290
2291         count = 0;
2292         /*
2293          * Set each attribute requested.
2294          * We group settings according to the locks they need to acquire.
2295          *
2296          * Note: you cannot set ctime directly, although it will be
2297          * updated as a side-effect of calling this function.
2298          */
2299
2300         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2301                 /*
2302                  * For the existed object that is upgraded from old system,
2303                  * its on-disk layout has no slot for the project ID attribute.
2304                  * But quota accounting logic needs to access related slots by
2305                  * offset directly. So we need to adjust old objects' layout
2306                  * to make the project ID to some unified and fixed offset.
2307                  */
2308                 if (attrzp)
2309                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2310                 if (err == 0)
2311                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2312
2313                 if (unlikely(err == EEXIST))
2314                         err = 0;
2315                 else if (err != 0)
2316                         goto out;
2317                 else
2318                         projid = ZFS_INVALID_PROJID;
2319         }
2320
2321         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2322                 mutex_enter(&zp->z_acl_lock);
2323         mutex_enter(&zp->z_lock);
2324
2325         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2326             &zp->z_pflags, sizeof (zp->z_pflags));
2327
2328         if (attrzp) {
2329                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2330                         mutex_enter(&attrzp->z_acl_lock);
2331                 mutex_enter(&attrzp->z_lock);
2332                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2333                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2334                     sizeof (attrzp->z_pflags));
2335                 if (projid != ZFS_INVALID_PROJID) {
2336                         attrzp->z_projid = projid;
2337                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2338                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2339                             sizeof (attrzp->z_projid));
2340                 }
2341         }
2342
2343         if (mask & (ATTR_UID|ATTR_GID)) {
2344
2345                 if (mask & ATTR_UID) {
2346                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
2347                         new_uid = zfs_uid_read(ZTOI(zp));
2348                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2349                             &new_uid, sizeof (new_uid));
2350                         if (attrzp) {
2351                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2352                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2353                                     sizeof (new_uid));
2354                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
2355                         }
2356                 }
2357
2358                 if (mask & ATTR_GID) {
2359                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
2360                         new_gid = zfs_gid_read(ZTOI(zp));
2361                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2362                             NULL, &new_gid, sizeof (new_gid));
2363                         if (attrzp) {
2364                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2365                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2366                                     sizeof (new_gid));
2367                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
2368                         }
2369                 }
2370                 if (!(mask & ATTR_MODE)) {
2371                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2372                             NULL, &new_mode, sizeof (new_mode));
2373                         new_mode = zp->z_mode;
2374                 }
2375                 err = zfs_acl_chown_setattr(zp);
2376                 ASSERT(err == 0);
2377                 if (attrzp) {
2378                         err = zfs_acl_chown_setattr(attrzp);
2379                         ASSERT(err == 0);
2380                 }
2381         }
2382
2383         if (mask & ATTR_MODE) {
2384                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2385                     &new_mode, sizeof (new_mode));
2386                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
2387                 ASSERT3P(aclp, !=, NULL);
2388                 err = zfs_aclset_common(zp, aclp, cr, tx);
2389                 ASSERT0(err);
2390                 if (zp->z_acl_cached)
2391                         zfs_acl_free(zp->z_acl_cached);
2392                 zp->z_acl_cached = aclp;
2393                 aclp = NULL;
2394         }
2395
2396         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
2397                 zp->z_atime_dirty = B_FALSE;
2398                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
2399                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2400                     &atime, sizeof (atime));
2401         }
2402
2403         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
2404                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2405                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
2406                     vap->va_mtime, ZTOI(zp));
2407
2408                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2409                     mtime, sizeof (mtime));
2410         }
2411
2412         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
2413                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
2414                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
2415                     ZTOI(zp));
2416                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2417                     ctime, sizeof (ctime));
2418         }
2419
2420         if (projid != ZFS_INVALID_PROJID) {
2421                 zp->z_projid = projid;
2422                 SA_ADD_BULK_ATTR(bulk, count,
2423                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2424                     sizeof (zp->z_projid));
2425         }
2426
2427         if (attrzp && mask) {
2428                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2429                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
2430                     sizeof (ctime));
2431         }
2432
2433         /*
2434          * Do this after setting timestamps to prevent timestamp
2435          * update from toggling bit
2436          */
2437
2438         if (xoap && (mask & ATTR_XVATTR)) {
2439
2440                 /*
2441                  * restore trimmed off masks
2442                  * so that return masks can be set for caller.
2443                  */
2444
2445                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
2446                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2447                 }
2448                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
2449                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2450                 }
2451                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
2452                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2453                 }
2454                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
2455                         XVA_SET_REQ(xvap, XAT_NODUMP);
2456                 }
2457                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
2458                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2459                 }
2460                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
2461                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2462                 }
2463                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
2464                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2465                 }
2466
2467                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2468                         ASSERT(S_ISREG(ip->i_mode));
2469
2470                 zfs_xvattr_set(zp, xvap, tx);
2471         }
2472
2473         if (fuid_dirtied)
2474                 zfs_fuid_sync(zfsvfs, tx);
2475
2476         if (mask != 0)
2477                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2478
2479         mutex_exit(&zp->z_lock);
2480         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2481                 mutex_exit(&zp->z_acl_lock);
2482
2483         if (attrzp) {
2484                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
2485                         mutex_exit(&attrzp->z_acl_lock);
2486                 mutex_exit(&attrzp->z_lock);
2487         }
2488 out:
2489         if (err == 0 && xattr_count > 0) {
2490                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2491                     xattr_count, tx);
2492                 ASSERT(err2 == 0);
2493         }
2494
2495         if (aclp)
2496                 zfs_acl_free(aclp);
2497
2498         if (fuidp) {
2499                 zfs_fuid_info_free(fuidp);
2500                 fuidp = NULL;
2501         }
2502
2503         if (err) {
2504                 dmu_tx_abort(tx);
2505                 if (attrzp)
2506                         zrele(attrzp);
2507                 if (err == ERESTART)
2508                         goto top;
2509         } else {
2510                 if (count > 0)
2511                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2512                 dmu_tx_commit(tx);
2513                 if (attrzp) {
2514                         if (err2 == 0 && handle_eadir)
2515                                 err2 = zfs_setattr_dir(attrzp);
2516                         zrele(attrzp);
2517                 }
2518                 zfs_znode_update_vfs(zp);
2519         }
2520
2521 out2:
2522         if (os->os_sync == ZFS_SYNC_ALWAYS)
2523                 zil_commit(zilog, 0);
2524
2525 out3:
2526         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
2527         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
2528         kmem_free(tmpxvattr, sizeof (xvattr_t));
2529         ZFS_EXIT(zfsvfs);
2530         return (err);
2531 }
2532
2533 typedef struct zfs_zlock {
2534         krwlock_t       *zl_rwlock;     /* lock we acquired */
2535         znode_t         *zl_znode;      /* znode we held */
2536         struct zfs_zlock *zl_next;      /* next in list */
2537 } zfs_zlock_t;
2538
2539 /*
2540  * Drop locks and release vnodes that were held by zfs_rename_lock().
2541  */
2542 static void
2543 zfs_rename_unlock(zfs_zlock_t **zlpp)
2544 {
2545         zfs_zlock_t *zl;
2546
2547         while ((zl = *zlpp) != NULL) {
2548                 if (zl->zl_znode != NULL)
2549                         zfs_zrele_async(zl->zl_znode);
2550                 rw_exit(zl->zl_rwlock);
2551                 *zlpp = zl->zl_next;
2552                 kmem_free(zl, sizeof (*zl));
2553         }
2554 }
2555
2556 /*
2557  * Search back through the directory tree, using the ".." entries.
2558  * Lock each directory in the chain to prevent concurrent renames.
2559  * Fail any attempt to move a directory into one of its own descendants.
2560  * XXX - z_parent_lock can overlap with map or grow locks
2561  */
2562 static int
2563 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2564 {
2565         zfs_zlock_t     *zl;
2566         znode_t         *zp = tdzp;
2567         uint64_t        rootid = ZTOZSB(zp)->z_root;
2568         uint64_t        oidp = zp->z_id;
2569         krwlock_t       *rwlp = &szp->z_parent_lock;
2570         krw_t           rw = RW_WRITER;
2571
2572         /*
2573          * First pass write-locks szp and compares to zp->z_id.
2574          * Later passes read-lock zp and compare to zp->z_parent.
2575          */
2576         do {
2577                 if (!rw_tryenter(rwlp, rw)) {
2578                         /*
2579                          * Another thread is renaming in this path.
2580                          * Note that if we are a WRITER, we don't have any
2581                          * parent_locks held yet.
2582                          */
2583                         if (rw == RW_READER && zp->z_id > szp->z_id) {
2584                                 /*
2585                                  * Drop our locks and restart
2586                                  */
2587                                 zfs_rename_unlock(&zl);
2588                                 *zlpp = NULL;
2589                                 zp = tdzp;
2590                                 oidp = zp->z_id;
2591                                 rwlp = &szp->z_parent_lock;
2592                                 rw = RW_WRITER;
2593                                 continue;
2594                         } else {
2595                                 /*
2596                                  * Wait for other thread to drop its locks
2597                                  */
2598                                 rw_enter(rwlp, rw);
2599                         }
2600                 }
2601
2602                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2603                 zl->zl_rwlock = rwlp;
2604                 zl->zl_znode = NULL;
2605                 zl->zl_next = *zlpp;
2606                 *zlpp = zl;
2607
2608                 if (oidp == szp->z_id)          /* We're a descendant of szp */
2609                         return (SET_ERROR(EINVAL));
2610
2611                 if (oidp == rootid)             /* We've hit the top */
2612                         return (0);
2613
2614                 if (rw == RW_READER) {          /* i.e. not the first pass */
2615                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
2616                         if (error)
2617                                 return (error);
2618                         zl->zl_znode = zp;
2619                 }
2620                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
2621                     &oidp, sizeof (oidp));
2622                 rwlp = &zp->z_parent_lock;
2623                 rw = RW_READER;
2624
2625         } while (zp->z_id != sdzp->z_id);
2626
2627         return (0);
2628 }
2629
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *      IN:     sdzp    - Source directory containing the "old entry".
 *              snm     - Old entry name.
 *              tdzp    - Target directory to contain the "new entry".
 *              tnm     - New entry name.
 *              cr      - credentials of caller.
 *              flags   - case flags
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Errors generated directly by this function (errors from the helpers it
 * calls are passed through unchanged):
 *      EINVAL  - snm or tnm is NULL, the ZFS_XATTR flag differs between
 *                sdzp and tdzp, or an entry lookup failed and the name
 *                looked up was "..".
 *      EXDEV   - sdzp and tdzp have different super blocks, tdzp is a
 *                ctldir node, or tdzp has ZFS_PROJINHERIT set and its
 *                project ID differs from that of the source entry.
 *      EILSEQ  - zfsvfs->z_utf8 is set and tnm fails UTF-8 validation.
 *      ENOTDIR - the source is a directory but the existing target is not.
 *      EISDIR  - the source is not a directory but the existing target is.
 *
 * If the transaction cannot be assigned immediately (ERESTART), all locks
 * and references taken so far are dropped and the operation restarts from
 * the "top" label.
 *
 * Timestamps:
 *      sdzp,tdzp - ctime|mtime updated
 */
int
zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
    cred_t *cr, int flags)
{
        znode_t         *szp, *tzp;
        zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
        zilog_t         *zilog;
        zfs_dirlock_t   *sdl, *tdl;
        dmu_tx_t        *tx;
        zfs_zlock_t     *zl;
        int             cmp, serr, terr;
        int             error = 0;
        int             zflg = 0;
        boolean_t       waited = B_FALSE;

        if (snm == NULL || tnm == NULL)
                return (SET_ERROR(EINVAL));

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(sdzp);
        zilog = zfsvfs->z_log;

        ZFS_VERIFY_ZP(tdzp);

        /*
         * We check i_sb because snapshots and the ctldir must have different
         * super blocks.
         */
        if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
            zfsctl_is_node(ZTOI(tdzp))) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EXDEV));
        }

        /* Only the new name is validated here; snm must already exist. */
        if (zfsvfs->z_utf8 && u8_validate(tnm,
            strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }

        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;

        /*
         * Retry point: we come back here after dmu_tx_assign() returns
         * ERESTART.  All per-attempt state is reset below.
         */
top:
        szp = NULL;
        tzp = NULL;
        zl = NULL;

        /*
         * This is to prevent the creation of links into attribute space
         * by renaming a linked file into/out of an attribute directory.
         * See the comment in zfs_link() for why this is considered bad.
         */
        if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EINVAL));
        }

        /*
         * Lock source and target directory entries.  To prevent deadlock,
         * a lock ordering must be defined.  We lock the directory with
         * the smallest object id first, or if it's a tie, the one with
         * the lexically first name.
         */
        if (sdzp->z_id < tdzp->z_id) {
                cmp = -1;
        } else if (sdzp->z_id > tdzp->z_id) {
                cmp = 1;
        } else {
                /*
                 * First compare the two name arguments without
                 * considering any case folding.
                 */
                int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

                cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
                ASSERT(error == 0 || !zfsvfs->z_utf8);
                if (cmp == 0) {
                        /*
                         * POSIX: "If the old argument and the new argument
                         * both refer to links to the same existing file,
                         * the rename() function shall return successfully
                         * and perform no other action."
                         */
                        ZFS_EXIT(zfsvfs);
                        return (0);
                }
                /*
                 * If the file system is case-folding, then we may
                 * have some more checking to do.  A case-folding file
                 * system is either supporting mixed case sensitivity
                 * access or is completely case-insensitive.  Note
                 * that the file system is always case preserving.
                 *
                 * In mixed sensitivity mode case sensitive behavior
                 * is the default.  FIGNORECASE must be used to
                 * explicitly request case insensitive behavior.
                 *
                 * If the source and target names provided differ only
                 * by case (e.g., a request to rename 'tim' to 'Tim'),
                 * we will treat this as a special case in the
                 * case-insensitive mode: as long as the source name
                 * is an exact match, we will allow this to proceed as
                 * a name-change request.
                 */
                if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
                    (zfsvfs->z_case == ZFS_CASE_MIXED &&
                    flags & FIGNORECASE)) &&
                    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
                    &error) == 0) {
                        /*
                         * case preserving rename request, require exact
                         * name matches
                         */
                        zflg |= ZCIEXACT;
                        zflg &= ~ZCILOOK;
                }
        }

        /*
         * If the source and destination directories are the same, we should
         * grab the z_name_lock of that directory only once.
         */
        if (sdzp == tdzp) {
                zflg |= ZHAVELOCK;
                rw_enter(&sdzp->z_name_lock, RW_READER);
        }

        /*
         * Take the two dirent locks in the order chosen above.  The source
         * lookup always requires the entry to exist (ZEXISTS); whichever
         * entry is locked second is looked up with ZRENAMING.
         */
        if (cmp < 0) {
                serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
                    ZEXISTS | zflg, NULL, NULL);
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
        } else {
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, zflg, NULL, NULL);
                serr = zfs_dirent_lock(&sdl,
                    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
                    NULL, NULL);
        }

        if (serr) {
                /*
                 * Source entry invalid or not there.
                 * The target lock and znode are only released if the
                 * target lookup itself succeeded.
                 */
                if (!terr) {
                        zfs_dirent_unlock(tdl);
                        if (tzp)
                                zrele(tzp);
                }

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                /* A failed lookup of ".." is always reported as EINVAL. */
                if (strcmp(snm, "..") == 0)
                        serr = EINVAL;
                ZFS_EXIT(zfsvfs);
                return (serr);
        }
        if (terr) {
                /* Source lookup succeeded, so undo it before bailing out. */
                zfs_dirent_unlock(sdl);
                zrele(szp);

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                /* A failed lookup of ".." is always reported as EINVAL. */
                if (strcmp(tnm, "..") == 0)
                        terr = EINVAL;
                ZFS_EXIT(zfsvfs);
                return (terr);
        }

        /*
         * From here on both dirent locks are held and szp is referenced,
         * so failures leave through the "out" label.
         */

        /*
         * If we are using project inheritance, means if the directory has
         * ZFS_PROJINHERIT set, then its descendant directories will inherit
         * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
         * such case, we only allow renames into our tree when the project
         * IDs are the same.
         */
        if (tdzp->z_pflags & ZFS_PROJINHERIT &&
            tdzp->z_projid != szp->z_projid) {
                error = SET_ERROR(EXDEV);
                goto out;
        }

        /*
         * Must have write access at the source to remove the old entry
         * and write access at the target to create the new entry.
         * Note that if target and source are the same, this can be
         * done in a single check.
         */

        if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
                goto out;

        if (S_ISDIR(ZTOI(szp)->i_mode)) {
                /*
                 * Check to make sure rename is valid.
                 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
                 *
                 * Even on failure zl may hold locks; "out" releases them.
                 */
                if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
                        goto out;
        }

        /*
         * Does target exist?
         */
        if (tzp) {
                /*
                 * Source and target must be the same type.
                 */
                if (S_ISDIR(ZTOI(szp)->i_mode)) {
                        if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
                                error = SET_ERROR(ENOTDIR);
                                goto out;
                        }
                } else {
                        if (S_ISDIR(ZTOI(tzp)->i_mode)) {
                                error = SET_ERROR(EISDIR);
                                goto out;
                        }
                }
                /*
                 * POSIX dictates that when the source and target
                 * entries refer to the same file object, rename
                 * must do nothing and exit without error.
                 */
                if (szp->z_id == tzp->z_id) {
                        error = 0;
                        goto out;
                }
        }

        /*
         * Build the transaction: declare holds on the SA of every znode
         * involved and on both directory ZAPs before assigning it.
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
        if (sdzp != tdzp) {
                dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, tdzp);
        }
        if (tzp) {
                dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, tzp);
        }

        zfs_sa_upgrade_txholds(tx, szp);
        /*
         * NOTE(review): presumably held because destroying an existing
         * target may add it to the unlinked set - confirm against
         * zfs_link_destroy().
         */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        /* After one wait, retry with TXG_NOTHROTTLE added. */
        error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
        if (error) {
                /*
                 * Drop every lock taken in this attempt before either
                 * waiting for the next txg or returning the error.
                 */
                if (zl != NULL)
                        zfs_rename_unlock(&zl);
                zfs_dirent_unlock(sdl);
                zfs_dirent_unlock(tdl);

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                if (error == ERESTART) {
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        zrele(szp);
                        if (tzp)
                                zrele(tzp);
                        goto top;
                }
                dmu_tx_abort(tx);
                zrele(szp);
                if (tzp)
                        zrele(tzp);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (tzp)        /* Attempt to remove the existing target */
                error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

        if (error == 0) {
                error = zfs_link_create(tdl, szp, tx, ZRENAMING);
                if (error == 0) {
                        /*
                         * The renamed object is marked ZFS_AV_MODIFIED and
                         * picks up ZFS_PROJINHERIT from its new parent.
                         */
                        szp->z_pflags |= ZFS_AV_MODIFIED;
                        if (tdzp->z_pflags & ZFS_PROJINHERIT)
                                szp->z_pflags |= ZFS_PROJINHERIT;

                        error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
                            (void *)&szp->z_pflags, sizeof (uint64_t), tx);
                        ASSERT0(error);

                        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
                        if (error == 0) {
                                zfs_log_rename(zilog, tx, TX_RENAME |
                                    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
                                    sdl->dl_name, tdzp, tdl->dl_name, szp);
                        } else {
                                /*
                                 * At this point, we have successfully created
                                 * the target name, but have failed to remove
                                 * the source name.  Since the create was done
                                 * with the ZRENAMING flag, there are
                                 * complications; for one, the link count is
                                 * wrong.  The easiest way to deal with this
                                 * is to remove the newly created target, and
                                 * return the original error.  This must
                                 * succeed; fortunately, it is very unlikely to
                                 * fail, since we just created it.
                                 */
                                VERIFY3U(zfs_link_destroy(tdl, szp, tx,
                                    ZRENAMING, NULL), ==, 0);
                        }
                } else {
                        /*
                         * If we removed an existing target, the subsequent
                         * zfs_link_create() call re-adds the same name with
                         * the new dnode (szp) and is not expected to fail.
                         */
                        ASSERT(tzp == NULL);
                }
        }

        /* The transaction is committed whether or not the steps succeeded. */
        dmu_tx_commit(tx);
        /*
         * Common exit for every path that holds both dirent locks and a
         * reference on szp: release the locks, update the VFS view of the
         * znodes involved and drop the references.
         */
out:
        if (zl != NULL)
                zfs_rename_unlock(&zl);

        zfs_dirent_unlock(sdl);
        zfs_dirent_unlock(tdl);

        zfs_znode_update_vfs(sdzp);
        if (sdzp == tdzp)
                rw_exit(&sdzp->z_name_lock);

        if (sdzp != tdzp)
                zfs_znode_update_vfs(tdzp);

        zfs_znode_update_vfs(szp);
        zrele(szp);
        if (tzp) {
                zfs_znode_update_vfs(tzp);
                zrele(tzp);
        }

        /* sync=always: commit the intent log before returning. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        ZFS_EXIT(zfsvfs);
        return (error);
}
2995
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *      IN:     dzp     - Directory to contain new symbolic link.
 *              name    - Name of directory entry in dzp.
 *              vap     - Attributes of new entry.
 *              link    - Target path text stored in the new symlink.
 *              cr      - credentials of caller.
 *              flags   - case flags
 *
 *      OUT:    zpp     - Znode for new symbolic link.  Set to NULL unless
 *                        the symlink was created successfully.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Errors generated directly by this function (errors from the helpers it
 * calls are passed through unchanged):
 *      EINVAL       - name is NULL.
 *      EILSEQ       - zfsvfs->z_utf8 is set and name fails UTF-8 validation.
 *      ENAMETOOLONG - strlen(link) exceeds MAXPATHLEN.
 *      EDQUOT       - zfs_acl_ids_overquota() reports the new object would
 *                     be over quota.
 *
 * Timestamps:
 *      dzp - ctime|mtime updated
 */
int
zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
    znode_t **zpp, cred_t *cr, int flags)
{
        znode_t         *zp;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
        zilog_t         *zilog;
        uint64_t        len = strlen(link);
        int             error;
        int             zflg = ZNEW;
        zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
        uint64_t        txtype = TX_SYMLINK;
        boolean_t       waited = B_FALSE;

        ASSERT(S_ISLNK(vap->va_mode));

        if (name == NULL)
                return (SET_ERROR(EINVAL));

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;

        if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
            NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }
        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;

        if (len > MAXPATHLEN) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(ENAMETOOLONG));
        }

        /*
         * The ACL ids are created once, before the retry label, and are
         * reused across retries.  Every exit path below frees them.
         */
        if ((error = zfs_acl_ids_create(dzp, 0,
            vap, cr, NULL, &acl_ids)) != 0) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        /* Retry point after dmu_tx_assign() returns ERESTART. */
top:
        *zpp = NULL;

        /*
         * Attempt to lock directory; fail if entry already exists.
         */
        error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
        if (error) {
                zfs_acl_ids_free(&acl_ids);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EDQUOT));
        }
        /*
         * Build the transaction: holds for the new object's data (at least
         * one byte even for an empty link), the directory ZAP entry, the
         * new SA (ACL + base attributes + link text) and the directory SA.
         */
        tx = dmu_tx_create(zfsvfs->z_os);
        fuid_dirtied = zfsvfs->z_fuid_dirty;
        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
        dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
            ZFS_SA_BASE_ATTR_SIZE + len);
        dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
        if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
                    acl_ids.z_aclp->z_acl_bytes);
        }
        if (fuid_dirtied)
                zfs_fuid_txhold(zfsvfs, tx);
        /* After one wait, retry with TXG_NOTHROTTLE added. */
        error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        /* Keep acl_ids; they are reused on the next pass. */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                zfs_acl_ids_free(&acl_ids);
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /*
         * Create a new object for the symlink.
         * for version 4 ZPL datasets the symlink will be an SA attribute
         */
        zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);

        /*
         * Store the link text, either as the SA_ZPL_SYMLINK attribute or
         * through zfs_sa_symlink() for non-SA znodes.
         *
         * NOTE(review): the sa_update() result assigned to error here is
         * overwritten by zfs_link_create() below before it is examined -
         * confirm that ignoring it is intentional.
         */
        mutex_enter(&zp->z_lock);
        if (zp->z_is_sa)
                error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
                    link, len, tx);
        else
                zfs_sa_symlink(zp, link, len, tx);
        mutex_exit(&zp->z_lock);

        /* The symlink's size is the length of its target text. */
        zp->z_size = len;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
            &zp->z_size, sizeof (zp->z_size), tx);
        /*
         * Insert the new object into the directory.
         */
        error = zfs_link_create(dl, zp, tx, ZNEW);
        if (error != 0) {
                /* Undo the object creation within the same transaction. */
                zfs_znode_delete(zp, tx);
                remove_inode_hash(ZTOI(zp));
        } else {
                if (flags & FIGNORECASE)
                        txtype |= TX_CI;
                zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

                zfs_znode_update_vfs(dzp);
                zfs_znode_update_vfs(zp);
        }

        zfs_acl_ids_free(&acl_ids);

        dmu_tx_commit(tx);

        zfs_dirent_unlock(dl);

        if (error == 0) {
                /* Hand the new znode to the caller. */
                *zpp = zp;

                /* sync=always: commit the intent log before returning. */
                if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                        zil_commit(zilog, 0);
        } else {
                zrele(zp);
        }

        ZFS_EXIT(zfsvfs);
        return (error);
}
3165
3166 /*
3167  * Return, in the buffer contained in the provided uio structure,
3168  * the symbolic path referred to by ip.
3169  *
3170  *      IN:     ip      - inode of symbolic link
3171  *              uio     - structure to contain the link path.
3172  *              cr      - credentials of caller.
3173  *
3174  *      RETURN: 0 if success
3175  *              error code if failure
3176  *
3177  * Timestamps:
3178  *      ip - atime updated
3179  */
3180 int
3181 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
3182 {
3183         (void) cr;
3184         znode_t         *zp = ITOZ(ip);
3185         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3186         int             error;
3187
3188         ZFS_ENTER(zfsvfs);
3189         ZFS_VERIFY_ZP(zp);
3190
3191         mutex_enter(&zp->z_lock);
3192         if (zp->z_is_sa)
3193                 error = sa_lookup_uio(zp->z_sa_hdl,
3194                     SA_ZPL_SYMLINK(zfsvfs), uio);
3195         else
3196                 error = zfs_sa_readlink(zp, uio);
3197         mutex_exit(&zp->z_lock);
3198
3199         ZFS_EXIT(zfsvfs);
3200         return (error);
3201 }
3202
3203 /*
3204  * Insert a new entry into directory tdzp referencing szp.
3205  *
3206  *      IN:     tdzp    - Directory to contain new entry.
3207  *              szp     - znode of new entry.
3208  *              name    - name of new entry.
3209  *              cr      - credentials of caller.
3210  *              flags   - case flags.
3211  *
3212  *      RETURN: 0 if success
3213  *              error code if failure
3214  *
3215  * Timestamps:
3216  *      tdzp - ctime|mtime updated
3217  *       szp - ctime updated
3218  */
3219 int
3220 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
3221     int flags)
3222 {
3223         struct inode *sip = ZTOI(szp);
3224         znode_t         *tzp;
3225         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
3226         zilog_t         *zilog;
3227         zfs_dirlock_t   *dl;
3228         dmu_tx_t        *tx;
3229         int             error;
3230         int             zf = ZNEW;
3231         uint64_t        parent;
3232         uid_t           owner;
3233         boolean_t       waited = B_FALSE;
3234         boolean_t       is_tmpfile = 0;
3235         uint64_t        txg;
3236 #ifdef HAVE_TMPFILE
3237         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
3238 #endif
3239         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
3240
3241         if (name == NULL)
3242                 return (SET_ERROR(EINVAL));
3243
3244         ZFS_ENTER(zfsvfs);
3245         ZFS_VERIFY_ZP(tdzp);
3246         zilog = zfsvfs->z_log;
3247
3248         /*
3249          * POSIX dictates that we return EPERM here.
3250          * Better choices include ENOTSUP or EISDIR.
3251          */
3252         if (S_ISDIR(sip->i_mode)) {
3253                 ZFS_EXIT(zfsvfs);
3254                 return (SET_ERROR(EPERM));
3255         }
3256
3257         ZFS_VERIFY_ZP(szp);
3258
3259         /*
3260          * If we are using project inheritance, means if the directory has
3261          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3262          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3263          * such case, we only allow hard link creation in our tree when the
3264          * project IDs are the same.
3265          */
3266         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3267             tdzp->z_projid != szp->z_projid) {
3268                 ZFS_EXIT(zfsvfs);
3269                 return (SET_ERROR(EXDEV));
3270         }
3271
3272         /*
3273          * We check i_sb because snapshots and the ctldir must have different
3274          * super blocks.
3275          */
3276         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
3277                 ZFS_EXIT(zfsvfs);
3278                 return (SET_ERROR(EXDEV));
3279         }
3280
3281         /* Prevent links to .zfs/shares files */
3282
3283         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3284             &parent, sizeof (uint64_t))) != 0) {
3285                 ZFS_EXIT(zfsvfs);
3286                 return (error);
3287         }
3288         if (parent == zfsvfs->z_shares_dir) {
3289                 ZFS_EXIT(zfsvfs);
3290                 return (SET_ERROR(EPERM));
3291         }
3292
3293         if (zfsvfs->z_utf8 && u8_validate(name,
3294             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3295                 ZFS_EXIT(zfsvfs);
3296                 return (SET_ERROR(EILSEQ));
3297         }
3298         if (flags & FIGNORECASE)
3299                 zf |= ZCILOOK;
3300
3301         /*
3302          * We do not support links between attributes and non-attributes
3303          * because of the potential security risk of creating links
3304          * into "normal" file space in order to circumvent restrictions
3305          * imposed in attribute space.
3306          */
3307         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3308                 ZFS_EXIT(zfsvfs);
3309                 return (SET_ERROR(EINVAL));
3310         }
3311
3312         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
3313             cr, ZFS_OWNER);
3314         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3315                 ZFS_EXIT(zfsvfs);
3316                 return (SET_ERROR(EPERM));
3317         }
3318
3319         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
3320                 ZFS_EXIT(zfsvfs);
3321                 return (error);
3322         }
3323
3324 top:
3325         /*
3326          * Attempt to lock directory; fail if entry already exists.
3327          */
3328         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
3329         if (error) {
3330                 ZFS_EXIT(zfsvfs);
3331                 return (error);
3332         }
3333
3334         tx = dmu_tx_create(zfsvfs->z_os);
3335         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3336         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3337         if (is_tmpfile)
3338                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3339
3340         zfs_sa_upgrade_txholds(tx, szp);
3341         zfs_sa_upgrade_txholds(tx, tdzp);
3342         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3343         if (error) {
3344                 zfs_dirent_unlock(dl);
3345                 if (error == ERESTART) {
3346                         waited = B_TRUE;
3347                         dmu_tx_wait(tx);
3348                         dmu_tx_abort(tx);
3349                         goto top;
3350                 }
3351                 dmu_tx_abort(tx);
3352                 ZFS_EXIT(zfsvfs);
3353                 return (error);
3354         }
3355         /* unmark z_unlinked so zfs_link_create will not reject */
3356         if (is_tmpfile)
3357                 szp->z_unlinked = B_FALSE;
3358         error = zfs_link_create(dl, szp, tx, 0);
3359
3360         if (error == 0) {
3361                 uint64_t txtype = TX_LINK;
3362                 /*
3363                  * tmpfile is created to be in z_unlinkedobj, so remove it.
3364                  * Also, we don't log in ZIL, because all previous file
3365                  * operation on the tmpfile are ignored by ZIL. Instead we
3366                  * always wait for txg to sync to make sure all previous
3367                  * operation are sync safe.
3368                  */
3369                 if (is_tmpfile) {
3370                         VERIFY(zap_remove_int(zfsvfs->z_os,
3371                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
3372                 } else {
3373                         if (flags & FIGNORECASE)
3374                                 txtype |= TX_CI;
3375                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3376                 }
3377         } else if (is_tmpfile) {
3378                 /* restore z_unlinked since when linking failed */
3379                 szp->z_unlinked = B_TRUE;
3380         }
3381         txg = dmu_tx_get_txg(tx);
3382         dmu_tx_commit(tx);
3383
3384         zfs_dirent_unlock(dl);
3385
3386         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3387                 zil_commit(zilog, 0);
3388
3389         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
3390                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
3391
3392         zfs_znode_update_vfs(tdzp);
3393         zfs_znode_update_vfs(szp);
3394         ZFS_EXIT(zfsvfs);
3395         return (error);
3396 }
3397
/*
 * Commit callback handed to zfs_log_write() by zfs_putpage().  "arg" is
 * the page that was written: clear its error flag and end writeback on it.
 */
static void
zfs_putpage_commit_cb(void *arg)
{
        struct page *written_page = arg;

        ClearPageError(written_page);
        end_page_writeback(written_page);
}
3406
3407 /*
3408  * Push a page out to disk, once the page is on stable storage the
3409  * registered commit callback will be run as notification of completion.
3410  *
3411  *      IN:     ip      - page mapped for inode.
3412  *              pp      - page to push (page is locked)
3413  *              wbc     - writeback control data
3414  *
3415  *      RETURN: 0 if success
3416  *              error code if failure
3417  *
3418  * Timestamps:
3419  *      ip - ctime|mtime updated
3420  */
3421 int
3422 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
3423 {
3424         znode_t         *zp = ITOZ(ip);
3425         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3426         loff_t          offset;
3427         loff_t          pgoff;
3428         unsigned int    pglen;
3429         dmu_tx_t        *tx;
3430         caddr_t         va;
3431         int             err = 0;
3432         uint64_t        mtime[2], ctime[2];
3433         sa_bulk_attr_t  bulk[3];
3434         int             cnt = 0;
3435         struct address_space *mapping;
3436
3437         ZFS_ENTER(zfsvfs);
3438         ZFS_VERIFY_ZP(zp);
3439
3440         ASSERT(PageLocked(pp));
3441
3442         pgoff = page_offset(pp);        /* Page byte-offset in file */
3443         offset = i_size_read(ip);       /* File length in bytes */
3444         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
3445             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
3446
3447         /* Page is beyond end of file */
3448         if (pgoff >= offset) {
3449                 unlock_page(pp);
3450                 ZFS_EXIT(zfsvfs);
3451                 return (0);
3452         }
3453
3454         /* Truncate page length to end of file */
3455         if (pgoff + pglen > offset)
3456                 pglen = offset - pgoff;
3457
3458 #if 0
3459         /*
3460          * FIXME: Allow mmap writes past its quota.  The correct fix
3461          * is to register a page_mkwrite() handler to count the page
3462          * against its quota when it is about to be dirtied.
3463          */
3464         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
3465             KUID_TO_SUID(ip->i_uid)) ||
3466             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
3467             KGID_TO_SGID(ip->i_gid)) ||
3468             (zp->z_projid != ZFS_DEFAULT_PROJID &&
3469             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
3470             zp->z_projid))) {
3471                 err = EDQUOT;
3472         }
3473 #endif
3474
3475         /*
3476          * The ordering here is critical and must adhere to the following
3477          * rules in order to avoid deadlocking in either zfs_read() or
3478          * zfs_free_range() due to a lock inversion.
3479          *
3480          * 1) The page must be unlocked prior to acquiring the range lock.
3481          *    This is critical because zfs_read() calls find_lock_page()
3482          *    which may block on the page lock while holding the range lock.
3483          *
3484          * 2) Before setting or clearing write back on a page the range lock
3485          *    must be held in order to prevent a lock inversion with the
3486          *    zfs_free_range() function.
3487          *
3488          * This presents a problem because upon entering this function the
3489          * page lock is already held.  To safely acquire the range lock the
3490          * page lock must be dropped.  This creates a window where another
3491          * process could truncate, invalidate, dirty, or write out the page.
3492          *
3493          * Therefore, after successfully reacquiring the range and page locks
3494          * the current page state is checked.  In the common case everything
3495          * will be as is expected and it can be written out.  However, if
3496          * the page state has changed it must be handled accordingly.
3497          */
3498         mapping = pp->mapping;
3499         redirty_page_for_writepage(wbc, pp);
3500         unlock_page(pp);
3501
3502         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
3503             pgoff, pglen, RL_WRITER);
3504         lock_page(pp);
3505
3506         /* Page mapping changed or it was no longer dirty, we're done */
3507         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
3508                 unlock_page(pp);
3509                 zfs_rangelock_exit(lr);
3510                 ZFS_EXIT(zfsvfs);
3511                 return (0);
3512         }
3513
3514         /* Another process started write block if required */
3515         if (PageWriteback(pp)) {
3516                 unlock_page(pp);
3517                 zfs_rangelock_exit(lr);
3518
3519                 if (wbc->sync_mode != WB_SYNC_NONE) {
3520                         if (PageWriteback(pp))
3521 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
3522                                 folio_wait_bit(page_folio(pp), PG_writeback);
3523 #else
3524                                 wait_on_page_bit(pp, PG_writeback);
3525 #endif
3526                 }
3527
3528                 ZFS_EXIT(zfsvfs);
3529                 return (0);
3530         }
3531
3532         /* Clear the dirty flag the required locks are held */
3533         if (!clear_page_dirty_for_io(pp)) {
3534                 unlock_page(pp);
3535                 zfs_rangelock_exit(lr);
3536                 ZFS_EXIT(zfsvfs);
3537                 return (0);
3538         }
3539
3540         /*
3541          * Counterpart for redirty_page_for_writepage() above.  This page
3542          * was in fact not skipped and should not be counted as if it were.
3543          */
3544         wbc->pages_skipped--;
3545         set_page_writeback(pp);
3546         unlock_page(pp);
3547
3548         tx = dmu_tx_create(zfsvfs->z_os);
3549         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
3550         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3551         zfs_sa_upgrade_txholds(tx, zp);
3552
3553         err = dmu_tx_assign(tx, TXG_NOWAIT);
3554         if (err != 0) {
3555                 if (err == ERESTART)
3556                         dmu_tx_wait(tx);
3557
3558                 dmu_tx_abort(tx);
3559 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
3560                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
3561 #else
3562                 __set_page_dirty_nobuffers(pp);
3563 #endif
3564                 ClearPageError(pp);
3565                 end_page_writeback(pp);
3566                 zfs_rangelock_exit(lr);
3567                 ZFS_EXIT(zfsvfs);
3568                 return (err);
3569         }
3570
3571         va = kmap(pp);
3572         ASSERT3U(pglen, <=, PAGE_SIZE);
3573         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
3574         kunmap(pp);
3575
3576         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3577         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3578         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
3579             &zp->z_pflags, 8);
3580
3581         /* Preserve the mtime and ctime provided by the inode */
3582         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3583         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3584         zp->z_atime_dirty = B_FALSE;
3585         zp->z_seq++;
3586
3587         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3588
3589         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
3590             zfs_putpage_commit_cb, pp);
3591         dmu_tx_commit(tx);
3592
3593         zfs_rangelock_exit(lr);
3594
3595         if (wbc->sync_mode != WB_SYNC_NONE) {
3596                 /*
3597                  * Note that this is rarely called under writepages(), because
3598                  * writepages() normally handles the entire commit for
3599                  * performance reasons.
3600                  */
3601                 zil_commit(zfsvfs->z_log, zp->z_id);
3602         }
3603
3604         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
3605
3606         ZFS_EXIT(zfsvfs);
3607         return (err);
3608 }
3609
3610 /*
3611  * Update the system attributes when the inode has been dirtied.  For the
3612  * moment we only update the mode, atime, mtime, and ctime.
3613  */
3614 int
3615 zfs_dirty_inode(struct inode *ip, int flags)
3616 {
3617         znode_t         *zp = ITOZ(ip);
3618         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3619         dmu_tx_t        *tx;
3620         uint64_t        mode, atime[2], mtime[2], ctime[2];
3621         sa_bulk_attr_t  bulk[4];
3622         int             error = 0;
3623         int             cnt = 0;
3624
3625         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
3626                 return (0);
3627
3628         ZFS_ENTER(zfsvfs);
3629         ZFS_VERIFY_ZP(zp);
3630
3631 #ifdef I_DIRTY_TIME
3632         /*
3633          * This is the lazytime semantic introduced in Linux 4.0
3634          * This flag will only be called from update_time when lazytime is set.
3635          * (Note, I_DIRTY_SYNC will also set if not lazytime)
3636          * Fortunately mtime and ctime are managed within ZFS itself, so we
3637          * only need to dirty atime.
3638          */
3639         if (flags == I_DIRTY_TIME) {
3640                 zp->z_atime_dirty = B_TRUE;
3641                 goto out;
3642         }
3643 #endif
3644
3645         tx = dmu_tx_create(zfsvfs->z_os);
3646
3647         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3648         zfs_sa_upgrade_txholds(tx, zp);
3649
3650         error = dmu_tx_assign(tx, TXG_WAIT);
3651         if (error) {
3652                 dmu_tx_abort(tx);
3653                 goto out;
3654         }
3655
3656         mutex_enter(&zp->z_lock);
3657         zp->z_atime_dirty = B_FALSE;
3658
3659         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
3660         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
3661         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3662         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3663
3664         /* Preserve the mode, mtime and ctime provided by the inode */
3665         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3666         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
3667         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
3668         mode = ip->i_mode;
3669
3670         zp->z_mode = mode;
3671
3672         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
3673         mutex_exit(&zp->z_lock);
3674
3675         dmu_tx_commit(tx);
3676 out:
3677         ZFS_EXIT(zfsvfs);
3678         return (error);
3679 }
3680
3681 void
3682 zfs_inactive(struct inode *ip)
3683 {
3684         znode_t *zp = ITOZ(ip);
3685         zfsvfs_t *zfsvfs = ITOZSB(ip);
3686         uint64_t atime[2];
3687         int error;
3688         int need_unlock = 0;
3689
3690         /* Only read lock if we haven't already write locked, e.g. rollback */
3691         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
3692                 need_unlock = 1;
3693                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3694         }
3695         if (zp->z_sa_hdl == NULL) {
3696                 if (need_unlock)
3697                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
3698                 return;
3699         }
3700
3701         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
3702                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3703
3704                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3705                 zfs_sa_upgrade_txholds(tx, zp);
3706                 error = dmu_tx_assign(tx, TXG_WAIT);
3707                 if (error) {
3708                         dmu_tx_abort(tx);
3709                 } else {
3710                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
3711                         mutex_enter(&zp->z_lock);
3712                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3713                             (void *)&atime, sizeof (atime), tx);
3714                         zp->z_atime_dirty = B_FALSE;
3715                         mutex_exit(&zp->z_lock);
3716                         dmu_tx_commit(tx);
3717                 }
3718         }
3719
3720         zfs_zinactive(zp);
3721         if (need_unlock)
3722                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
3723 }
3724
3725 /*
3726  * Fill pages with data from the disk.
3727  */
3728 static int
3729 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
3730 {
3731         znode_t *zp = ITOZ(ip);
3732         zfsvfs_t *zfsvfs = ITOZSB(ip);
3733         objset_t *os;
3734         struct page *cur_pp;
3735         u_offset_t io_off, total;
3736         size_t io_len;
3737         loff_t i_size;
3738         unsigned page_idx;
3739         int err;
3740
3741         os = zfsvfs->z_os;
3742         io_len = nr_pages << PAGE_SHIFT;
3743         i_size = i_size_read(ip);
3744         io_off = page_offset(pl[0]);
3745
3746         if (io_off + io_len > i_size)
3747                 io_len = i_size - io_off;
3748
3749         /*
3750          * Iterate over list of pages and read each page individually.
3751          */
3752         page_idx = 0;
3753         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
3754                 caddr_t va;
3755
3756                 cur_pp = pl[page_idx++];
3757                 va = kmap(cur_pp);
3758                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
3759                     DMU_READ_PREFETCH);
3760                 kunmap(cur_pp);
3761                 if (err) {
3762                         /* convert checksum errors into IO errors */
3763                         if (err == ECKSUM)
3764                                 err = SET_ERROR(EIO);
3765                         return (err);
3766                 }
3767         }
3768
3769         return (0);
3770 }
3771
3772 /*
3773  * Uses zfs_fillpage to read data from the file and fill the pages.
3774  *
3775  *      IN:     ip       - inode of file to get data from.
3776  *              pl       - list of pages to read
3777  *              nr_pages - number of pages to read
3778  *
3779  *      RETURN: 0 on success, error code on failure.
3780  *
3781  * Timestamps:
3782  *      vp - atime updated
3783  */
3784 int
3785 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
3786 {
3787         znode_t  *zp  = ITOZ(ip);
3788         zfsvfs_t *zfsvfs = ITOZSB(ip);
3789         int      err;
3790
3791         if (pl == NULL)
3792                 return (0);
3793
3794         ZFS_ENTER(zfsvfs);
3795         ZFS_VERIFY_ZP(zp);
3796
3797         err = zfs_fillpage(ip, pl, nr_pages);
3798
3799         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
3800
3801         ZFS_EXIT(zfsvfs);
3802         return (err);
3803 }
3804
3805 /*
3806  * Check ZFS specific permissions to memory map a section of a file.
3807  *
3808  *      IN:     ip      - inode of the file to mmap
3809  *              off     - file offset
3810  *              addrp   - start address in memory region
3811  *              len     - length of memory region
3812  *              vm_flags- address flags
3813  *
3814  *      RETURN: 0 if success
3815  *              error code if failure
3816  */
3817 int
3818 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
3819     unsigned long vm_flags)
3820 {
3821         (void) addrp;
3822         znode_t  *zp = ITOZ(ip);
3823         zfsvfs_t *zfsvfs = ITOZSB(ip);
3824
3825         ZFS_ENTER(zfsvfs);
3826         ZFS_VERIFY_ZP(zp);
3827
3828         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
3829             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
3830                 ZFS_EXIT(zfsvfs);
3831                 return (SET_ERROR(EPERM));
3832         }
3833
3834         if ((vm_flags & (VM_READ | VM_EXEC)) &&
3835             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
3836                 ZFS_EXIT(zfsvfs);
3837                 return (SET_ERROR(EACCES));
3838         }
3839
3840         if (off < 0 || len > MAXOFFSET_T - off) {
3841                 ZFS_EXIT(zfsvfs);
3842                 return (SET_ERROR(ENXIO));
3843         }
3844
3845         ZFS_EXIT(zfsvfs);
3846         return (0);
3847 }
3848
3849 /*
3850  * Free or allocate space in a file.  Currently, this function only
3851  * supports the `F_FREESP' command.  However, this command is somewhat
3852  * misnamed, as its functionality includes the ability to allocate as
3853  * well as free space.
3854  *
3855  *      IN:     zp      - znode of file to free data in.
3856  *              cmd     - action to take (only F_FREESP supported).
3857  *              bfp     - section of file to free/alloc.
3858  *              flag    - current file open mode flags.
3859  *              offset  - current file offset.
3860  *              cr      - credentials of caller.
3861  *
3862  *      RETURN: 0 on success, error code on failure.
3863  *
3864  * Timestamps:
3865  *      zp - ctime|mtime updated
3866  */
3867 int
3868 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3869     offset_t offset, cred_t *cr)
3870 {
3871         (void) offset;
3872         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3873         uint64_t        off, len;
3874         int             error;
3875
3876         ZFS_ENTER(zfsvfs);
3877         ZFS_VERIFY_ZP(zp);
3878
3879         if (cmd != F_FREESP) {
3880                 ZFS_EXIT(zfsvfs);
3881                 return (SET_ERROR(EINVAL));
3882         }
3883
3884         /*
3885          * Callers might not be able to detect properly that we are read-only,
3886          * so check it explicitly here.
3887          */
3888         if (zfs_is_readonly(zfsvfs)) {
3889                 ZFS_EXIT(zfsvfs);
3890                 return (SET_ERROR(EROFS));
3891         }
3892
3893         if (bfp->l_len < 0) {
3894                 ZFS_EXIT(zfsvfs);
3895                 return (SET_ERROR(EINVAL));
3896         }
3897
3898         /*
3899          * Permissions aren't checked on Solaris because on this OS
3900          * zfs_space() can only be called with an opened file handle.
3901          * On Linux we can get here through truncate_range() which
3902          * operates directly on inodes, so we need to check access rights.
3903          */
3904         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
3905                 ZFS_EXIT(zfsvfs);
3906                 return (error);
3907         }
3908
3909         off = bfp->l_start;
3910         len = bfp->l_len; /* 0 means from off to end of file */
3911
3912         error = zfs_freesp(zp, off, len, flag, TRUE);
3913
3914         ZFS_EXIT(zfsvfs);
3915         return (error);
3916 }
3917
3918 int
3919 zfs_fid(struct inode *ip, fid_t *fidp)
3920 {
3921         znode_t         *zp = ITOZ(ip);
3922         zfsvfs_t        *zfsvfs = ITOZSB(ip);
3923         uint32_t        gen;
3924         uint64_t        gen64;
3925         uint64_t        object = zp->z_id;
3926         zfid_short_t    *zfid;
3927         int             size, i, error;
3928
3929         ZFS_ENTER(zfsvfs);
3930
3931         if (fidp->fid_len < SHORT_FID_LEN) {
3932                 fidp->fid_len = SHORT_FID_LEN;
3933                 ZFS_EXIT(zfsvfs);
3934                 return (SET_ERROR(ENOSPC));
3935         }
3936
3937         ZFS_VERIFY_ZP(zp);
3938
3939         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3940             &gen64, sizeof (uint64_t))) != 0) {
3941                 ZFS_EXIT(zfsvfs);
3942                 return (error);
3943         }
3944
3945         gen = (uint32_t)gen64;
3946
3947         size = SHORT_FID_LEN;
3948
3949         zfid = (zfid_short_t *)fidp;
3950
3951         zfid->zf_len = size;
3952
3953         for (i = 0; i < sizeof (zfid->zf_object); i++)
3954                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3955
3956         /* Must have a non-zero generation number to distinguish from .zfs */
3957         if (gen == 0)
3958                 gen = 1;
3959         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3960                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3961
3962         ZFS_EXIT(zfsvfs);
3963         return (0);
3964 }
3965
#if defined(_KERNEL)
/* Open/close and name space operations. */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);

/* Attribute and inode state operations. */
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_fid);

/* Space management, page I/O and mmap support. */
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_map);

/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

#endif /* _KERNEL */