module/os/freebsd/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/param.h>
  33 #include <sys/time.h>
  34 #include <sys/systm.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/resource.h>
  37 #include <security/mac/mac_framework.h>
  38 #include <sys/vfs.h>
  39 #include <sys/endian.h>
  40 #include <sys/vm.h>
  41 #include <sys/vnode.h>
  42 #include <sys/smr.h>
  43 #include <sys/dirent.h>
  44 #include <sys/file.h>
  45 #include <sys/stat.h>
  46 #include <sys/kmem.h>
  47 #include <sys/taskq.h>
  48 #include <sys/uio.h>
  49 #include <sys/atomic.h>
  50 #include <sys/namei.h>
  51 #include <sys/mman.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/kdb.h>
  54 #include <sys/sysproto.h>
  55 #include <sys/errno.h>
  56 #include <sys/unistd.h>
  57 #include <sys/zfs_dir.h>
  58 #include <sys/zfs_ioctl.h>
  59 #include <sys/fs/zfs.h>
  60 #include <sys/dmu.h>
  61 #include <sys/dmu_objset.h>
  62 #include <sys/spa.h>
  63 #include <sys/txg.h>
  64 #include <sys/dbuf.h>
  65 #include <sys/zap.h>
  66 #include <sys/sa.h>
  67 #include <sys/policy.h>
  68 #include <sys/sunddi.h>
  69 #include <sys/filio.h>
  70 #include <sys/sid.h>
  71 #include <sys/zfs_ctldir.h>
  72 #include <sys/zfs_fuid.h>
  73 #include <sys/zfs_quota.h>
  74 #include <sys/zfs_sa.h>
  75 #include <sys/zfs_rlock.h>
  76 #include <sys/bio.h>
  77 #include <sys/buf.h>
  78 #include <sys/sched.h>
  79 #include <sys/acl.h>
  80 #include <sys/vmmeter.h>
  81 #include <vm/vm_param.h>
  82 #include <sys/zil.h>
  83 #include <sys/zfs_vnops.h>
  84 #include <sys/module.h>
  85 #include <sys/sysent.h>
  86 #include <sys/dmu_impl.h>
  87 #include <sys/brt.h>
  88 #include <sys/zfeature.h>
  89
  90 #include <vm/vm_object.h>
  91
  92 #include <sys/extattr.h>
  93 #include <sys/priv.h>
  94
  95 #ifndef VN_OPEN_INVFS
  96 #define VN_OPEN_INVFS   0x0
  97 #endif
  98
  99 VFS_SMR_DECLARE;
 100
 101 #ifdef DEBUG_VFS_LOCKS
 102 #define VNCHECKREF(vp)                            \
 103         VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,       \
 104             ("%s: wrong ref counts", __func__));
 105 #else
 106 #define VNCHECKREF(vp)
 107 #endif
 108
 109 #if __FreeBSD_version >= 1400045
 110 typedef uint64_t cookie_t;
 111 #else
 112 typedef ulong_t cookie_t;
 113 #endif
 114
 115 /*
 116  * Programming rules.
 117  *
 118  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 119  * properly lock its in-core state, create a DMU transaction, do the work,
 120  * record this work in the intent log (ZIL), commit the DMU transaction,
 121  * and wait for the intent log to commit if it is a synchronous operation.
 122  * Moreover, the vnode ops must work in both normal and log replay context.
 123  * The ordering of events is important to avoid deadlocks and references
 124  * to freed memory.  The example below illustrates the following Big Rules:
 125  *
 126  *  (1) A check must be made in each zfs thread for a mounted file system.
 127  *      This is done avoiding races using zfs_enter(zfsvfs).
 128  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
 129  *      must be checked with zfs_verify_zp(zp).  Both of these macros
 130  *      can return EIO from the calling function.
 131  *
 132  *  (2) VN_RELE() should always be the last thing except for zil_commit()
 133  *      (if necessary) and zfs_exit(). This is for 3 reasons:
 134  *      First, if it's the last reference, the vnode/znode
 135  *      can be freed, so the zp may point to freed memory.  Second, the last
 136  *      reference will call zfs_zinactive(), which may induce a lot of work --
 137  *      pushing cached pages (which acquires range locks) and syncing out
 138  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 139  *      which could deadlock the system if you were already holding one.
 140  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 141  *
 142  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 143  *      as they can span dmu_tx_assign() calls.
 144  *
 145  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 146  *      dmu_tx_assign().  This is critical because we don't want to block
 147  *      while holding locks.
 148  *
 149  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 150  *      reduces lock contention and CPU usage when we must wait (note that if
 151  *      throughput is constrained by the storage, nearly every transaction
 152  *      must wait).
 153  *
 154  *      Note, in particular, that if a lock is sometimes acquired before
 155  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 156  *      to use a non-blocking assign can deadlock the system.  The scenario:
 157  *
 158  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 159  *      Thread B is in an already-assigned tx, and blocks for this lock.
 160  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 161  *      forever, because the previous txg can't quiesce until B's tx commits.
 162  *
 163  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 164  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 165  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 166  *      to indicate that this operation has already called dmu_tx_wait().
 167  *      This will ensure that we don't retry forever, waiting a short bit
 168  *      each time.
 169  *
 170  *  (5) If the operation succeeded, generate the intent log entry for it
 171  *      before dropping locks.  This ensures that the ordering of events
 172  *      in the intent log matches the order in which they actually occurred.
 173  *      During ZIL replay the zfs_log_* functions will update the sequence
 174  *      number to indicate the zil transaction has replayed.
 175  *
 176  *  (6) At the end of each vnode op, the DMU tx must always commit,
 177  *      regardless of whether there were any errors.
 178  *
 179  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 180  *      to ensure that synchronous semantics are provided when necessary.
 181  *
 182  * In general, this is how things should be ordered in each vnode op:
 183  *
 184  *      zfs_enter(zfsvfs);              // exit if unmounted
 185  * top:
 186  *      zfs_dirent_lookup(&dl, ...)     // lock directory entry (may VN_HOLD())
 187  *      rw_enter(...);                  // grab any other locks you need
 188  *      tx = dmu_tx_create(...);        // get DMU tx
 189  *      dmu_tx_hold_*();                // hold each object you might modify
 190  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 191  *      if (error) {
 192  *              rw_exit(...);           // drop locks
 193  *              zfs_dirent_unlock(dl);  // unlock directory entry
 194  *              VN_RELE(...);           // release held vnodes
 195  *              if (error == ERESTART) {
 196  *                      waited = B_TRUE;
 197  *                      dmu_tx_wait(tx);
 198  *                      dmu_tx_abort(tx);
 199  *                      goto top;
 200  *              }
 201  *              dmu_tx_abort(tx);       // abort DMU tx
 202  *              zfs_exit(zfsvfs);       // finished in zfs
 203  *              return (error);         // really out of space
 204  *      }
 205  *      error = do_real_work();         // do whatever this VOP does
 206  *      if (error == 0)
 207  *              zfs_log_*(...);         // on success, make ZIL entry
 208  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 209  *      rw_exit(...);                   // drop locks
 210  *      zfs_dirent_unlock(dl);          // unlock directory entry
 211  *      VN_RELE(...);                   // release held vnodes
 212  *      zil_commit(zilog, foid);        // synchronous when necessary
 213  *      zfs_exit(zfsvfs);               // finished in zfs
 214  *      return (error);                 // done, report error
 215  */
 216 static int
 217 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 218 {
 219         (void) cr;
 220         znode_t *zp = VTOZ(*vpp);
 221         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 222         int error;
 223
 224         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 225                 return (error);
 226
 227         if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 228             ((flag & FAPPEND) == 0)) {
 229                 zfs_exit(zfsvfs, FTAG);
 230                 return (SET_ERROR(EPERM));
 231         }
 232
 233         /*
 234          * Keep a count of the synchronous opens in the znode.  On first
 235          * synchronous open we must convert all previous async transactions
 236          * into sync to keep correct ordering.
 237          */
 238         if (flag & O_SYNC) {
 239                 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
 240                         zil_async_to_sync(zfsvfs->z_log, zp->z_id);
 241         }
 242
 243         zfs_exit(zfsvfs, FTAG);
 244         return (0);
 245 }
 246
 247 static int
 248 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 249 {
 250         (void) offset, (void) cr;
 251         znode_t *zp = VTOZ(vp);
 252         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 253         int error;
 254
 255         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 256                 return (error);
 257
 258         /* Decrement the synchronous opens in the znode */
 259         if ((flag & O_SYNC) && (count == 1))
 260                 atomic_dec_32(&zp->z_sync_cnt);
 261
 262         zfs_exit(zfsvfs, FTAG);
 263         return (0);
 264 }
 265
 266 static int
 267 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
 268     int *rvalp)
 269 {
 270         (void) flag, (void) cred, (void) rvalp;
 271         loff_t off;
 272         int error;
 273
 274         switch (com) {
 275         case _FIOFFS:
 276         {
 277                 return (0);
 278
 279                 /*
 280                  * The following two ioctls are used by bfu.  Faking out,
 281                  * necessary to avoid bfu errors.
 282                  */
 283         }
 284         case _FIOGDIO:
 285         case _FIOSDIO:
 286         {
 287                 return (0);
 288         }
 289
 290         case F_SEEK_DATA:
 291         case F_SEEK_HOLE:
 292         {
 293                 off = *(offset_t *)data;
 294                 /* offset parameter is in/out */
 295                 error = zfs_holey(VTOZ(vp), com, &off);
 296                 if (error)
 297                         return (error);
 298                 *(offset_t *)data = off;
 299                 return (0);
 300         }
 301         }
 302         return (SET_ERROR(ENOTTY));
 303 }
 304
 305 static vm_page_t
 306 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 307 {
 308         vm_object_t obj;
 309         vm_page_t pp;
 310         int64_t end;
 311
 312         /*
 313          * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 314          * aligned boundaries, if the range is not aligned.  As a result a
 315          * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 316          * It may happen that all DEV_BSIZE subranges are marked clean and thus
 317          * the whole page would be considered clean despite have some
 318          * dirty data.
 319          * For this reason we should shrink the range to DEV_BSIZE aligned
 320          * boundaries before calling vm_page_clear_dirty.
 321          */
 322         end = rounddown2(off + nbytes, DEV_BSIZE);
 323         off = roundup2(off, DEV_BSIZE);
 324         nbytes = end - off;
 325
 326         obj = vp->v_object;
 327         vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
 328             VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
 329             VM_ALLOC_IGN_SBUSY);
 330         if (pp != NULL) {
 331                 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 332                 vm_object_pip_add(obj, 1);
 333                 pmap_remove_write(pp);
 334                 if (nbytes != 0)
 335                         vm_page_clear_dirty(pp, off, nbytes);
 336         }
 337         return (pp);
 338 }
 339
 340 static void
 341 page_unbusy(vm_page_t pp)
 342 {
 343
 344         vm_page_sunbusy(pp);
 345         vm_object_pip_wakeup(pp->object);
 346 }
 347
 348 static vm_page_t
 349 page_hold(vnode_t *vp, int64_t start)
 350 {
 351         vm_object_t obj;
 352         vm_page_t m;
 353
 354         obj = vp->v_object;
 355         vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
 356             VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 357             VM_ALLOC_NOBUSY);
 358         return (m);
 359 }
 360
 361 static void
 362 page_unhold(vm_page_t pp)
 363 {
 364         vm_page_unwire(pp, PQ_ACTIVE);
 365 }
 366
 367 /*
 368  * When a file is memory mapped, we must keep the IO data synchronized
 369  * between the DMU cache and the memory mapped pages.  What this means:
 370  *
 371  * On Write:    If we find a memory mapped page, we write to *both*
 372  *              the page and the dmu buffer.
 373  */
 374 void
 375 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 376 {
 377         vm_object_t obj;
 378         struct sf_buf *sf;
 379         vnode_t *vp = ZTOV(zp);
 380         caddr_t va;
 381         int off;
 382
 383         ASSERT3P(vp->v_mount, !=, NULL);
 384         obj = vp->v_object;
 385         ASSERT3P(obj, !=, NULL);
 386
 387         off = start & PAGEOFFSET;
 388         vm_object_pip_add(obj, 1);
 389         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 390                 vm_page_t pp;
 391                 int nbytes = imin(PAGESIZE - off, len);
 392
 393                 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 394                         va = zfs_map_page(pp, &sf);
 395                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 396                             va + off, DMU_READ_PREFETCH);
 397                         zfs_unmap_page(sf);
 398                         page_unbusy(pp);
 399                 }
 400                 len -= nbytes;
 401                 off = 0;
 402         }
 403         vm_object_pip_wakeup(obj);
 404 }
 405
 406 /*
 407  * Read with UIO_NOCOPY flag means that sendfile(2) requests
 408  * ZFS to populate a range of page cache pages with data.
 409  *
 410  * NOTE: this function could be optimized to pre-allocate
 411  * all pages in advance, drain exclusive busy on all of them,
 412  * map them into contiguous KVA region and populate them
 413  * in one single dmu_read() call.
 414  */
 415 int
 416 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
 417 {
 418         vnode_t *vp = ZTOV(zp);
 419         objset_t *os = zp->z_zfsvfs->z_os;
 420         struct sf_buf *sf;
 421         vm_object_t obj;
 422         vm_page_t pp;
 423         int64_t start;
 424         caddr_t va;
 425         int len = nbytes;
 426         int error = 0;
 427
 428         ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
 429         ASSERT3P(vp->v_mount, !=, NULL);
 430         obj = vp->v_object;
 431         ASSERT3P(obj, !=, NULL);
 432         ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
 433
 434         for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
 435                 int bytes = MIN(PAGESIZE, len);
 436
 437                 pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
 438                     VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 439                 if (vm_page_none_valid(pp)) {
 440                         va = zfs_map_page(pp, &sf);
 441                         error = dmu_read(os, zp->z_id, start, bytes, va,
 442                             DMU_READ_PREFETCH);
 443                         if (bytes != PAGESIZE && error == 0)
 444                                 memset(va + bytes, 0, PAGESIZE - bytes);
 445                         zfs_unmap_page(sf);
 446                         if (error == 0) {
 447                                 vm_page_valid(pp);
 448                                 vm_page_activate(pp);
 449                                 vm_page_sunbusy(pp);
 450                         } else {
 451                                 zfs_vmobject_wlock(obj);
 452                                 if (!vm_page_wired(pp) && pp->valid == 0 &&
 453                                     vm_page_busy_tryupgrade(pp))
 454                                         vm_page_free(pp);
 455                                 else
 456                                         vm_page_sunbusy(pp);
 457                                 zfs_vmobject_wunlock(obj);
 458                         }
 459                 } else {
 460                         ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 461                         vm_page_sunbusy(pp);
 462                 }
 463                 if (error)
 464                         break;
 465                 zfs_uio_advance(uio, bytes);
 466                 len -= bytes;
 467         }
 468         return (error);
 469 }
 470
 471 /*
 472  * When a file is memory mapped, we must keep the IO data synchronized
 473  * between the DMU cache and the memory mapped pages.  What this means:
 474  *
 475  * On Read:     We "read" preferentially from memory mapped pages,
 476  *              else we default from the dmu buffer.
 477  *
 478  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 479  *       the file is memory mapped.
 480  */
 481 int
 482 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 483 {
 484         vnode_t *vp = ZTOV(zp);
 485         vm_object_t obj;
 486         int64_t start;
 487         int len = nbytes;
 488         int off;
 489         int error = 0;
 490
 491         ASSERT3P(vp->v_mount, !=, NULL);
 492         obj = vp->v_object;
 493         ASSERT3P(obj, !=, NULL);
 494
 495         start = zfs_uio_offset(uio);
 496         off = start & PAGEOFFSET;
 497         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 498                 vm_page_t pp;
 499                 uint64_t bytes = MIN(PAGESIZE - off, len);
 500
 501                 if ((pp = page_hold(vp, start))) {
 502                         struct sf_buf *sf;
 503                         caddr_t va;
 504
 505                         va = zfs_map_page(pp, &sf);
 506                         error = vn_io_fault_uiomove(va + off, bytes,
 507                             GET_UIO_STRUCT(uio));
 508                         zfs_unmap_page(sf);
 509                         page_unhold(pp);
 510                 } else {
 511                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 512                             uio, bytes);
 513                 }
 514                 len -= bytes;
 515                 off = 0;
 516                 if (error)
 517                         break;
 518         }
 519         return (error);
 520 }
 521
 522 int
 523 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 524     loff_t pos, size_t *presid)
 525 {
 526         int error = 0;
 527         ssize_t resid;
 528
 529         error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
 530             UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
 531
 532         if (error) {
 533                 return (SET_ERROR(error));
 534         } else if (presid == NULL) {
 535                 if (resid != 0) {
 536                         error = SET_ERROR(EIO);
 537                 }
 538         } else {
 539                 *presid = resid;
 540         }
 541         return (error);
 542 }
 543
 544 void
 545 zfs_zrele_async(znode_t *zp)
 546 {
 547         vnode_t *vp = ZTOV(zp);
 548         objset_t *os = ITOZSB(vp)->z_os;
 549
 550         VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
 551 }
 552
 553 static int
 554 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 555 {
 556         int error;
 557
 558         *vpp = arg;
 559         error = vn_lock(*vpp, lkflags);
 560         if (error != 0)
 561                 vrele(*vpp);
 562         return (error);
 563 }
 564
 565 static int
 566 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 567 {
 568         znode_t *zdp = VTOZ(dvp);
 569         zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
 570         int error;
 571         int ltype;
 572
 573         if (zfsvfs->z_replay == B_FALSE)
 574                 ASSERT_VOP_LOCKED(dvp, __func__);
 575
 576         if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 577                 ASSERT3P(dvp, ==, vp);
 578                 vref(dvp);
 579                 ltype = lkflags & LK_TYPE_MASK;
 580                 if (ltype != VOP_ISLOCKED(dvp)) {
 581                         if (ltype == LK_EXCLUSIVE)
 582                                 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 583                         else /* if (ltype == LK_SHARED) */
 584                                 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 585
 586                         /*
 587                          * Relock for the "." case could leave us with
 588                          * reclaimed vnode.
 589                          */
 590                         if (VN_IS_DOOMED(dvp)) {
 591                                 vrele(dvp);
 592                                 return (SET_ERROR(ENOENT));
 593                         }
 594                 }
 595                 return (0);
 596         } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 597                 /*
 598                  * Note that in this case, dvp is the child vnode, and we
 599                  * are looking up the parent vnode - exactly reverse from
 600                  * normal operation.  Unlocking dvp requires some rather
 601                  * tricky unlock/relock dance to prevent mp from being freed;
 602                  * use vn_vget_ino_gen() which takes care of all that.
 603                  *
 604                  * XXX Note that there is a time window when both vnodes are
 605                  * unlocked.  It is possible, although highly unlikely, that
 606                  * during that window the parent-child relationship between
 607                  * the vnodes may change, for example, get reversed.
 608                  * In that case we would have a wrong lock order for the vnodes.
 609                  * All other filesystems seem to ignore this problem, so we
 610                  * do the same here.
 611                  * A potential solution could be implemented as follows:
 612                  * - using LK_NOWAIT when locking the second vnode and retrying
 613                  *   if necessary
 614                  * - checking that the parent-child relationship still holds
 615                  *   after locking both vnodes and retrying if it doesn't
 616                  */
 617                 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
 618                 return (error);
 619         } else {
 620                 error = vn_lock(vp, lkflags);
 621                 if (error != 0)
 622                         vrele(vp);
 623                 return (error);
 624         }
 625 }
 626
 627 /*
 628  * Lookup an entry in a directory, or an extended attribute directory.
 629  * If it exists, return a held vnode reference for it.
 630  *
 631  *      IN:     dvp     - vnode of directory to search.
 632  *              nm      - name of entry to lookup.
 633  *              pnp     - full pathname to lookup [UNUSED].
 634  *              flags   - LOOKUP_XATTR set if looking for an attribute.
 635  *              rdir    - root directory vnode [UNUSED].
 636  *              cr      - credentials of caller.
 637  *              ct      - caller context
 638  *
 639  *      OUT:    vpp     - vnode of located entry, NULL if not found.
 640  *
 641  *      RETURN: 0 on success, error code on failure.
 642  *
 643  * Timestamps:
 644  *      NA
 645  */
 646 static int
 647 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
 648     struct componentname *cnp, int nameiop, cred_t *cr, int flags,
 649     boolean_t cached)
 650 {
 651         znode_t *zdp = VTOZ(dvp);
 652         znode_t *zp;
 653         zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 654         seqc_t dvp_seqc;
 655         int     error = 0;
 656
 657         /*
 658          * Fast path lookup, however we must skip DNLC lookup
 659          * for case folding or normalizing lookups because the
 660          * DNLC code only stores the passed in name.  This means
 661          * creating 'a' and removing 'A' on a case insensitive
 662          * file system would work, but DNLC still thinks 'a'
 663          * exists and won't let you create it again on the next
 664          * pass through fast path.
 665          */
 666         if (!(flags & LOOKUP_XATTR)) {
 667                 if (dvp->v_type != VDIR) {
 668                         return (SET_ERROR(ENOTDIR));
 669                 } else if (zdp->z_sa_hdl == NULL) {
 670                         return (SET_ERROR(EIO));
 671                 }
 672         }
 673
 674         DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
 675             const char *, nm);
 676
 677         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 678                 return (error);
 679
 680         dvp_seqc = vn_seqc_read_notmodify(dvp);
 681
 682         *vpp = NULL;
 683
 684         if (flags & LOOKUP_XATTR) {
 685                 /*
 686                  * If the xattr property is off, refuse the lookup request.
 687                  */
 688                 if (!(zfsvfs->z_flags & ZSB_XATTR)) {
 689                         zfs_exit(zfsvfs, FTAG);
 690                         return (SET_ERROR(EOPNOTSUPP));
 691                 }
 692
 693                 /*
 694                  * We don't allow recursive attributes..
 695                  * Maybe someday we will.
 696                  */
 697                 if (zdp->z_pflags & ZFS_XATTR) {
 698                         zfs_exit(zfsvfs, FTAG);
 699                         return (SET_ERROR(EINVAL));
 700                 }
 701
 702                 if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
 703                         zfs_exit(zfsvfs, FTAG);
 704                         return (error);
 705                 }
 706                 *vpp = ZTOV(zp);
 707
 708                 /*
 709                  * Do we have permission to get into attribute directory?
 710                  */
 711                 error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
 712                 if (error) {
 713                         vrele(ZTOV(zp));
 714                 }
 715
 716                 zfs_exit(zfsvfs, FTAG);
 717                 return (error);
 718         }
 719
 720         /*
 721          * Check accessibility of directory if we're not coming in via
 722          * VOP_CACHEDLOOKUP.
 723          */
 724         if (!cached) {
 725 #ifdef NOEXECCHECK
 726                 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 727                         cnp->cn_flags &= ~NOEXECCHECK;
 728                 } else
 729 #endif
 730                 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 731                     NULL))) {
 732                         zfs_exit(zfsvfs, FTAG);
 733                         return (error);
 734                 }
 735         }
 736
 737         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 738             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 739                 zfs_exit(zfsvfs, FTAG);
 740                 return (SET_ERROR(EILSEQ));
 741         }
 742
 743
 744         /*
 745          * First handle the special cases.
 746          */
 747         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 748                 /*
 749                  * If we are a snapshot mounted under .zfs, return
 750                  * the vp for the snapshot directory.
 751                  */
 752                 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 753                         struct componentname cn;
 754                         vnode_t *zfsctl_vp;
 755                         int ltype;
 756
 757                         zfs_exit(zfsvfs, FTAG);
 758                         ltype = VOP_ISLOCKED(dvp);
 759                         VOP_UNLOCK(dvp);
 760                         error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 761                             &zfsctl_vp);
 762                         if (error == 0) {
 763                                 cn.cn_nameptr = "snapshot";
 764                                 cn.cn_namelen = strlen(cn.cn_nameptr);
 765                                 cn.cn_nameiop = cnp->cn_nameiop;
 766                                 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 767                                 cn.cn_lkflags = cnp->cn_lkflags;
 768                                 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 769                                 vput(zfsctl_vp);
 770                         }
 771                         vn_lock(dvp, ltype | LK_RETRY);
 772                         return (error);
 773                 }
 774         }
 775         if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 776                 zfs_exit(zfsvfs, FTAG);
 777                 if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED)
 778                         return (SET_ERROR(ENOENT));
 779                 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 780                         return (SET_ERROR(ENOTSUP));
 781                 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 782                 return (error);
 783         }
 784
 785         /*
	 * This loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
 788          */
 789         for (;;) {
 790                 uint64_t parent;
 791
 792                 error = zfs_dirlook(zdp, nm, &zp);
 793                 if (error == 0)
 794                         *vpp = ZTOV(zp);
 795
 796                 zfs_exit(zfsvfs, FTAG);
 797                 if (error != 0)
 798                         break;
 799
 800                 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 801                 if (error != 0) {
 802                         /*
 803                          * If we've got a locking error, then the vnode
 804                          * got reclaimed because of a force unmount.
 805                          * We never enter doomed vnodes into the name cache.
 806                          */
 807                         *vpp = NULL;
 808                         return (error);
 809                 }
 810
 811                 if ((cnp->cn_flags & ISDOTDOT) == 0)
 812                         break;
 813
 814                 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
 815                         vput(ZTOV(zp));
 816                         *vpp = NULL;
 817                         return (error);
 818                 }
 819                 if (zdp->z_sa_hdl == NULL) {
 820                         error = SET_ERROR(EIO);
 821                 } else {
 822                         error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 823                             &parent, sizeof (parent));
 824                 }
 825                 if (error != 0) {
 826                         zfs_exit(zfsvfs, FTAG);
 827                         vput(ZTOV(zp));
 828                         break;
 829                 }
 830                 if (zp->z_id == parent) {
 831                         zfs_exit(zfsvfs, FTAG);
 832                         break;
 833                 }
 834                 vput(ZTOV(zp));
 835         }
 836
 837         if (error != 0)
 838                 *vpp = NULL;
 839
 840         /* Translate errors and add SAVENAME when needed. */
 841         if (cnp->cn_flags & ISLASTCN) {
 842                 switch (nameiop) {
 843                 case CREATE:
 844                 case RENAME:
 845                         if (error == ENOENT) {
 846                                 error = EJUSTRETURN;
 847 #if __FreeBSD_version < 1400068
 848                                 cnp->cn_flags |= SAVENAME;
 849 #endif
 850                                 break;
 851                         }
 852                         zfs_fallthrough;
 853                 case DELETE:
 854 #if __FreeBSD_version < 1400068
 855                         if (error == 0)
 856                                 cnp->cn_flags |= SAVENAME;
 857 #endif
 858                         break;
 859                 }
 860         }
 861
 862         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 863                 /*
 864                  * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
 865                  * handle races. In particular different callers may end up
 866                  * with different vnodes and will try to add conflicting
 867                  * entries to the namecache.
 868                  *
 869                  * While finding different result may be acceptable in face
 870                  * of concurrent modification, adding conflicting entries
 871                  * trips over an assert in the namecache.
 872                  *
 873                  * Ultimately let an entry through once everything settles.
 874                  */
 875                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 876                         cnp->cn_flags &= ~MAKEENTRY;
 877                 }
 878         }
 879
 880         /* Insert name into cache (as non-existent) if appropriate. */
 881         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 882             error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 883                 cache_enter(dvp, NULL, cnp);
 884
 885         /* Insert name into cache if appropriate. */
 886         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 887             error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 888                 if (!(cnp->cn_flags & ISLASTCN) ||
 889                     (nameiop != DELETE && nameiop != RENAME)) {
 890                         cache_enter(dvp, *vpp, cnp);
 891                 }
 892         }
 893
 894         return (error);
 895 }
 896
 897 static inline bool
 898 is_nametoolong(zfsvfs_t *zfsvfs, const char *name)
 899 {
 900         size_t dlen = strlen(name);
 901         return ((!zfsvfs->z_longname && dlen >= ZAP_MAXNAMELEN) ||
 902             dlen >= ZAP_MAXNAMELEN_NEW);
 903 }
 904
 905 /*
 906  * Attempt to create a new entry in a directory.  If the entry
 907  * already exists, truncate the file if permissible, else return
 908  * an error.  Return the vp of the created or trunc'd file.
 909  *
 910  *      IN:     dvp     - vnode of directory to put new file entry in.
 911  *              name    - name of new file entry.
 912  *              vap     - attributes of new file.
 913  *              excl    - flag indicating exclusive or non-exclusive mode.
 914  *              mode    - mode to open file with.
 915  *              cr      - credentials of caller.
 916  *              flag    - large file flag [UNUSED].
 917  *              ct      - caller context
 918  *              vsecp   - ACL to be set
 919  *              mnt_ns  - Unused on FreeBSD
 920  *
 921  *      OUT:    vpp     - vnode of created or trunc'd entry.
 922  *
 923  *      RETURN: 0 on success, error code on failure.
 924  *
 925  * Timestamps:
 926  *      dvp - ctime|mtime updated if new entry created
 927  *       vp - ctime|mtime always, atime if new
 928  */
 929 int
 930 zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 931     znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 932 {
 933         (void) excl, (void) mode, (void) flag;
 934         znode_t         *zp;
 935         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
 936         zilog_t         *zilog;
 937         objset_t        *os;
 938         dmu_tx_t        *tx;
 939         int             error;
 940         uid_t           uid = crgetuid(cr);
 941         gid_t           gid = crgetgid(cr);
 942         uint64_t        projid = ZFS_DEFAULT_PROJID;
 943         zfs_acl_ids_t   acl_ids;
 944         boolean_t       fuid_dirtied;
 945         uint64_t        txtype;
 946 #ifdef DEBUG_VFS_LOCKS
 947         vnode_t *dvp = ZTOV(dzp);
 948 #endif
 949
 950         if (is_nametoolong(zfsvfs, name))
 951                 return (SET_ERROR(ENAMETOOLONG));
 952
 953         /*
 954          * If we have an ephemeral id, ACL, or XVATTR then
 955          * make sure file system is at proper version
 956          */
 957         if (zfsvfs->z_use_fuids == B_FALSE &&
 958             (vsecp || (vap->va_mask & AT_XVATTR) ||
 959             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 960                 return (SET_ERROR(EINVAL));
 961
 962         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 963                 return (error);
 964         os = zfsvfs->z_os;
 965         zilog = zfsvfs->z_log;
 966
 967         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 968             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 969                 zfs_exit(zfsvfs, FTAG);
 970                 return (SET_ERROR(EILSEQ));
 971         }
 972
 973         if (vap->va_mask & AT_XVATTR) {
 974                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
 975                     crgetuid(cr), cr, vap->va_type)) != 0) {
 976                         zfs_exit(zfsvfs, FTAG);
 977                         return (error);
 978                 }
 979         }
 980
 981         *zpp = NULL;
 982
 983         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 984                 vap->va_mode &= ~S_ISVTX;
 985
 986         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 987         if (error) {
 988                 zfs_exit(zfsvfs, FTAG);
 989                 return (error);
 990         }
 991         ASSERT3P(zp, ==, NULL);
 992
 993         /*
 994          * Create a new file object and update the directory
 995          * to reference it.
 996          */
 997         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 998                 goto out;
 999         }
1000
1001         /*
1002          * We only support the creation of regular files in
1003          * extended attribute directories.
1004          */
1005
1006         if ((dzp->z_pflags & ZFS_XATTR) &&
1007             (vap->va_type != VREG)) {
1008                 error = SET_ERROR(EINVAL);
1009                 goto out;
1010         }
1011
1012         if ((error = zfs_acl_ids_create(dzp, 0, vap,
1013             cr, vsecp, &acl_ids, NULL)) != 0)
1014                 goto out;
1015
1016         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1017                 projid = zfs_inherit_projid(dzp);
1018         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1019                 zfs_acl_ids_free(&acl_ids);
1020                 error = SET_ERROR(EDQUOT);
1021                 goto out;
1022         }
1023
1024         getnewvnode_reserve();
1025
1026         tx = dmu_tx_create(os);
1027
1028         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1029             ZFS_SA_BASE_ATTR_SIZE);
1030
1031         fuid_dirtied = zfsvfs->z_fuid_dirty;
1032         if (fuid_dirtied)
1033                 zfs_fuid_txhold(zfsvfs, tx);
1034         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1035         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1036         if (!zfsvfs->z_use_sa &&
1037             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1038                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1039                     0, acl_ids.z_aclp->z_acl_bytes);
1040         }
1041         error = dmu_tx_assign(tx, TXG_WAIT);
1042         if (error) {
1043                 zfs_acl_ids_free(&acl_ids);
1044                 dmu_tx_abort(tx);
1045                 getnewvnode_drop_reserve();
1046                 zfs_exit(zfsvfs, FTAG);
1047                 return (error);
1048         }
1049         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1050
1051         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
1052         if (error != 0) {
1053                 /*
1054                  * Since, we failed to add the directory entry for it,
1055                  * delete the newly created dnode.
1056                  */
1057                 zfs_znode_delete(zp, tx);
1058                 VOP_UNLOCK(ZTOV(zp));
1059                 zrele(zp);
1060                 zfs_acl_ids_free(&acl_ids);
1061                 dmu_tx_commit(tx);
1062                 getnewvnode_drop_reserve();
1063                 goto out;
1064         }
1065
1066         if (fuid_dirtied)
1067                 zfs_fuid_sync(zfsvfs, tx);
1068
1069         txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1070         zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1071             vsecp, acl_ids.z_fuidp, vap);
1072         zfs_acl_ids_free(&acl_ids);
1073         dmu_tx_commit(tx);
1074
1075         getnewvnode_drop_reserve();
1076
1077 out:
1078         VNCHECKREF(dvp);
1079         if (error == 0) {
1080                 *zpp = zp;
1081         }
1082
1083         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1084                 zil_commit(zilog, 0);
1085
1086         zfs_exit(zfsvfs, FTAG);
1087         return (error);
1088 }
1089
1090 /*
1091  * Remove an entry from a directory.
1092  *
1093  *      IN:     dvp     - vnode of directory to remove entry from.
1094  *              name    - name of entry to remove.
1095  *              cr      - credentials of caller.
1096  *              ct      - caller context
1097  *              flags   - case flags
1098  *
1099  *      RETURN: 0 on success, error code on failure.
1100  *
1101  * Timestamps:
1102  *      dvp - ctime|mtime
1103  *       vp - ctime (if nlink > 0)
1104  */
1105 static int
1106 zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1107 {
1108         znode_t         *dzp = VTOZ(dvp);
1109         znode_t         *zp;
1110         znode_t         *xzp;
1111         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1112         zilog_t         *zilog;
1113         uint64_t        xattr_obj;
1114         uint64_t        obj = 0;
1115         dmu_tx_t        *tx;
1116         boolean_t       unlinked;
1117         uint64_t        txtype;
1118         int             error;
1119
1120
1121         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1122                 return (error);
1123         zp = VTOZ(vp);
1124         if ((error = zfs_verify_zp(zp)) != 0) {
1125                 zfs_exit(zfsvfs, FTAG);
1126                 return (error);
1127         }
1128         zilog = zfsvfs->z_log;
1129
1130         xattr_obj = 0;
1131         xzp = NULL;
1132
1133         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1134                 goto out;
1135         }
1136
1137         /*
1138          * Need to use rmdir for removing directories.
1139          */
1140         if (vp->v_type == VDIR) {
1141                 error = SET_ERROR(EPERM);
1142                 goto out;
1143         }
1144
1145         vnevent_remove(vp, dvp, name, ct);
1146
1147         obj = zp->z_id;
1148
1149         /* are there any extended attributes? */
1150         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1151             &xattr_obj, sizeof (xattr_obj));
1152         if (error == 0 && xattr_obj) {
1153                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1154                 ASSERT0(error);
1155         }
1156
1157         /*
1158          * We may delete the znode now, or we may put it in the unlinked set;
1159          * it depends on whether we're the last link, and on whether there are
1160          * other holds on the vnode.  So we dmu_tx_hold() the right things to
1161          * allow for either case.
1162          */
1163         tx = dmu_tx_create(zfsvfs->z_os);
1164         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1165         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1166         zfs_sa_upgrade_txholds(tx, zp);
1167         zfs_sa_upgrade_txholds(tx, dzp);
1168
1169         if (xzp) {
1170                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1171                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1172         }
1173
1174         /* charge as an update -- would be nice not to charge at all */
1175         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1176
1177         /*
1178          * Mark this transaction as typically resulting in a net free of space
1179          */
1180         dmu_tx_mark_netfree(tx);
1181
1182         error = dmu_tx_assign(tx, TXG_WAIT);
1183         if (error) {
1184                 dmu_tx_abort(tx);
1185                 zfs_exit(zfsvfs, FTAG);
1186                 return (error);
1187         }
1188
1189         /*
1190          * Remove the directory entry.
1191          */
1192         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1193
1194         if (error) {
1195                 dmu_tx_commit(tx);
1196                 goto out;
1197         }
1198
1199         if (unlinked) {
1200                 zfs_unlinked_add(zp, tx);
1201                 vp->v_vflag |= VV_NOSYNC;
1202         }
1203         /* XXX check changes to linux vnops */
1204         txtype = TX_REMOVE;
1205         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1206
1207         dmu_tx_commit(tx);
1208 out:
1209
1210         if (xzp)
1211                 vrele(ZTOV(xzp));
1212
1213         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1214                 zil_commit(zilog, 0);
1215
1216
1217         zfs_exit(zfsvfs, FTAG);
1218         return (error);
1219 }
1220
1221
1222 static int
1223 zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
1224     struct componentname *cnp, int nameiop)
1225 {
1226         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1227         int error;
1228
1229         cnp->cn_nameptr = __DECONST(char *, name);
1230         cnp->cn_namelen = strlen(name);
1231         cnp->cn_nameiop = nameiop;
1232         cnp->cn_flags = ISLASTCN;
1233 #if __FreeBSD_version < 1400068
1234         cnp->cn_flags |= SAVENAME;
1235 #endif
1236         cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
1237         cnp->cn_cred = kcred;
1238 #if __FreeBSD_version < 1400037
1239         cnp->cn_thread = curthread;
1240 #endif
1241
1242         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
1243                 struct vop_lookup_args a;
1244
1245                 a.a_gen.a_desc = &vop_lookup_desc;
1246                 a.a_dvp = ZTOV(dzp);
1247                 a.a_vpp = vpp;
1248                 a.a_cnp = cnp;
1249                 error = vfs_cache_lookup(&a);
1250         } else {
1251                 error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
1252                     B_FALSE);
1253         }
1254 #ifdef ZFS_DEBUG
1255         if (error) {
1256                 printf("got error %d on name %s on op %d\n", error, name,
1257                     nameiop);
1258                 kdb_backtrace();
1259         }
1260 #endif
1261         return (error);
1262 }
1263
1264 int
1265 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
1266 {
1267         vnode_t *vp;
1268         int error;
1269         struct componentname cn;
1270
1271         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1272                 return (error);
1273
1274         error = zfs_remove_(ZTOV(dzp), vp, name, cr);
1275         vput(vp);
1276         return (error);
1277 }
1278 /*
1279  * Create a new directory and insert it into dvp using the name
1280  * provided.  Return a pointer to the inserted directory.
1281  *
1282  *      IN:     dvp     - vnode of directory to add subdir to.
1283  *              dirname - name of new directory.
1284  *              vap     - attributes of new directory.
1285  *              cr      - credentials of caller.
1286  *              ct      - caller context
1287  *              flags   - case flags
1288  *              vsecp   - ACL to be set
1289  *              mnt_ns  - Unused on FreeBSD
1290  *
1291  *      OUT:    vpp     - vnode of created directory.
1292  *
1293  *      RETURN: 0 on success, error code on failure.
1294  *
1295  * Timestamps:
1296  *      dvp - ctime|mtime updated
1297  *       vp - ctime|mtime|atime updated
1298  */
1299 int
1300 zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
1301     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1302 {
1303         (void) flags, (void) vsecp;
1304         znode_t         *zp;
1305         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1306         zilog_t         *zilog;
1307         uint64_t        txtype;
1308         dmu_tx_t        *tx;
1309         int             error;
1310         uid_t           uid = crgetuid(cr);
1311         gid_t           gid = crgetgid(cr);
1312         zfs_acl_ids_t   acl_ids;
1313         boolean_t       fuid_dirtied;
1314
1315         ASSERT3U(vap->va_type, ==, VDIR);
1316
1317         if (is_nametoolong(zfsvfs, dirname))
1318                 return (SET_ERROR(ENAMETOOLONG));
1319
1320         /*
1321          * If we have an ephemeral id, ACL, or XVATTR then
1322          * make sure file system is at proper version
1323          */
1324         if (zfsvfs->z_use_fuids == B_FALSE &&
1325             ((vap->va_mask & AT_XVATTR) ||
1326             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1327                 return (SET_ERROR(EINVAL));
1328
1329         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1330                 return (error);
1331         zilog = zfsvfs->z_log;
1332
1333         if (dzp->z_pflags & ZFS_XATTR) {
1334                 zfs_exit(zfsvfs, FTAG);
1335                 return (SET_ERROR(EINVAL));
1336         }
1337
1338         if (zfsvfs->z_utf8 && u8_validate(dirname,
1339             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1340                 zfs_exit(zfsvfs, FTAG);
1341                 return (SET_ERROR(EILSEQ));
1342         }
1343
1344         if (vap->va_mask & AT_XVATTR) {
1345                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
1346                     crgetuid(cr), cr, vap->va_type)) != 0) {
1347                         zfs_exit(zfsvfs, FTAG);
1348                         return (error);
1349                 }
1350         }
1351
1352         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1353             NULL, &acl_ids, NULL)) != 0) {
1354                 zfs_exit(zfsvfs, FTAG);
1355                 return (error);
1356         }
1357
1358         /*
1359          * First make sure the new directory doesn't exist.
1360          *
1361          * Existence is checked first to make sure we don't return
1362          * EACCES instead of EEXIST which can cause some applications
1363          * to fail.
1364          */
1365         *zpp = NULL;
1366
1367         if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
1368                 zfs_acl_ids_free(&acl_ids);
1369                 zfs_exit(zfsvfs, FTAG);
1370                 return (error);
1371         }
1372         ASSERT3P(zp, ==, NULL);
1373
1374         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1375             mnt_ns))) {
1376                 zfs_acl_ids_free(&acl_ids);
1377                 zfs_exit(zfsvfs, FTAG);
1378                 return (error);
1379         }
1380
1381         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1382                 zfs_acl_ids_free(&acl_ids);
1383                 zfs_exit(zfsvfs, FTAG);
1384                 return (SET_ERROR(EDQUOT));
1385         }
1386
1387         /*
1388          * Add a new entry to the directory.
1389          */
1390         getnewvnode_reserve();
1391         tx = dmu_tx_create(zfsvfs->z_os);
1392         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1393         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1394         fuid_dirtied = zfsvfs->z_fuid_dirty;
1395         if (fuid_dirtied)
1396                 zfs_fuid_txhold(zfsvfs, tx);
1397         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1398                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1399                     acl_ids.z_aclp->z_acl_bytes);
1400         }
1401
1402         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1403             ZFS_SA_BASE_ATTR_SIZE);
1404
1405         error = dmu_tx_assign(tx, TXG_WAIT);
1406         if (error) {
1407                 zfs_acl_ids_free(&acl_ids);
1408                 dmu_tx_abort(tx);
1409                 getnewvnode_drop_reserve();
1410                 zfs_exit(zfsvfs, FTAG);
1411                 return (error);
1412         }
1413
1414         /*
1415          * Create new node.
1416          */
1417         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1418
1419         /*
1420          * Now put new name in parent dir.
1421          */
1422         error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
1423         if (error != 0) {
1424                 zfs_znode_delete(zp, tx);
1425                 VOP_UNLOCK(ZTOV(zp));
1426                 zrele(zp);
1427                 goto out;
1428         }
1429
1430         if (fuid_dirtied)
1431                 zfs_fuid_sync(zfsvfs, tx);
1432
1433         *zpp = zp;
1434
1435         txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
1436         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
1437             acl_ids.z_fuidp, vap);
1438
1439 out:
1440         zfs_acl_ids_free(&acl_ids);
1441
1442         dmu_tx_commit(tx);
1443
1444         getnewvnode_drop_reserve();
1445
1446         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1447                 zil_commit(zilog, 0);
1448
1449         zfs_exit(zfsvfs, FTAG);
1450         return (error);
1451 }
1452
1453 /*
1454  * Remove a directory subdir entry.  If the current working
1455  * directory is the same as the subdir to be removed, the
1456  * remove will fail.
1457  *
1458  *      IN:     dvp     - vnode of directory to remove from.
1459  *              name    - name of directory to be removed.
1460  *              cwd     - vnode of current working directory.
1461  *              cr      - credentials of caller.
1462  *              ct      - caller context
1463  *              flags   - case flags
1464  *
1465  *      RETURN: 0 on success, error code on failure.
1466  *
1467  * Timestamps:
1468  *      dvp - ctime|mtime updated
1469  */
1470 static int
1471 zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1472 {
1473         znode_t         *dzp = VTOZ(dvp);
1474         znode_t         *zp = VTOZ(vp);
1475         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1476         zilog_t         *zilog;
1477         dmu_tx_t        *tx;
1478         int             error;
1479
1480         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1481                 return (error);
1482         if ((error = zfs_verify_zp(zp)) != 0) {
1483                 zfs_exit(zfsvfs, FTAG);
1484                 return (error);
1485         }
1486         zilog = zfsvfs->z_log;
1487
1488
1489         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1490                 goto out;
1491         }
1492
1493         if (vp->v_type != VDIR) {
1494                 error = SET_ERROR(ENOTDIR);
1495                 goto out;
1496         }
1497
1498         vnevent_rmdir(vp, dvp, name, ct);
1499
1500         tx = dmu_tx_create(zfsvfs->z_os);
1501         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1502         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1503         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1504         zfs_sa_upgrade_txholds(tx, zp);
1505         zfs_sa_upgrade_txholds(tx, dzp);
1506         dmu_tx_mark_netfree(tx);
1507         error = dmu_tx_assign(tx, TXG_WAIT);
1508         if (error) {
1509                 dmu_tx_abort(tx);
1510                 zfs_exit(zfsvfs, FTAG);
1511                 return (error);
1512         }
1513
1514         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
1515
1516         if (error == 0) {
1517                 uint64_t txtype = TX_RMDIR;
1518                 zfs_log_remove(zilog, tx, txtype, dzp, name,
1519                     ZFS_NO_OBJECT, B_FALSE);
1520         }
1521
1522         dmu_tx_commit(tx);
1523
1524         if (zfsvfs->z_use_namecache)
1525                 cache_vop_rmdir(dvp, vp);
1526 out:
1527         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1528                 zil_commit(zilog, 0);
1529
1530         zfs_exit(zfsvfs, FTAG);
1531         return (error);
1532 }
1533
1534 int
1535 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
1536 {
1537         struct componentname cn;
1538         vnode_t *vp;
1539         int error;
1540
1541         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1542                 return (error);
1543
1544         error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
1545         vput(vp);
1546         return (error);
1547 }
1548
1549 /*
1550  * Read as many directory entries as will fit into the provided
1551  * buffer from the given directory cursor position (specified in
1552  * the uio structure).
1553  *
1554  *      IN:     vp      - vnode of directory to read.
1555  *              uio     - structure supplying read location, range info,
1556  *                        and return buffer.
1557  *              cr      - credentials of caller.
1558  *              ct      - caller context
1559  *
1560  *      OUT:    uio     - updated offset and range, buffer filled.
1561  *              eofp    - set to true if end-of-file detected.
1562  *              ncookies- number of entries in cookies
1563  *              cookies - offsets to directory entries
1564  *
1565  *      RETURN: 0 on success, error code on failure.
1566  *
1567  * Timestamps:
1568  *      vp - atime updated
1569  *
1570  * Note that the low 4 bits of the cookie returned by zap is always zero.
1571  * This allows us to use the low range for "special" directory entries:
1572  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1573  * we use the offset 2 for the '.zfs' directory.
1574  */
1575 static int
1576 zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
1577     int *ncookies, cookie_t **cookies)
1578 {
1579         znode_t         *zp = VTOZ(vp);
1580         iovec_t         *iovp;
1581         dirent64_t      *odp;
1582         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
1583         objset_t        *os;
1584         caddr_t         outbuf;
1585         size_t          bufsize;
1586         zap_cursor_t    zc;
1587         zap_attribute_t *zap;
1588         uint_t          bytes_wanted;
1589         uint64_t        offset; /* must be unsigned; checks for < 1 */
1590         uint64_t        parent;
1591         int             local_eof;
1592         int             outcount;
1593         int             error;
1594         uint8_t         prefetch;
1595         uint8_t         type;
1596         int             ncooks;
1597         cookie_t        *cooks = NULL;
1598
1599         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1600                 return (error);
1601
1602         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1603             &parent, sizeof (parent))) != 0) {
1604                 zfs_exit(zfsvfs, FTAG);
1605                 return (error);
1606         }
1607
1608         /*
1609          * If we are not given an eof variable,
1610          * use a local one.
1611          */
1612         if (eofp == NULL)
1613                 eofp = &local_eof;
1614
1615         /*
1616          * Check for valid iov_len.
1617          */
1618         if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
1619                 zfs_exit(zfsvfs, FTAG);
1620                 return (SET_ERROR(EINVAL));
1621         }
1622
1623         /*
1624          * Quit if directory has been removed (posix)
1625          */
1626         if ((*eofp = zp->z_unlinked) != 0) {
1627                 zfs_exit(zfsvfs, FTAG);
1628                 return (0);
1629         }
1630
1631         error = 0;
1632         os = zfsvfs->z_os;
1633         offset = zfs_uio_offset(uio);
1634         prefetch = zp->z_zn_prefetch;
1635         zap = zap_attribute_long_alloc();
1636
1637         /*
1638          * Initialize the iterator cursor.
1639          */
1640         if (offset <= 3) {
1641                 /*
1642                  * Start iteration from the beginning of the directory.
1643                  */
1644                 zap_cursor_init(&zc, os, zp->z_id);
1645         } else {
1646                 /*
1647                  * The offset is a serialized cursor.
1648                  */
1649                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1650         }
1651
1652         /*
1653          * Get space to change directory entries into fs independent format.
1654          */
1655         iovp = GET_UIO_STRUCT(uio)->uio_iov;
1656         bytes_wanted = iovp->iov_len;
1657         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
1658                 bufsize = bytes_wanted;
1659                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
1660                 odp = (struct dirent64 *)outbuf;
1661         } else {
1662                 bufsize = bytes_wanted;
1663                 outbuf = NULL;
1664                 odp = (struct dirent64 *)iovp->iov_base;
1665         }
1666
1667         if (ncookies != NULL) {
1668                 /*
1669                  * Minimum entry size is dirent size and 1 byte for a file name.
1670                  */
1671                 ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
1672                     sizeof (((struct dirent *)NULL)->d_name) + 1);
1673                 cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
1674                 *cookies = cooks;
1675                 *ncookies = ncooks;
1676         }
1677
1678         /*
1679          * Transform to file-system independent format
1680          */
1681         outcount = 0;
1682         while (outcount < bytes_wanted) {
1683                 ino64_t objnum;
1684                 ushort_t reclen;
1685                 off64_t *next = NULL;
1686
1687                 /*
1688                  * Special case `.', `..', and `.zfs'.
1689                  */
1690                 if (offset == 0) {
1691                         (void) strcpy(zap->za_name, ".");
1692                         zap->za_normalization_conflict = 0;
1693                         objnum = zp->z_id;
1694                         type = DT_DIR;
1695                 } else if (offset == 1) {
1696                         (void) strcpy(zap->za_name, "..");
1697                         zap->za_normalization_conflict = 0;
1698                         objnum = parent;
1699                         type = DT_DIR;
1700                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1701                         (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1702                         zap->za_normalization_conflict = 0;
1703                         objnum = ZFSCTL_INO_ROOT;
1704                         type = DT_DIR;
1705                 } else {
1706                         /*
1707                          * Grab next entry.
1708                          */
1709                         if ((error = zap_cursor_retrieve(&zc, zap))) {
1710                                 if ((*eofp = (error == ENOENT)) != 0)
1711                                         break;
1712                                 else
1713                                         goto update;
1714                         }
1715
1716                         if (zap->za_integer_length != 8 ||
1717                             zap->za_num_integers != 1) {
1718                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1719                                     "entry, obj = %lld, offset = %lld\n",
1720                                     (u_longlong_t)zp->z_id,
1721                                     (u_longlong_t)offset);
1722                                 error = SET_ERROR(ENXIO);
1723                                 goto update;
1724                         }
1725
1726                         objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1727                         /*
1728                          * MacOS X can extract the object type here such as:
1729                          * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1730                          */
1731                         type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1732                 }
1733
1734                 reclen = DIRENT64_RECLEN(strlen(zap->za_name));
1735
1736                 /*
1737                  * Will this entry fit in the buffer?
1738                  */
1739                 if (outcount + reclen > bufsize) {
1740                         /*
1741                          * Did we manage to fit anything in the buffer?
1742                          */
1743                         if (!outcount) {
1744                                 error = SET_ERROR(EINVAL);
1745                                 goto update;
1746                         }
1747                         break;
1748                 }
1749                 /*
1750                  * Add normal entry:
1751                  */
1752                 odp->d_ino = objnum;
1753                 odp->d_reclen = reclen;
1754                 odp->d_namlen = strlen(zap->za_name);
1755                 /* NOTE: d_off is the offset for the *next* entry. */
1756                 next = &odp->d_off;
1757                 strlcpy(odp->d_name, zap->za_name, odp->d_namlen + 1);
1758                 odp->d_type = type;
1759                 dirent_terminate(odp);
1760                 odp = (dirent64_t *)((intptr_t)odp + reclen);
1761
1762                 outcount += reclen;
1763
1764                 ASSERT3S(outcount, <=, bufsize);
1765
1766                 if (prefetch)
1767                         dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1768
1769                 /*
1770                  * Move to the next entry, fill in the previous offset.
1771                  */
1772                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1773                         zap_cursor_advance(&zc);
1774                         offset = zap_cursor_serialize(&zc);
1775                 } else {
1776                         offset += 1;
1777                 }
1778
1779                 /* Fill the offset right after advancing the cursor. */
1780                 if (next != NULL)
1781                         *next = offset;
1782                 if (cooks != NULL) {
1783                         *cooks++ = offset;
1784                         ncooks--;
1785                         KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1786                 }
1787         }
1788         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1789
1790         /* Subtract unused cookies */
1791         if (ncookies != NULL)
1792                 *ncookies -= ncooks;
1793
1794         if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
1795                 iovp->iov_base += outcount;
1796                 iovp->iov_len -= outcount;
1797                 zfs_uio_resid(uio) -= outcount;
1798         } else if ((error =
1799             zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
1800                 /*
1801                  * Reset the pointer.
1802                  */
1803                 offset = zfs_uio_offset(uio);
1804         }
1805
1806 update:
1807         zap_cursor_fini(&zc);
1808         zap_attribute_free(zap);
1809         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
1810                 kmem_free(outbuf, bufsize);
1811
1812         if (error == ENOENT)
1813                 error = 0;
1814
1815         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1816
1817         zfs_uio_setoffset(uio, offset);
1818         zfs_exit(zfsvfs, FTAG);
1819         if (error != 0 && cookies != NULL) {
1820                 free(*cookies, M_TEMP);
1821                 *cookies = NULL;
1822                 *ncookies = 0;
1823         }
1824         return (error);
1825 }
1826
1827 /*
1828  * Get the requested file attributes and place them in the provided
1829  * vattr structure.
1830  *
1831  *      IN:     vp      - vnode of file.
1832  *              vap     - va_mask identifies requested attributes.
1833  *                        If AT_XVATTR set, then optional attrs are requested
1834  *              flags   - ATTR_NOACLCHECK (CIFS server context)
1835  *              cr      - credentials of caller.
1836  *
1837  *      OUT:    vap     - attribute values.
1838  *
1839  *      RETURN: 0 on success, error code on failure.
1840  */
1841 static int
1842 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1843 {
1844         znode_t *zp = VTOZ(vp);
1845         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1846         int     error = 0;
1847         uint32_t blksize;
1848         u_longlong_t nblocks;
1849         uint64_t mtime[2], ctime[2], crtime[2], rdev;
1850         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1851         xoptattr_t *xoap = NULL;
1852         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1853         sa_bulk_attr_t bulk[4]; /* at most: mtime, ctime, crtime, rdev */
1854         int count = 0;
1855
             /*
              * A failure here returns without calling zfs_exit(); every
              * return path after this point calls zfs_exit() first.
              */
1856         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1857                 return (error);
1858
             /*
              * Fill in va_uid/va_gid up front: va_uid is needed by the owner
              * test that guards the ACE_READ_ATTRIBUTES check below.
              */
1859         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
1860
             /*
              * Queue the three timestamps (16 bytes each, received into
              * uint64_t[2] arrays) and, for block/character devices only,
              * the 8-byte rdev, then fetch them with one sa_bulk_lookup().
              * rdev is left uninitialized for other vnode types and is only
              * read under the same v_type test further down.
              */
1861         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1862         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1863         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
1864         if (vp->v_type == VBLK || vp->v_type == VCHR)
1865                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1866                     &rdev, 8);
1867
1868         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
1869                 zfs_exit(zfsvfs, FTAG);
1870                 return (error);
1871         }
1872
1873         /*
1874          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
1875          * Also, if we are the owner don't bother, since owner should
1876          * always be allowed to read basic attributes of file.
              * A failed access check is returned to the caller as-is.
1877          */
1878         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
1879             (vap->va_uid != crgetuid(cr))) {
1880                 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
1881                     skipaclchk, cr, NULL))) {
1882                         zfs_exit(zfsvfs, FTAG);
1883                         return (error);
1884                 }
1885         }
1886
1887         /*
1888          * Return all attributes.  It's cheaper to provide the answer
1889          * than to determine whether we were asked the question.
1890          */
1891
1892         vap->va_type = IFTOVT(zp->z_mode);
1893         vap->va_mode = zp->z_mode & ~S_IFMT;
1894         vn_fsid(vp, vap);
1895         vap->va_nodeid = zp->z_id;
1896         vap->va_nlink = zp->z_links;
             /*
              * The root vnode reports one extra link when zfs_show_ctldir()
              * is true, unless z_links has already reached ZFS_LINK_MAX.
              * NOTE(review): presumably this accounts for the control
              * directory entry -- confirm.
              */
1897         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
1898             zp->z_links < ZFS_LINK_MAX)
1899                 vap->va_nlink++;
1900         vap->va_size = zp->z_size;
             /* rdev was only looked up for device nodes; report 0 otherwise. */
1901         if (vp->v_type == VBLK || vp->v_type == VCHR)
1902                 vap->va_rdev = zfs_cmpldev(rdev);
1903         else
1904                 vap->va_rdev = 0;
1905         vap->va_gen = zp->z_gen;
1906         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
1907         vap->va_filerev = zp->z_seq;
1908
1909         /*
1910          * Add in any requested optional attributes and the create time.
1911          * Also set the corresponding bits in the returned attribute bitmap.
              *
              * This whole section is skipped unless xva_getxoptattr() returns
              * a non-NULL pointer and the filesystem has z_use_fuids set.
              * Each flag below is reported as 0/1 derived from zp->z_pflags,
              * and only when the caller requested it (XVA_ISSET_REQ).
1912          */
1913         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
1914                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1915                         xoap->xoa_archive =
1916                             ((zp->z_pflags & ZFS_ARCHIVE) != 0);
1917                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
1918                 }
1919
1920                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1921                         xoap->xoa_readonly =
1922                             ((zp->z_pflags & ZFS_READONLY) != 0);
1923                         XVA_SET_RTN(xvap, XAT_READONLY);
1924                 }
1925
1926                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1927                         xoap->xoa_system =
1928                             ((zp->z_pflags & ZFS_SYSTEM) != 0);
1929                         XVA_SET_RTN(xvap, XAT_SYSTEM);
1930                 }
1931
1932                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1933                         xoap->xoa_hidden =
1934                             ((zp->z_pflags & ZFS_HIDDEN) != 0);
1935                         XVA_SET_RTN(xvap, XAT_HIDDEN);
1936                 }
1937
1938                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1939                         xoap->xoa_nounlink =
1940                             ((zp->z_pflags & ZFS_NOUNLINK) != 0);
1941                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
1942                 }
1943
1944                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1945                         xoap->xoa_immutable =
1946                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
1947                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1948                 }
1949
1950                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1951                         xoap->xoa_appendonly =
1952                             ((zp->z_pflags & ZFS_APPENDONLY) != 0);
1953                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
1954                 }
1955
1956                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1957                         xoap->xoa_nodump =
1958                             ((zp->z_pflags & ZFS_NODUMP) != 0);
1959                         XVA_SET_RTN(xvap, XAT_NODUMP);
1960                 }
1961
1962                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1963                         xoap->xoa_opaque =
1964                             ((zp->z_pflags & ZFS_OPAQUE) != 0);
1965                         XVA_SET_RTN(xvap, XAT_OPAQUE);
1966                 }
1967
1968                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1969                         xoap->xoa_av_quarantined =
1970                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
1971                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1972                 }
1973
1974                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1975                         xoap->xoa_av_modified =
1976                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
1977                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1978                 }
1979
                     /*
                      * The scanstamp is only fetched for regular files.
                      * NOTE(review): there is no XVA_SET_RTN() here;
                      * presumably zfs_sa_get_scanstamp() marks the attribute
                      * as returned itself -- confirm.
                      */
1980                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
1981                     vp->v_type == VREG) {
1982                         zfs_sa_get_scanstamp(zp, xvap);
1983                 }
1984
1985                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1986                         xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
1987                         XVA_SET_RTN(xvap, XAT_REPARSE);
1988                 }
                     /* Generation and project id are copied as values, not flags. */
1989                 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
1990                         xoap->xoa_generation = zp->z_gen;
1991                         XVA_SET_RTN(xvap, XAT_GEN);
1992                 }
1993
1994                 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1995                         xoap->xoa_offline =
1996                             ((zp->z_pflags & ZFS_OFFLINE) != 0);
1997                         XVA_SET_RTN(xvap, XAT_OFFLINE);
1998                 }
1999
2000                 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2001                         xoap->xoa_sparse =
2002                             ((zp->z_pflags & ZFS_SPARSE) != 0);
2003                         XVA_SET_RTN(xvap, XAT_SPARSE);
2004                 }
2005
2006                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2007                         xoap->xoa_projinherit =
2008                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
2009                         XVA_SET_RTN(xvap, XAT_PROJINHERIT);
2010                 }
2011
2012                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2013                         xoap->xoa_projid = zp->z_projid;
2014                         XVA_SET_RTN(xvap, XAT_PROJID);
2015                 }
2016         }
2017
             /*
              * atime is taken from the in-core znode (z_atime); the other
              * three timestamps come from the SA lookup above.  The creation
              * time is exposed as va_birthtime.
              */
2018         ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2019         ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2020         ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2021         ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2022
2023
             /*
              * Report the block size and space usage obtained from
              * sa_object_size(); va_blksize is overridden just below when
              * the znode's z_blksz is 0.
              */
2024         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2025         vap->va_blksize = blksize;
2026         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2027
2028         if (zp->z_blksz == 0) {
2029                 /*
2030                  * Block size hasn't been set; suggest maximal I/O transfers.
2031                  */
2032                 vap->va_blksize = zfsvfs->z_max_blksz;
2033         }
2034
2035         zfs_exit(zfsvfs, FTAG);
2036         return (0);
2037 }
2038
2039 /*
2040  * Set the file attributes to the values contained in the
2041  * vattr structure.
2042  *
2043  *      IN:     zp      - znode of file to be modified.
2044  *              vap     - new attribute values.
2045  *                        If AT_XVATTR set, then optional attrs are being set
2046  *              flags   - ATTR_UTIME set if non-default time values provided.
2047  *                      - ATTR_NOACLCHECK (CIFS context only).
2048  *              cr      - credentials of caller.
2049  *              mnt_ns  - Unused on FreeBSD
2050  *
2051  *      RETURN: 0 on success, error code on failure.
2052  *
2053  * Timestamps:
2054  *      vp - ctime updated, mtime updated if size changed.
2055  */
2056 int
2057 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
2058 {
2059         vnode_t         *vp = ZTOV(zp);
2060         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2061         objset_t        *os;
2062         zilog_t         *zilog;
2063         dmu_tx_t        *tx;
2064         vattr_t         oldva;
2065         xvattr_t        tmpxvattr;
2066         uint_t          mask = vap->va_mask;
2067         uint_t          saved_mask = 0;
2068         uint64_t        saved_mode;
2069         int             trim_mask = 0;
2070         uint64_t        new_mode;
2071         uint64_t        new_uid, new_gid;
2072         uint64_t        xattr_obj;
2073         uint64_t        mtime[2], ctime[2];
2074         uint64_t        projid = ZFS_INVALID_PROJID;
2075         znode_t         *attrzp;
2076         int             need_policy = FALSE;
2077         int             err, err2;
2078         zfs_fuid_info_t *fuidp = NULL;
2079         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2080         xoptattr_t      *xoap;
2081         zfs_acl_t       *aclp;
2082         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2083         boolean_t       fuid_dirtied = B_FALSE;
2084         sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2085         int             count = 0, xattr_count = 0;
2086
2087         if (mask == 0)
2088                 return (0);
2089
2090         if (mask & AT_NOSET)
2091                 return (SET_ERROR(EINVAL));
2092
2093         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
2094                 return (err);
2095
2096         os = zfsvfs->z_os;
2097         zilog = zfsvfs->z_log;
2098
2099         /*
2100          * Make sure that if we have ephemeral uid/gid or xvattr specified
2101          * that file system is at proper version level
2102          */
2103
2104         if (zfsvfs->z_use_fuids == B_FALSE &&
2105             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2106             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2107             (mask & AT_XVATTR))) {
2108                 zfs_exit(zfsvfs, FTAG);
2109                 return (SET_ERROR(EINVAL));
2110         }
2111
2112         if (mask & AT_SIZE && vp->v_type == VDIR) {
2113                 zfs_exit(zfsvfs, FTAG);
2114                 return (SET_ERROR(EISDIR));
2115         }
2116
2117         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2118                 zfs_exit(zfsvfs, FTAG);
2119                 return (SET_ERROR(EINVAL));
2120         }
2121
2122         /*
2123          * If this is an xvattr_t, then get a pointer to the structure of
2124          * optional attributes.  If this is NULL, then we have a vattr_t.
2125          */
2126         xoap = xva_getxoptattr(xvap);
2127
2128         xva_init(&tmpxvattr);
2129
2130         /*
2131          * Immutable files can only alter immutable bit and atime
2132          */
2133         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2134             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2135             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2136                 zfs_exit(zfsvfs, FTAG);
2137                 return (SET_ERROR(EPERM));
2138         }
2139
2140         /*
2141          * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2142          */
2143
2144         /*
2145          * Verify timestamps don't overflow 32 bits.
2146          * ZFS can handle large timestamps, but 32bit syscalls can't
2147          * handle times greater than 2039.  This check should be removed
2148          * once large timestamps are fully supported.
2149          */
2150         if (mask & (AT_ATIME | AT_MTIME)) {
2151                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2152                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2153                         zfs_exit(zfsvfs, FTAG);
2154                         return (SET_ERROR(EOVERFLOW));
2155                 }
2156         }
2157         if (xoap != NULL && (mask & AT_XVATTR)) {
2158                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2159                     TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2160                         zfs_exit(zfsvfs, FTAG);
2161                         return (SET_ERROR(EOVERFLOW));
2162                 }
2163
2164                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2165                         if (!dmu_objset_projectquota_enabled(os) ||
2166                             (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
2167                                 zfs_exit(zfsvfs, FTAG);
2168                                 return (SET_ERROR(EOPNOTSUPP));
2169                         }
2170
2171                         projid = xoap->xoa_projid;
2172                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
2173                                 zfs_exit(zfsvfs, FTAG);
2174                                 return (SET_ERROR(EINVAL));
2175                         }
2176
2177                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2178                                 projid = ZFS_INVALID_PROJID;
2179                         else
2180                                 need_policy = TRUE;
2181                 }
2182
2183                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2184                     (xoap->xoa_projinherit !=
2185                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2186                     (!dmu_objset_projectquota_enabled(os) ||
2187                     (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
2188                         zfs_exit(zfsvfs, FTAG);
2189                         return (SET_ERROR(EOPNOTSUPP));
2190                 }
2191         }
2192
2193         attrzp = NULL;
2194         aclp = NULL;
2195
2196         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2197                 zfs_exit(zfsvfs, FTAG);
2198                 return (SET_ERROR(EROFS));
2199         }
2200
2201         /*
2202          * First validate permissions
2203          */
2204
2205         if (mask & AT_SIZE) {
2206                 /*
2207                  * XXX - Note, we are not providing any open
2208                  * mode flags here (like FNDELAY), so we may
2209                  * block if there are locks present... this
2210                  * should be addressed in openat().
2211                  */
2212                 /* XXX - would it be OK to generate a log record here? */
2213                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2214                 if (err) {
2215                         zfs_exit(zfsvfs, FTAG);
2216                         return (err);
2217                 }
2218         }
2219
2220         if (mask & (AT_ATIME|AT_MTIME) ||
2221             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2222             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2223             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2224             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2225             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2226             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2227             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2228                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2229                     skipaclchk, cr, mnt_ns);
2230         }
2231
2232         if (mask & (AT_UID|AT_GID)) {
2233                 int     idmask = (mask & (AT_UID|AT_GID));
2234                 int     take_owner;
2235                 int     take_group;
2236
2237                 /*
2238                  * NOTE: even if a new mode is being set,
2239                  * we may clear S_ISUID/S_ISGID bits.
2240                  */
2241
2242                 if (!(mask & AT_MODE))
2243                         vap->va_mode = zp->z_mode;
2244
2245                 /*
2246                  * Take ownership or chgrp to group we are a member of
2247                  */
2248
2249                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2250                 take_group = (mask & AT_GID) &&
2251                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2252
2253                 /*
2254                  * If both AT_UID and AT_GID are set then take_owner and
2255                  * take_group must both be set in order to allow taking
2256                  * ownership.
2257                  *
2258                  * Otherwise, send the check through secpolicy_vnode_setattr()
2259                  *
2260                  */
2261
2262                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2263                     ((idmask == AT_UID) && take_owner) ||
2264                     ((idmask == AT_GID) && take_group)) {
2265                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2266                             skipaclchk, cr, mnt_ns) == 0) {
2267                                 /*
2268                                  * Remove setuid/setgid for non-privileged users
2269                                  */
2270                                 secpolicy_setid_clear(vap, vp, cr);
2271                                 trim_mask = (mask & (AT_UID|AT_GID));
2272                         } else {
2273                                 need_policy =  TRUE;
2274                         }
2275                 } else {
2276                         need_policy =  TRUE;
2277                 }
2278         }
2279
2280         oldva.va_mode = zp->z_mode;
2281         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2282         if (mask & AT_XVATTR) {
2283                 /*
2284                  * Update xvattr mask to include only those attributes
2285                  * that are actually changing.
2286                  *
2287                  * the bits will be restored prior to actually setting
2288                  * the attributes so the caller thinks they were set.
2289                  */
2290                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2291                         if (xoap->xoa_appendonly !=
2292                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2293                                 need_policy = TRUE;
2294                         } else {
2295                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2296                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2297                         }
2298                 }
2299
2300                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2301                         if (xoap->xoa_projinherit !=
2302                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2303                                 need_policy = TRUE;
2304                         } else {
2305                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2306                                 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
2307                         }
2308                 }
2309
2310                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2311                         if (xoap->xoa_nounlink !=
2312                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2313                                 need_policy = TRUE;
2314                         } else {
2315                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2316                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2317                         }
2318                 }
2319
2320                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2321                         if (xoap->xoa_immutable !=
2322                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2323                                 need_policy = TRUE;
2324                         } else {
2325                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2326                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2327                         }
2328                 }
2329
2330                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2331                         if (xoap->xoa_nodump !=
2332                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2333                                 need_policy = TRUE;
2334                         } else {
2335                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2336                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2337                         }
2338                 }
2339
2340                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2341                         if (xoap->xoa_av_modified !=
2342                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2343                                 need_policy = TRUE;
2344                         } else {
2345                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2346                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2347                         }
2348                 }
2349
2350                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2351                         if ((vp->v_type != VREG &&
2352                             xoap->xoa_av_quarantined) ||
2353                             xoap->xoa_av_quarantined !=
2354                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2355                                 need_policy = TRUE;
2356                         } else {
2357                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2358                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2359                         }
2360                 }
2361
2362                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2363                         zfs_exit(zfsvfs, FTAG);
2364                         return (SET_ERROR(EPERM));
2365                 }
2366
2367                 if (need_policy == FALSE &&
2368                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2369                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2370                         need_policy = TRUE;
2371                 }
2372         }
2373
2374         if (mask & AT_MODE) {
2375                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2376                     mnt_ns) == 0) {
2377                         err = secpolicy_setid_setsticky_clear(vp, vap,
2378                             &oldva, cr);
2379                         if (err) {
2380                                 zfs_exit(zfsvfs, FTAG);
2381                                 return (err);
2382                         }
2383                         trim_mask |= AT_MODE;
2384                 } else {
2385                         need_policy = TRUE;
2386                 }
2387         }
2388
2389         if (need_policy) {
2390                 /*
2391                  * If trim_mask is set then take ownership
2392                  * has been granted or write_acl is present and user
2393                  * has the ability to modify mode.  In that case remove
2394                  * UID|GID and or MODE from mask so that
2395                  * secpolicy_vnode_setattr() doesn't revoke it.
2396                  */
2397
2398                 if (trim_mask) {
2399                         saved_mask = vap->va_mask;
2400                         vap->va_mask &= ~trim_mask;
2401                         if (trim_mask & AT_MODE) {
2402                                 /*
2403                                  * Save the mode, as secpolicy_vnode_setattr()
2404                                  * will overwrite it with ova.va_mode.
2405                                  */
2406                                 saved_mode = vap->va_mode;
2407                         }
2408                 }
2409                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2410                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2411                 if (err) {
2412                         zfs_exit(zfsvfs, FTAG);
2413                         return (err);
2414                 }
2415
2416                 if (trim_mask) {
2417                         vap->va_mask |= saved_mask;
2418                         if (trim_mask & AT_MODE) {
2419                                 /*
2420                                  * Recover the mode after
2421                                  * secpolicy_vnode_setattr().
2422                                  */
2423                                 vap->va_mode = saved_mode;
2424                         }
2425                 }
2426         }
2427
2428         /*
2429          * secpolicy_vnode_setattr, or take ownership may have
2430          * changed va_mask
2431          */
2432         mask = vap->va_mask;
2433
2434         if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
2435                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2436                     &xattr_obj, sizeof (xattr_obj));
2437
2438                 if (err == 0 && xattr_obj) {
2439                         err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2440                         if (err == 0) {
2441                                 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
2442                                 if (err != 0)
2443                                         vrele(ZTOV(attrzp));
2444                         }
2445                         if (err)
2446                                 goto out2;
2447                 }
2448                 if (mask & AT_UID) {
2449                         new_uid = zfs_fuid_create(zfsvfs,
2450                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2451                         if (new_uid != zp->z_uid &&
2452                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2453                             new_uid)) {
2454                                 if (attrzp)
2455                                         vput(ZTOV(attrzp));
2456                                 err = SET_ERROR(EDQUOT);
2457                                 goto out2;
2458                         }
2459                 }
2460
2461                 if (mask & AT_GID) {
2462                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2463                             cr, ZFS_GROUP, &fuidp);
2464                         if (new_gid != zp->z_gid &&
2465                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2466                             new_gid)) {
2467                                 if (attrzp)
2468                                         vput(ZTOV(attrzp));
2469                                 err = SET_ERROR(EDQUOT);
2470                                 goto out2;
2471                         }
2472                 }
2473
2474                 if (projid != ZFS_INVALID_PROJID &&
2475                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2476                         if (attrzp)
2477                                 vput(ZTOV(attrzp));
2478                         err = SET_ERROR(EDQUOT);
2479                         goto out2;
2480                 }
2481         }
2482         tx = dmu_tx_create(os);
2483
2484         if (mask & AT_MODE) {
2485                 uint64_t pmode = zp->z_mode;
2486                 uint64_t acl_obj;
2487                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2488
2489                 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
2490                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2491                         err = SET_ERROR(EPERM);
2492                         goto out;
2493                 }
2494
2495                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2496                         goto out;
2497
2498                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2499                         /*
2500                          * Are we upgrading ACL from old V0 format
2501                          * to V1 format?
2502                          */
2503                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2504                             zfs_znode_acl_version(zp) ==
2505                             ZFS_ACL_VERSION_INITIAL) {
2506                                 dmu_tx_hold_free(tx, acl_obj, 0,
2507                                     DMU_OBJECT_END);
2508                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2509                                     0, aclp->z_acl_bytes);
2510                         } else {
2511                                 dmu_tx_hold_write(tx, acl_obj, 0,
2512                                     aclp->z_acl_bytes);
2513                         }
2514                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2515                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2516                             0, aclp->z_acl_bytes);
2517                 }
2518                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2519         } else {
2520                 if (((mask & AT_XVATTR) &&
2521                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2522                     (projid != ZFS_INVALID_PROJID &&
2523                     !(zp->z_pflags & ZFS_PROJID)))
2524                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2525                 else
2526                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2527         }
2528
2529         if (attrzp) {
2530                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2531         }
2532
2533         fuid_dirtied = zfsvfs->z_fuid_dirty;
2534         if (fuid_dirtied)
2535                 zfs_fuid_txhold(zfsvfs, tx);
2536
2537         zfs_sa_upgrade_txholds(tx, zp);
2538
2539         err = dmu_tx_assign(tx, TXG_WAIT);
2540         if (err)
2541                 goto out;
2542
2543         count = 0;
2544         /*
2545          * Set each attribute requested.
2546          * We group settings according to the locks they need to acquire.
2547          *
2548          * Note: you cannot set ctime directly, although it will be
2549          * updated as a side-effect of calling this function.
2550          */
2551
2552         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2553                 /*
2554                  * For the existed object that is upgraded from old system,
2555                  * its on-disk layout has no slot for the project ID attribute.
2556                  * But quota accounting logic needs to access related slots by
2557                  * offset directly. So we need to adjust old objects' layout
2558                  * to make the project ID to some unified and fixed offset.
2559                  */
2560                 if (attrzp)
2561                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2562                 if (err == 0)
2563                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2564
2565                 if (unlikely(err == EEXIST))
2566                         err = 0;
2567                 else if (err != 0)
2568                         goto out;
2569                 else
2570                         projid = ZFS_INVALID_PROJID;
2571         }
2572
2573         if (mask & (AT_UID|AT_GID|AT_MODE))
2574                 mutex_enter(&zp->z_acl_lock);
2575
2576         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2577             &zp->z_pflags, sizeof (zp->z_pflags));
2578
2579         if (attrzp) {
2580                 if (mask & (AT_UID|AT_GID|AT_MODE))
2581                         mutex_enter(&attrzp->z_acl_lock);
2582                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2583                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2584                     sizeof (attrzp->z_pflags));
2585                 if (projid != ZFS_INVALID_PROJID) {
2586                         attrzp->z_projid = projid;
2587                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2588                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2589                             sizeof (attrzp->z_projid));
2590                 }
2591         }
2592
2593         if (mask & (AT_UID|AT_GID)) {
2594
2595                 if (mask & AT_UID) {
2596                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2597                             &new_uid, sizeof (new_uid));
2598                         zp->z_uid = new_uid;
2599                         if (attrzp) {
2600                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2601                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2602                                     sizeof (new_uid));
2603                                 attrzp->z_uid = new_uid;
2604                         }
2605                 }
2606
2607                 if (mask & AT_GID) {
2608                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2609                             NULL, &new_gid, sizeof (new_gid));
2610                         zp->z_gid = new_gid;
2611                         if (attrzp) {
2612                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2613                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2614                                     sizeof (new_gid));
2615                                 attrzp->z_gid = new_gid;
2616                         }
2617                 }
2618                 if (!(mask & AT_MODE)) {
2619                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2620                             NULL, &new_mode, sizeof (new_mode));
2621                         new_mode = zp->z_mode;
2622                 }
2623                 err = zfs_acl_chown_setattr(zp);
2624                 ASSERT0(err);
2625                 if (attrzp) {
2626                         vn_seqc_write_begin(ZTOV(attrzp));
2627                         err = zfs_acl_chown_setattr(attrzp);
2628                         vn_seqc_write_end(ZTOV(attrzp));
2629                         ASSERT0(err);
2630                 }
2631         }
2632
2633         if (mask & AT_MODE) {
2634                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2635                     &new_mode, sizeof (new_mode));
2636                 zp->z_mode = new_mode;
2637                 ASSERT3P(aclp, !=, NULL);
2638                 err = zfs_aclset_common(zp, aclp, cr, tx);
2639                 ASSERT0(err);
2640                 if (zp->z_acl_cached)
2641                         zfs_acl_free(zp->z_acl_cached);
2642                 zp->z_acl_cached = aclp;
2643                 aclp = NULL;
2644         }
2645
2646
2647         if (mask & AT_ATIME) {
2648                 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
2649                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2650                     &zp->z_atime, sizeof (zp->z_atime));
2651         }
2652
2653         if (mask & AT_MTIME) {
2654                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2655                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2656                     mtime, sizeof (mtime));
2657         }
2658
2659         if (projid != ZFS_INVALID_PROJID) {
2660                 zp->z_projid = projid;
2661                 SA_ADD_BULK_ATTR(bulk, count,
2662                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2663                     sizeof (zp->z_projid));
2664         }
2665
2666         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2667         if (mask & AT_SIZE && !(mask & AT_MTIME)) {
2668                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
2669                     NULL, mtime, sizeof (mtime));
2670                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2671                     &ctime, sizeof (ctime));
2672                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
2673         } else if (mask != 0) {
2674                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2675                     &ctime, sizeof (ctime));
2676                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
2677                 if (attrzp) {
2678                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2679                             SA_ZPL_CTIME(zfsvfs), NULL,
2680                             &ctime, sizeof (ctime));
2681                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2682                             mtime, ctime);
2683                 }
2684         }
2685
2686         /*
2687          * Do this after setting timestamps to prevent timestamp
2688          * update from toggling bit
2689          */
2690
2691         if (xoap && (mask & AT_XVATTR)) {
2692
2693                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
2694                         xoap->xoa_createtime = vap->va_birthtime;
2695                 /*
2696                  * restore trimmed off masks
2697                  * so that return masks can be set for caller.
2698                  */
2699
2700                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
2701                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2702                 }
2703                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
2704                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2705                 }
2706                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
2707                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2708                 }
2709                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
2710                         XVA_SET_REQ(xvap, XAT_NODUMP);
2711                 }
2712                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
2713                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2714                 }
2715                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
2716                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2717                 }
2718                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
2719                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2720                 }
2721
2722                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2723                         ASSERT3S(vp->v_type, ==, VREG);
2724
2725                 zfs_xvattr_set(zp, xvap, tx);
2726         }
2727
2728         if (fuid_dirtied)
2729                 zfs_fuid_sync(zfsvfs, tx);
2730
2731         if (mask != 0)
2732                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2733
2734         if (mask & (AT_UID|AT_GID|AT_MODE))
2735                 mutex_exit(&zp->z_acl_lock);
2736
2737         if (attrzp) {
2738                 if (mask & (AT_UID|AT_GID|AT_MODE))
2739                         mutex_exit(&attrzp->z_acl_lock);
2740         }
2741 out:
2742         if (err == 0 && attrzp) {
2743                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2744                     xattr_count, tx);
2745                 ASSERT0(err2);
2746         }
2747
2748         if (attrzp)
2749                 vput(ZTOV(attrzp));
2750
2751         if (aclp)
2752                 zfs_acl_free(aclp);
2753
2754         if (fuidp) {
2755                 zfs_fuid_info_free(fuidp);
2756                 fuidp = NULL;
2757         }
2758
2759         if (err) {
2760                 dmu_tx_abort(tx);
2761         } else {
2762                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2763                 dmu_tx_commit(tx);
2764         }
2765
2766 out2:
2767         if (os->os_sync == ZFS_SYNC_ALWAYS)
2768                 zil_commit(zilog, 0);
2769
2770         zfs_exit(zfsvfs, FTAG);
2771         return (err);
2772 }
2773
2774 /*
2775  * Look up the directory entries corresponding to the source and target
2776  * directory/name pairs.
2777  */
2778 static int
2779 zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
2780     znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
2781     znode_t **tzpp)
2782 {
2783         zfsvfs_t *zfsvfs;
2784         znode_t *szp, *tzp;
2785         int error;
2786
2787         /*
2788          * Before using sdzp and tdzp we must ensure that they are live.
2789          * As a porting legacy from illumos we have two things to worry
2790          * about.  One is typical for FreeBSD and it is that the vnode is
2791          * not reclaimed (doomed).  The other is that the znode is live.
2792          * The current code can invalidate the znode without acquiring the
2793          * corresponding vnode lock if the object represented by the znode
2794          * and vnode is no longer valid after a rollback or receive operation.
2795          * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock
2796          * that protects the znodes from the invalidation.
2797          */
2798         zfsvfs = sdzp->z_zfsvfs;
2799         ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
2800         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2801                 return (error);
2802         if ((error = zfs_verify_zp(tdzp)) != 0) {
2803                 zfs_exit(zfsvfs, FTAG);
2804                 return (error);
2805         }
2806
2807         /*
2808          * Re-resolve svp to be certain it still exists and fetch the
2809          * correct vnode.
2810          */
2811         error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS);
2812         if (error != 0) {
2813                 /* Source entry invalid or not there. */
2814                 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
2815                     (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
2816                         error = SET_ERROR(EINVAL);
2817                 goto out;
2818         }
2819         *szpp = szp;
2820
2821         /*
2822          * Re-resolve tvp, if it disappeared we just carry on.
2823          */
2824         error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0);
2825         if (error != 0) {
2826                 vrele(ZTOV(szp));
2827                 if ((tcnp->cn_flags & ISDOTDOT) != 0)
2828                         error = SET_ERROR(EINVAL);
2829                 goto out;
2830         }
2831         *tzpp = tzp;
2832 out:
2833         zfs_exit(zfsvfs, FTAG);
2834         return (error);
2835 }
2836
2837 /*
2838  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
2839  * fail to acquire any lock in the path we will drop all held locks,
2840  * acquire the new lock in a blocking fashion, and then release it and
2841  * restart the rename.  This acquire/release step ensures that we do not
2842  * spin on a lock waiting for release.  On error release all vnode locks
2843  * and decrement references the way tmpfs_rename() would do.
2844  */
2845 static int
2846 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
2847     struct vnode *tdvp, struct vnode **tvpp,
2848     const struct componentname *scnp, const struct componentname *tcnp)
2849 {
2850         struct vnode    *nvp, *svp, *tvp;
2851         znode_t         *sdzp, *tdzp, *szp, *tzp;
2852         int             error;
2853
2854         VOP_UNLOCK(tdvp);
2855         if (*tvpp != NULL && *tvpp != tdvp)
2856                 VOP_UNLOCK(*tvpp);
2857
2858 relock:
2859         error = vn_lock(sdvp, LK_EXCLUSIVE);
2860         if (error)
2861                 goto out;
2862         error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
2863         if (error != 0) {
2864                 VOP_UNLOCK(sdvp);
2865                 if (error != EBUSY)
2866                         goto out;
2867                 error = vn_lock(tdvp, LK_EXCLUSIVE);
2868                 if (error)
2869                         goto out;
2870                 VOP_UNLOCK(tdvp);
2871                 goto relock;
2872         }
2873         tdzp = VTOZ(tdvp);
2874         sdzp = VTOZ(sdvp);
2875
2876         error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
2877         if (error != 0) {
2878                 VOP_UNLOCK(sdvp);
2879                 VOP_UNLOCK(tdvp);
2880                 goto out;
2881         }
2882         svp = ZTOV(szp);
2883         tvp = tzp != NULL ? ZTOV(tzp) : NULL;
2884
2885         /*
2886          * Now try acquire locks on svp and tvp.
2887          */
2888         nvp = svp;
2889         error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
2890         if (error != 0) {
2891                 VOP_UNLOCK(sdvp);
2892                 VOP_UNLOCK(tdvp);
2893                 if (tvp != NULL)
2894                         vrele(tvp);
2895                 if (error != EBUSY) {
2896                         vrele(nvp);
2897                         goto out;
2898                 }
2899                 error = vn_lock(nvp, LK_EXCLUSIVE);
2900                 if (error != 0) {
2901                         vrele(nvp);
2902                         goto out;
2903                 }
2904                 VOP_UNLOCK(nvp);
2905                 /*
2906                  * Concurrent rename race.
2907                  * XXX ?
2908                  */
2909                 if (nvp == tdvp) {
2910                         vrele(nvp);
2911                         error = SET_ERROR(EINVAL);
2912                         goto out;
2913                 }
2914                 vrele(*svpp);
2915                 *svpp = nvp;
2916                 goto relock;
2917         }
2918         vrele(*svpp);
2919         *svpp = nvp;
2920
2921         if (*tvpp != NULL)
2922                 vrele(*tvpp);
2923         *tvpp = NULL;
2924         if (tvp != NULL) {
2925                 nvp = tvp;
2926                 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
2927                 if (error != 0) {
2928                         VOP_UNLOCK(sdvp);
2929                         VOP_UNLOCK(tdvp);
2930                         VOP_UNLOCK(*svpp);
2931                         if (error != EBUSY) {
2932                                 vrele(nvp);
2933                                 goto out;
2934                         }
2935                         error = vn_lock(nvp, LK_EXCLUSIVE);
2936                         if (error != 0) {
2937                                 vrele(nvp);
2938                                 goto out;
2939                         }
2940                         vput(nvp);
2941                         goto relock;
2942                 }
2943                 *tvpp = nvp;
2944         }
2945
2946         return (0);
2947
2948 out:
2949         return (error);
2950 }
2951
2952 /*
2953  * Note that we must use VRELE_ASYNC in this function as it walks
2954  * up the directory tree and vrele may need to acquire an exclusive
2955  * lock if a last reference to a vnode is dropped.
2956  */
2957 static int
2958 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
2959 {
2960         zfsvfs_t        *zfsvfs;
2961         znode_t         *zp, *zp1;
2962         uint64_t        parent;
2963         int             error;
2964
2965         zfsvfs = tdzp->z_zfsvfs;
2966         if (tdzp == szp)
2967                 return (SET_ERROR(EINVAL));
2968         if (tdzp == sdzp)
2969                 return (0);
2970         if (tdzp->z_id == zfsvfs->z_root)
2971                 return (0);
2972         zp = tdzp;
2973         for (;;) {
2974                 ASSERT(!zp->z_unlinked);
2975                 if ((error = sa_lookup(zp->z_sa_hdl,
2976                     SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
2977                         break;
2978
2979                 if (parent == szp->z_id) {
2980                         error = SET_ERROR(EINVAL);
2981                         break;
2982                 }
2983                 if (parent == zfsvfs->z_root)
2984                         break;
2985                 if (parent == sdzp->z_id)
2986                         break;
2987
2988                 error = zfs_zget(zfsvfs, parent, &zp1);
2989                 if (error != 0)
2990                         break;
2991
2992                 if (zp != tdzp)
2993                         VN_RELE_ASYNC(ZTOV(zp),
2994                             dsl_pool_zrele_taskq(
2995                             dmu_objset_pool(zfsvfs->z_os)));
2996                 zp = zp1;
2997         }
2998
2999         if (error == ENOTDIR)
3000                 panic("checkpath: .. not a directory\n");
3001         if (zp != tdzp)
3002                 VN_RELE_ASYNC(ZTOV(zp),
3003                     dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3004         return (error);
3005 }
3006
3007 static int
3008 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3009     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3010     cred_t *cr);
3011
3012 /*
3013  * Move an entry from the provided source directory to the target
3014  * directory.  Change the entry name as indicated.
3015  *
3016  *      IN:     sdvp    - Source directory containing the "old entry".
3017  *              scnp    - Old entry name.
3018  *              tdvp    - Target directory to contain the "new entry".
3019  *              tcnp    - New entry name.
3020  *              cr      - credentials of caller.
3021  *      INOUT:  svpp    - Source file
3022  *              tvpp    - Target file, may point to NULL initially
3023  *
3024  *      RETURN: 0 on success, error code on failure.
3025  *
3026  * Timestamps:
3027  *      sdvp,tdvp - ctime|mtime updated
3028  */
3029 static int
3030 zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3031     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3032     cred_t *cr)
3033 {
3034         int     error;
3035
3036         ASSERT_VOP_ELOCKED(tdvp, __func__);
3037         if (*tvpp != NULL)
3038                 ASSERT_VOP_ELOCKED(*tvpp, __func__);
3039
3040         /* Reject renames across filesystems. */
3041         if ((*svpp)->v_mount != tdvp->v_mount ||
3042             ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3043                 error = SET_ERROR(EXDEV);
3044                 goto out;
3045         }
3046
3047         if (zfsctl_is_node(tdvp)) {
3048                 error = SET_ERROR(EXDEV);
3049                 goto out;
3050         }
3051
3052         /*
3053          * Lock all four vnodes to ensure safety and semantics of renaming.
3054          */
3055         error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3056         if (error != 0) {
3057                 /* no vnodes are locked in the case of error here */
3058                 return (error);
3059         }
3060
3061         error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr);
3062         VOP_UNLOCK(sdvp);
3063         VOP_UNLOCK(*svpp);
3064 out:
3065         if (*tvpp != NULL)
3066                 VOP_UNLOCK(*tvpp);
3067         if (tdvp != *tvpp)
3068                 VOP_UNLOCK(tdvp);
3069
3070         return (error);
3071 }
3072
3073 static int
3074 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3075     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3076     cred_t *cr)
3077 {
3078         dmu_tx_t        *tx;
3079         zfsvfs_t        *zfsvfs;
3080         zilog_t         *zilog;
3081         znode_t         *tdzp, *sdzp, *tzp, *szp;
3082         const char      *snm = scnp->cn_nameptr;
3083         const char      *tnm = tcnp->cn_nameptr;
3084         int             error;
3085
3086         tdzp = VTOZ(tdvp);
3087         sdzp = VTOZ(sdvp);
3088         zfsvfs = tdzp->z_zfsvfs;
3089
3090         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3091                 return (error);
3092         if ((error = zfs_verify_zp(sdzp)) != 0) {
3093                 zfs_exit(zfsvfs, FTAG);
3094                 return (error);
3095         }
3096         zilog = zfsvfs->z_log;
3097
3098         if (zfsvfs->z_utf8 && u8_validate(tnm,
3099             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3100                 error = SET_ERROR(EILSEQ);
3101                 goto out;
3102         }
3103
3104         /* If source and target are the same file, there is nothing to do. */
3105         if ((*svpp) == (*tvpp)) {
3106                 error = 0;
3107                 goto out;
3108         }
3109
3110         if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3111             ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3112             (*tvpp)->v_mountedhere != NULL)) {
3113                 error = SET_ERROR(EXDEV);
3114                 goto out;
3115         }
3116
3117         szp = VTOZ(*svpp);
3118         if ((error = zfs_verify_zp(szp)) != 0) {
3119                 zfs_exit(zfsvfs, FTAG);
3120                 return (error);
3121         }
3122         tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3123         if (tzp != NULL) {
3124                 if ((error = zfs_verify_zp(tzp)) != 0) {
3125                         zfs_exit(zfsvfs, FTAG);
3126                         return (error);
3127                 }
3128         }
3129
3130         /*
3131          * This is to prevent the creation of links into attribute space
3132          * by renaming a linked file into/outof an attribute directory.
3133          * See the comment in zfs_link() for why this is considered bad.
3134          */
3135         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3136                 error = SET_ERROR(EINVAL);
3137                 goto out;
3138         }
3139
3140         /*
3141          * If we are using project inheritance, means if the directory has
3142          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3143          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3144          * such case, we only allow renames into our tree when the project
3145          * IDs are the same.
3146          */
3147         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3148             tdzp->z_projid != szp->z_projid) {
3149                 error = SET_ERROR(EXDEV);
3150                 goto out;
3151         }
3152
3153         /*
3154          * Must have write access at the source to remove the old entry
3155          * and write access at the target to create the new entry.
3156          * Note that if target and source are the same, this can be
3157          * done in a single check.
3158          */
3159         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
3160                 goto out;
3161
3162         if ((*svpp)->v_type == VDIR) {
3163                 /*
3164                  * Avoid ".", "..", and aliases of "." for obvious reasons.
3165                  */
3166                 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3167                     sdzp == szp ||
3168                     (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3169                         error = EINVAL;
3170                         goto out;
3171                 }
3172
3173                 /*
3174                  * Check to make sure rename is valid.
3175                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3176                  */
3177                 if ((error = zfs_rename_check(szp, sdzp, tdzp)))
3178                         goto out;
3179         }
3180
3181         /*
3182          * Does target exist?
3183          */
3184         if (tzp) {
3185                 /*
3186                  * Source and target must be the same type.
3187                  */
3188                 if ((*svpp)->v_type == VDIR) {
3189                         if ((*tvpp)->v_type != VDIR) {
3190                                 error = SET_ERROR(ENOTDIR);
3191                                 goto out;
3192                         } else {
3193                                 cache_purge(tdvp);
3194                                 if (sdvp != tdvp)
3195                                         cache_purge(sdvp);
3196                         }
3197                 } else {
3198                         if ((*tvpp)->v_type == VDIR) {
3199                                 error = SET_ERROR(EISDIR);
3200                                 goto out;
3201                         }
3202                 }
3203         }
3204
3205         vn_seqc_write_begin(*svpp);
3206         vn_seqc_write_begin(sdvp);
3207         if (*tvpp != NULL)
3208                 vn_seqc_write_begin(*tvpp);
3209         if (tdvp != *tvpp)
3210                 vn_seqc_write_begin(tdvp);
3211
3212         vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3213         if (tzp)
3214                 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3215
3216         /*
3217          * notify the target directory if it is not the same
3218          * as source directory.
3219          */
3220         if (tdvp != sdvp) {
3221                 vnevent_rename_dest_dir(tdvp, ct);
3222         }
3223
3224         tx = dmu_tx_create(zfsvfs->z_os);
3225         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3226         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3227         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3228         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3229         if (sdzp != tdzp) {
3230                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3231                 zfs_sa_upgrade_txholds(tx, tdzp);
3232         }
3233         if (tzp) {
3234                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3235                 zfs_sa_upgrade_txholds(tx, tzp);
3236         }
3237
3238         zfs_sa_upgrade_txholds(tx, szp);
3239         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3240         error = dmu_tx_assign(tx, TXG_WAIT);
3241         if (error) {
3242                 dmu_tx_abort(tx);
3243                 goto out_seq;
3244         }
3245
3246         if (tzp)        /* Attempt to remove the existing target */
3247                 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3248
3249         if (error == 0) {
3250                 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3251                 if (error == 0) {
3252                         szp->z_pflags |= ZFS_AV_MODIFIED;
3253
3254                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3255                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3256                         ASSERT0(error);
3257
3258                         error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3259                             NULL);
3260                         if (error == 0) {
3261                                 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3262                                     snm, tdzp, tnm, szp);
3263                         } else {
3264                                 /*
3265                                  * At this point, we have successfully created
3266                                  * the target name, but have failed to remove
3267                                  * the source name.  Since the create was done
3268                                  * with the ZRENAMING flag, there are
3269                                  * complications; for one, the link count is
3270                                  * wrong.  The easiest way to deal with this
3271                                  * is to remove the newly created target, and
3272                                  * return the original error.  This must
3273                                  * succeed; fortunately, it is very unlikely to
3274                                  * fail, since we just created it.
3275                                  */
3276                                 VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
3277                                     ZRENAMING, NULL));
3278                         }
3279                 }
3280                 if (error == 0) {
3281                         cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
3282                 }
3283         }
3284
3285         dmu_tx_commit(tx);
3286
3287 out_seq:
3288         vn_seqc_write_end(*svpp);
3289         vn_seqc_write_end(sdvp);
3290         if (*tvpp != NULL)
3291                 vn_seqc_write_end(*tvpp);
3292         if (tdvp != *tvpp)
3293                 vn_seqc_write_end(tdvp);
3294
3295 out:
3296         if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3297                 zil_commit(zilog, 0);
3298         zfs_exit(zfsvfs, FTAG);
3299
3300         return (error);
3301 }
3302
3303 int
3304 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
3305     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
3306 {
3307         struct componentname scn, tcn;
3308         vnode_t *sdvp, *tdvp;
3309         vnode_t *svp, *tvp;
3310         int error;
3311         svp = tvp = NULL;
3312
3313         if (is_nametoolong(tdzp->z_zfsvfs, tname))
3314                 return (SET_ERROR(ENAMETOOLONG));
3315
3316         if (rflags != 0 || wo_vap != NULL)
3317                 return (SET_ERROR(EINVAL));
3318
3319         sdvp = ZTOV(sdzp);
3320         tdvp = ZTOV(tdzp);
3321         error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
3322         if (sdzp->z_zfsvfs->z_replay == B_FALSE)
3323                 VOP_UNLOCK(sdvp);
3324         if (error != 0)
3325                 goto fail;
3326         VOP_UNLOCK(svp);
3327
3328         vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
3329         error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
3330         if (error == EJUSTRETURN)
3331                 tvp = NULL;
3332         else if (error != 0) {
3333                 VOP_UNLOCK(tdvp);
3334                 goto fail;
3335         }
3336
3337         error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr);
3338 fail:
3339         if (svp != NULL)
3340                 vrele(svp);
3341         if (tvp != NULL)
3342                 vrele(tvp);
3343
3344         return (error);
3345 }
3346
3347 /*
3348  * Insert the indicated symbolic reference entry into the directory.
3349  *
3350  *      IN:     dvp     - Directory to contain new symbolic link.
3351  *              link    - Name for new symlink entry.
3352  *              vap     - Attributes of new entry.
3353  *              cr      - credentials of caller.
3354  *              ct      - caller context
3355  *              flags   - case flags
3356  *              mnt_ns  - Unused on FreeBSD
3357  *
3358  *      RETURN: 0 on success, error code on failure.
3359  *
3360  * Timestamps:
3361  *      dvp - ctime|mtime updated
3362  */
3363 int
3364 zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
3365     const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3366 {
3367         (void) flags;
3368         znode_t         *zp;
3369         dmu_tx_t        *tx;
3370         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3371         zilog_t         *zilog;
3372         uint64_t        len = strlen(link);
3373         int             error;
3374         zfs_acl_ids_t   acl_ids;
3375         boolean_t       fuid_dirtied;
3376         uint64_t        txtype = TX_SYMLINK;
3377
3378         ASSERT3S(vap->va_type, ==, VLNK);
3379
3380         if (is_nametoolong(zfsvfs, name))
3381                 return (SET_ERROR(ENAMETOOLONG));
3382
3383         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3384                 return (error);
3385         zilog = zfsvfs->z_log;
3386
3387         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3388             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3389                 zfs_exit(zfsvfs, FTAG);
3390                 return (SET_ERROR(EILSEQ));
3391         }
3392
3393         if (len > MAXPATHLEN) {
3394                 zfs_exit(zfsvfs, FTAG);
3395                 return (SET_ERROR(ENAMETOOLONG));
3396         }
3397
3398         if ((error = zfs_acl_ids_create(dzp, 0,
3399             vap, cr, NULL, &acl_ids, NULL)) != 0) {
3400                 zfs_exit(zfsvfs, FTAG);
3401                 return (error);
3402         }
3403
3404         /*
3405          * Attempt to lock directory; fail if entry already exists.
3406          */
3407         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
3408         if (error) {
3409                 zfs_acl_ids_free(&acl_ids);
3410                 zfs_exit(zfsvfs, FTAG);
3411                 return (error);
3412         }
3413
3414         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3415                 zfs_acl_ids_free(&acl_ids);
3416                 zfs_exit(zfsvfs, FTAG);
3417                 return (error);
3418         }
3419
3420         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
3421             0 /* projid */)) {
3422                 zfs_acl_ids_free(&acl_ids);
3423                 zfs_exit(zfsvfs, FTAG);
3424                 return (SET_ERROR(EDQUOT));
3425         }
3426
3427         getnewvnode_reserve();
3428         tx = dmu_tx_create(zfsvfs->z_os);
3429         fuid_dirtied = zfsvfs->z_fuid_dirty;
3430         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3431         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3432         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3433             ZFS_SA_BASE_ATTR_SIZE + len);
3434         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3435         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3436                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3437                     acl_ids.z_aclp->z_acl_bytes);
3438         }
3439         if (fuid_dirtied)
3440                 zfs_fuid_txhold(zfsvfs, tx);
3441         error = dmu_tx_assign(tx, TXG_WAIT);
3442         if (error) {
3443                 zfs_acl_ids_free(&acl_ids);
3444                 dmu_tx_abort(tx);
3445                 getnewvnode_drop_reserve();
3446                 zfs_exit(zfsvfs, FTAG);
3447                 return (error);
3448         }
3449
3450         /*
3451          * Create a new object for the symlink.
3452          * for version 4 ZPL datasets the symlink will be an SA attribute
3453          */
3454         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3455
3456         if (fuid_dirtied)
3457                 zfs_fuid_sync(zfsvfs, tx);
3458
3459         if (zp->z_is_sa)
3460                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3461                     __DECONST(void *, link), len, tx);
3462         else
3463                 zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
3464
3465         zp->z_size = len;
3466         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3467             &zp->z_size, sizeof (zp->z_size), tx);
3468         /*
3469          * Insert the new object into the directory.
3470          */
3471         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
3472         if (error != 0) {
3473                 zfs_znode_delete(zp, tx);
3474                 VOP_UNLOCK(ZTOV(zp));
3475                 zrele(zp);
3476         } else {
3477                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3478         }
3479
3480         zfs_acl_ids_free(&acl_ids);
3481
3482         dmu_tx_commit(tx);
3483
3484         getnewvnode_drop_reserve();
3485
3486         if (error == 0) {
3487                 *zpp = zp;
3488
3489                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3490                         zil_commit(zilog, 0);
3491         }
3492
3493         zfs_exit(zfsvfs, FTAG);
3494         return (error);
3495 }
3496
3497 /*
3498  * Return, in the buffer contained in the provided uio structure,
3499  * the symbolic path referred to by vp.
3500  *
3501  *      IN:     vp      - vnode of symbolic link.
3502  *              uio     - structure to contain the link path.
3503  *              cr      - credentials of caller.
3504  *              ct      - caller context
3505  *
3506  *      OUT:    uio     - structure containing the link path.
3507  *
3508  *      RETURN: 0 on success, error code on failure.
3509  *
3510  * Timestamps:
3511  *      vp - atime updated
3512  */
3513 static int
3514 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
3515 {
3516         (void) cr, (void) ct;
3517         znode_t         *zp = VTOZ(vp);
3518         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3519         int             error;
3520
3521         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3522                 return (error);
3523
3524         if (zp->z_is_sa)
3525                 error = sa_lookup_uio(zp->z_sa_hdl,
3526                     SA_ZPL_SYMLINK(zfsvfs), uio);
3527         else
3528                 error = zfs_sa_readlink(zp, uio);
3529
3530         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3531
3532         zfs_exit(zfsvfs, FTAG);
3533         return (error);
3534 }
3535
3536 /*
3537  * Insert a new entry into directory tdvp referencing svp.
3538  *
3539  *      IN:     tdvp    - Directory to contain new entry.
3540  *              svp     - vnode of new entry.
3541  *              name    - name of new entry.
3542  *              cr      - credentials of caller.
3543  *
3544  *      RETURN: 0 on success, error code on failure.
3545  *
3546  * Timestamps:
3547  *      tdvp - ctime|mtime updated
3548  *       svp - ctime updated
3549  */
3550 int
3551 zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
3552     int flags)
3553 {
3554         (void) flags;
3555         znode_t         *tzp;
3556         zfsvfs_t        *zfsvfs = tdzp->z_zfsvfs;
3557         zilog_t         *zilog;
3558         dmu_tx_t        *tx;
3559         int             error;
3560         uint64_t        parent;
3561         uid_t           owner;
3562
3563         ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);
3564
3565         if (is_nametoolong(zfsvfs, name))
3566                 return (SET_ERROR(ENAMETOOLONG));
3567
3568         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3569                 return (error);
3570         zilog = zfsvfs->z_log;
3571
3572         /*
3573          * POSIX dictates that we return EPERM here.
3574          * Better choices include ENOTSUP or EISDIR.
3575          */
3576         if (ZTOV(szp)->v_type == VDIR) {
3577                 zfs_exit(zfsvfs, FTAG);
3578                 return (SET_ERROR(EPERM));
3579         }
3580
3581         if ((error = zfs_verify_zp(szp)) != 0) {
3582                 zfs_exit(zfsvfs, FTAG);
3583                 return (error);
3584         }
3585
3586         /*
3587          * If we are using project inheritance, means if the directory has
3588          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3589          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3590          * such case, we only allow hard link creation in our tree when the
3591          * project IDs are the same.
3592          */
3593         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3594             tdzp->z_projid != szp->z_projid) {
3595                 zfs_exit(zfsvfs, FTAG);
3596                 return (SET_ERROR(EXDEV));
3597         }
3598
3599         if (szp->z_pflags & (ZFS_APPENDONLY |
3600             ZFS_IMMUTABLE | ZFS_READONLY)) {
3601                 zfs_exit(zfsvfs, FTAG);
3602                 return (SET_ERROR(EPERM));
3603         }
3604
3605         /* Prevent links to .zfs/shares files */
3606
3607         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3608             &parent, sizeof (uint64_t))) != 0) {
3609                 zfs_exit(zfsvfs, FTAG);
3610                 return (error);
3611         }
3612         if (parent == zfsvfs->z_shares_dir) {
3613                 zfs_exit(zfsvfs, FTAG);
3614                 return (SET_ERROR(EPERM));
3615         }
3616
3617         if (zfsvfs->z_utf8 && u8_validate(name,
3618             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3619                 zfs_exit(zfsvfs, FTAG);
3620                 return (SET_ERROR(EILSEQ));
3621         }
3622
3623         /*
3624          * We do not support links between attributes and non-attributes
3625          * because of the potential security risk of creating links
3626          * into "normal" file space in order to circumvent restrictions
3627          * imposed in attribute space.
3628          */
3629         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3630                 zfs_exit(zfsvfs, FTAG);
3631                 return (SET_ERROR(EINVAL));
3632         }
3633
3634
3635         owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3636         if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
3637                 zfs_exit(zfsvfs, FTAG);
3638                 return (SET_ERROR(EPERM));
3639         }
3640
3641         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) {
3642                 zfs_exit(zfsvfs, FTAG);
3643                 return (error);
3644         }
3645
3646         /*
3647          * Attempt to lock directory; fail if entry already exists.
3648          */
3649         error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
3650         if (error) {
3651                 zfs_exit(zfsvfs, FTAG);
3652                 return (error);
3653         }
3654
3655         tx = dmu_tx_create(zfsvfs->z_os);
3656         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3657         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3658         zfs_sa_upgrade_txholds(tx, szp);
3659         zfs_sa_upgrade_txholds(tx, tdzp);
3660         error = dmu_tx_assign(tx, TXG_WAIT);
3661         if (error) {
3662                 dmu_tx_abort(tx);
3663                 zfs_exit(zfsvfs, FTAG);
3664                 return (error);
3665         }
3666
3667         error = zfs_link_create(tdzp, name, szp, tx, 0);
3668
3669         if (error == 0) {
3670                 uint64_t txtype = TX_LINK;
3671                 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3672         }
3673
3674         dmu_tx_commit(tx);
3675
3676         if (error == 0) {
3677                 vnevent_link(ZTOV(szp), ct);
3678         }
3679
3680         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3681                 zil_commit(zilog, 0);
3682
3683         zfs_exit(zfsvfs, FTAG);
3684         return (error);
3685 }
3686
3687 /*
3688  * Free or allocate space in a file.  Currently, this function only
3689  * supports the `F_FREESP' command.  However, this command is somewhat
3690  * misnamed, as its functionality includes the ability to allocate as
3691  * well as free space.
3692  *
3693  *      IN:     ip      - inode of file to free data in.
3694  *              cmd     - action to take (only F_FREESP supported).
3695  *              bfp     - section of file to free/alloc.
3696  *              flag    - current file open mode flags.
3697  *              offset  - current file offset.
3698  *              cr      - credentials of caller.
3699  *
3700  *      RETURN: 0 on success, error code on failure.
3701  *
3702  * Timestamps:
3703  *      ip - ctime|mtime updated
3704  */
3705 int
3706 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3707     offset_t offset, cred_t *cr)
3708 {
3709         (void) offset;
3710         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3711         uint64_t        off, len;
3712         int             error;
3713
3714         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3715                 return (error);
3716
3717         if (cmd != F_FREESP) {
3718                 zfs_exit(zfsvfs, FTAG);
3719                 return (SET_ERROR(EINVAL));
3720         }
3721
3722         /*
3723          * Callers might not be able to detect properly that we are read-only,
3724          * so check it explicitly here.
3725          */
3726         if (zfs_is_readonly(zfsvfs)) {
3727                 zfs_exit(zfsvfs, FTAG);
3728                 return (SET_ERROR(EROFS));
3729         }
3730
3731         if (bfp->l_len < 0) {
3732                 zfs_exit(zfsvfs, FTAG);
3733                 return (SET_ERROR(EINVAL));
3734         }
3735
3736         /*
3737          * Permissions aren't checked on Solaris because on this OS
3738          * zfs_space() can only be called with an opened file handle.
3739          * On Linux we can get here through truncate_range() which
3740          * operates directly on inodes, so we need to check access rights.
3741          */
3742         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) {
3743                 zfs_exit(zfsvfs, FTAG);
3744                 return (error);
3745         }
3746
3747         off = bfp->l_start;
3748         len = bfp->l_len; /* 0 means from off to end of file */
3749
3750         error = zfs_freesp(zp, off, len, flag, TRUE);
3751
3752         zfs_exit(zfsvfs, FTAG);
3753         return (error);
3754 }
3755
3756 static void
3757 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3758 {
3759         (void) cr, (void) ct;
3760         znode_t *zp = VTOZ(vp);
3761         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3762         int error;
3763
3764         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
3765         if (zp->z_sa_hdl == NULL) {
3766                 /*
3767                  * The fs has been unmounted, or we did a
3768                  * suspend/resume and this file no longer exists.
3769                  */
3770                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3771                 vrecycle(vp);
3772                 return;
3773         }
3774
3775         if (zp->z_unlinked) {
3776                 /*
3777                  * Fast path to recycle a vnode of a removed file.
3778                  */
3779                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3780                 vrecycle(vp);
3781                 return;
3782         }
3783
3784         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3785                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3786
3787                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3788                 zfs_sa_upgrade_txholds(tx, zp);
3789                 error = dmu_tx_assign(tx, TXG_WAIT);
3790                 if (error) {
3791                         dmu_tx_abort(tx);
3792                 } else {
3793                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3794                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
3795                         zp->z_atime_dirty = 0;
3796                         dmu_tx_commit(tx);
3797                 }
3798         }
3799         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3800 }
3801
3802
3803 _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
3804         "struct zfid_short bigger than struct fid");
3805 _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
3806         "struct zfid_long bigger than struct fid");
3807
3808 static int
3809 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3810 {
3811         (void) ct;
3812         znode_t         *zp = VTOZ(vp);
3813         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3814         uint32_t        gen;
3815         uint64_t        gen64;
3816         uint64_t        object = zp->z_id;
3817         zfid_short_t    *zfid;
3818         int             size, i, error;
3819
3820         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3821                 return (error);
3822
3823         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3824             &gen64, sizeof (uint64_t))) != 0) {
3825                 zfs_exit(zfsvfs, FTAG);
3826                 return (error);
3827         }
3828
3829         gen = (uint32_t)gen64;
3830
3831         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3832         fidp->fid_len = size;
3833
3834         zfid = (zfid_short_t *)fidp;
3835
3836         zfid->zf_len = size;
3837
3838         for (i = 0; i < sizeof (zfid->zf_object); i++)
3839                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3840
3841         /* Must have a non-zero generation number to distinguish from .zfs */
3842         if (gen == 0)
3843                 gen = 1;
3844         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3845                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3846
3847         if (size == LONG_FID_LEN) {
3848                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
3849                 zfid_long_t     *zlfid;
3850
3851                 zlfid = (zfid_long_t *)fidp;
3852
3853                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3854                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3855
3856                 /* XXX - this should be the generation number for the objset */
3857                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3858                         zlfid->zf_setgen[i] = 0;
3859         }
3860
3861         zfs_exit(zfsvfs, FTAG);
3862         return (0);
3863 }
3864
3865 static int
3866 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3867     caller_context_t *ct)
3868 {
3869         znode_t *zp;
3870         zfsvfs_t *zfsvfs;
3871         int error;
3872
3873         switch (cmd) {
3874         case _PC_LINK_MAX:
3875                 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
3876                 return (0);
3877
3878         case _PC_FILESIZEBITS:
3879                 *valp = 64;
3880                 return (0);
3881         case _PC_MIN_HOLE_SIZE:
3882                 *valp = (int)SPA_MINBLOCKSIZE;
3883                 return (0);
3884         case _PC_ACL_EXTENDED:
3885 #if 0           /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
3886                 zp = VTOZ(vp);
3887                 zfsvfs = zp->z_zfsvfs;
3888                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3889                         return (error);
3890                 *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
3891                 zfs_exit(zfsvfs, FTAG);
3892 #else
3893                 *valp = 0;
3894 #endif
3895                 return (0);
3896
3897         case _PC_ACL_NFS4:
3898                 zp = VTOZ(vp);
3899                 zfsvfs = zp->z_zfsvfs;
3900                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3901                         return (error);
3902                 *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
3903                 zfs_exit(zfsvfs, FTAG);
3904                 return (0);
3905
3906         case _PC_ACL_PATH_MAX:
3907                 *valp = ACL_MAX_ENTRIES;
3908                 return (0);
3909
3910         default:
3911                 return (EOPNOTSUPP);
3912         }
3913 }
3914
3915 static int
3916 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
3917     int *rahead)
3918 {
3919         znode_t *zp = VTOZ(vp);
3920         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3921         zfs_locked_range_t *lr;
3922         vm_object_t object;
3923         off_t start, end, obj_size;
3924         uint_t blksz;
3925         int pgsin_b, pgsin_a;
3926         int error;
3927
3928         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
3929                 return (zfs_vm_pagerret_error);
3930
3931         start = IDX_TO_OFF(ma[0]->pindex);
3932         end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
3933
3934         /*
3935          * Lock a range covering all required and optional pages.
3936          * Note that we need to handle the case of the block size growing.
3937          */
3938         for (;;) {
3939                 blksz = zp->z_blksz;
3940                 lr = zfs_rangelock_tryenter(&zp->z_rangelock,
3941                     rounddown(start, blksz),
3942                     roundup(end, blksz) - rounddown(start, blksz), RL_READER);
3943                 if (lr == NULL) {
3944                         if (rahead != NULL) {
3945                                 *rahead = 0;
3946                                 rahead = NULL;
3947                         }
3948                         if (rbehind != NULL) {
3949                                 *rbehind = 0;
3950                                 rbehind = NULL;
3951                         }
3952                         break;
3953                 }
3954                 if (blksz == zp->z_blksz)
3955                         break;
3956                 zfs_rangelock_exit(lr);
3957         }
3958
3959         object = ma[0]->object;
3960         zfs_vmobject_wlock(object);
3961         obj_size = object->un_pager.vnp.vnp_size;
3962         zfs_vmobject_wunlock(object);
3963         if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
3964                 if (lr != NULL)
3965                         zfs_rangelock_exit(lr);
3966                 zfs_exit(zfsvfs, FTAG);
3967                 return (zfs_vm_pagerret_bad);
3968         }
3969
3970         pgsin_b = 0;
3971         if (rbehind != NULL) {
3972                 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
3973                 pgsin_b = MIN(*rbehind, pgsin_b);
3974         }
3975
3976         pgsin_a = 0;
3977         if (rahead != NULL) {
3978                 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
3979                 if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
3980                         pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
3981                 pgsin_a = MIN(*rahead, pgsin_a);
3982         }
3983
3984         /*
3985          * NB: we need to pass the exact byte size of the data that we expect
3986          * to read after accounting for the file size.  This is required because
3987          * ZFS will panic if we request DMU to read beyond the end of the last
3988          * allocated block.
3989          */
3990         error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
3991             &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
3992
3993         if (lr != NULL)
3994                 zfs_rangelock_exit(lr);
3995         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3996
3997         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
3998
3999         zfs_exit(zfsvfs, FTAG);
4000
4001         if (error != 0)
4002                 return (zfs_vm_pagerret_error);
4003
4004         VM_CNT_INC(v_vnodein);
4005         VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
4006         if (rbehind != NULL)
4007                 *rbehind = pgsin_b;
4008         if (rahead != NULL)
4009                 *rahead = pgsin_a;
4010         return (zfs_vm_pagerret_ok);
4011 }
4012
4013 #ifndef _SYS_SYSPROTO_H_
4014 struct vop_getpages_args {
4015         struct vnode *a_vp;
4016         vm_page_t *a_m;
4017         int a_count;
4018         int *a_rbehind;
4019         int *a_rahead;
4020 };
4021 #endif
4022
4023 static int
4024 zfs_freebsd_getpages(struct vop_getpages_args *ap)
4025 {
4026
4027         return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4028             ap->a_rahead));
4029 }
4030
4031 static int
4032 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4033     int *rtvals)
4034 {
4035         znode_t         *zp = VTOZ(vp);
4036         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4037         zfs_locked_range_t              *lr;
4038         dmu_tx_t        *tx;
4039         struct sf_buf   *sf;
4040         vm_object_t     object;
4041         vm_page_t       m;
4042         caddr_t         va;
4043         size_t          tocopy;
4044         size_t          lo_len;
4045         vm_ooffset_t    lo_off;
4046         vm_ooffset_t    off;
4047         uint_t          blksz;
4048         int             ncount;
4049         int             pcount;
4050         int             err;
4051         int             i;
4052
4053         object = vp->v_object;
4054         KASSERT(ma[0]->object == object, ("mismatching object"));
4055         KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4056
4057         pcount = btoc(len);
4058         ncount = pcount;
4059         for (i = 0; i < pcount; i++)
4060                 rtvals[i] = zfs_vm_pagerret_error;
4061
4062         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
4063                 return (zfs_vm_pagerret_error);
4064
4065         off = IDX_TO_OFF(ma[0]->pindex);
4066         blksz = zp->z_blksz;
4067         lo_off = rounddown(off, blksz);
4068         lo_len = roundup(len + (off - lo_off), blksz);
4069         lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
4070
4071         zfs_vmobject_wlock(object);
4072         if (len + off > object->un_pager.vnp.vnp_size) {
4073                 if (object->un_pager.vnp.vnp_size > off) {
4074                         int pgoff;
4075
4076                         len = object->un_pager.vnp.vnp_size - off;
4077                         ncount = btoc(len);
4078                         if ((pgoff = (int)len & PAGE_MASK) != 0) {
4079                                 /*
4080                                  * If the object is locked and the following
4081                                  * conditions hold, then the page's dirty
4082                                  * field cannot be concurrently changed by a
4083                                  * pmap operation.
4084                                  */
4085                                 m = ma[ncount - 1];
4086                                 vm_page_assert_sbusied(m);
4087                                 KASSERT(!pmap_page_is_write_mapped(m),
4088                                     ("zfs_putpages: page %p is not read-only",
4089                                     m));
4090                                 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4091                                     pgoff);
4092                         }
4093                 } else {
4094                         len = 0;
4095                         ncount = 0;
4096                 }
4097                 if (ncount < pcount) {
4098                         for (i = ncount; i < pcount; i++) {
4099                                 rtvals[i] = zfs_vm_pagerret_bad;
4100                         }
4101                 }
4102         }
4103         zfs_vmobject_wunlock(object);
4104
4105         boolean_t commit = (flags & (zfs_vm_pagerput_sync |
4106             zfs_vm_pagerput_inval)) != 0 ||
4107             zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;
4108
4109         if (ncount == 0)
4110                 goto out;
4111
4112         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
4113             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
4114             (zp->z_projid != ZFS_DEFAULT_PROJID &&
4115             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4116             zp->z_projid))) {
4117                 goto out;
4118         }
4119
4120         tx = dmu_tx_create(zfsvfs->z_os);
4121         dmu_tx_hold_write(tx, zp->z_id, off, len);
4122
4123         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4124         zfs_sa_upgrade_txholds(tx, zp);
4125         err = dmu_tx_assign(tx, TXG_WAIT);
4126         if (err != 0) {
4127                 dmu_tx_abort(tx);
4128                 goto out;
4129         }
4130
4131         if (zp->z_blksz < PAGE_SIZE) {
4132                 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4133                         tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4134                         va = zfs_map_page(ma[i], &sf);
4135                         dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4136                         zfs_unmap_page(sf);
4137                 }
4138         } else {
4139                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4140         }
4141
4142         if (err == 0) {
4143                 uint64_t mtime[2], ctime[2];
4144                 sa_bulk_attr_t bulk[3];
4145                 int count = 0;
4146
4147                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4148                     &mtime, 16);
4149                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4150                     &ctime, 16);
4151                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4152                     &zp->z_pflags, 8);
4153                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
4154                 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4155                 ASSERT0(err);
4156                 /*
4157                  * XXX we should be passing a callback to undirty
4158                  * but that would make the locking messier
4159                  */
4160                 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
4161                     len, commit, B_FALSE, NULL, NULL);
4162
4163                 zfs_vmobject_wlock(object);
4164                 for (i = 0; i < ncount; i++) {
4165                         rtvals[i] = zfs_vm_pagerret_ok;
4166                         vm_page_undirty(ma[i]);
4167                 }
4168                 zfs_vmobject_wunlock(object);
4169                 VM_CNT_INC(v_vnodeout);
4170                 VM_CNT_ADD(v_vnodepgsout, ncount);
4171         }
4172         dmu_tx_commit(tx);
4173
4174 out:
4175         zfs_rangelock_exit(lr);
4176         if (commit)
4177                 zil_commit(zfsvfs->z_log, zp->z_id);
4178
4179         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
4180
4181         zfs_exit(zfsvfs, FTAG);
4182         return (rtvals[0]);
4183 }
4184
4185 #ifndef _SYS_SYSPROTO_H_
4186 struct vop_putpages_args {
4187         struct vnode *a_vp;
4188         vm_page_t *a_m;
4189         int a_count;
4190         int a_sync;
4191         int *a_rtvals;
4192 };
4193 #endif
4194
4195 static int
4196 zfs_freebsd_putpages(struct vop_putpages_args *ap)
4197 {
4198
4199         return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4200             ap->a_rtvals));
4201 }
4202
4203 #ifndef _SYS_SYSPROTO_H_
4204 struct vop_bmap_args {
4205         struct vnode *a_vp;
4206         daddr_t  a_bn;
4207         struct bufobj **a_bop;
4208         daddr_t *a_bnp;
4209         int *a_runp;
4210         int *a_runb;
4211 };
4212 #endif
4213
4214 static int
4215 zfs_freebsd_bmap(struct vop_bmap_args *ap)
4216 {
4217
4218         if (ap->a_bop != NULL)
4219                 *ap->a_bop = &ap->a_vp->v_bufobj;
4220         if (ap->a_bnp != NULL)
4221                 *ap->a_bnp = ap->a_bn;
4222         if (ap->a_runp != NULL)
4223                 *ap->a_runp = 0;
4224         if (ap->a_runb != NULL)
4225                 *ap->a_runb = 0;
4226
4227         return (0);
4228 }
4229
4230 #ifndef _SYS_SYSPROTO_H_
4231 struct vop_open_args {
4232         struct vnode *a_vp;
4233         int a_mode;
4234         struct ucred *a_cred;
4235         struct thread *a_td;
4236 };
4237 #endif
4238
4239 static int
4240 zfs_freebsd_open(struct vop_open_args *ap)
4241 {
4242         vnode_t *vp = ap->a_vp;
4243         znode_t *zp = VTOZ(vp);
4244         int error;
4245
4246         error = zfs_open(&vp, ap->a_mode, ap->a_cred);
4247         if (error == 0)
4248                 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4249         return (error);
4250 }
4251
#ifndef _SYS_SYSPROTO_H_
/* Argument layout read by zfs_freebsd_close(). */
struct vop_close_args {
	struct vnode *a_vp;
	int  a_fflag;
	struct ucred *a_cred;
	struct thread *a_td;
};
#endif

/*
 * Adapter around zfs_close(): passes the vnode, file flags and
 * credential from the argument structure, with the constants 1 and 0 as
 * the third and fourth arguments.
 */
static int
zfs_freebsd_close(struct vop_close_args *ap)
{
	int rv;

	rv = zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred);
	return (rv);
}
4267
4268 #ifndef _SYS_SYSPROTO_H_
4269 struct vop_ioctl_args {
4270         struct vnode *a_vp;
4271         ulong_t a_command;
4272         caddr_t a_data;
4273         int a_fflag;
4274         struct ucred *cred;
4275         struct thread *td;
4276 };
4277 #endif
4278
4279 static int
4280 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
4281 {
4282
4283         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4284             ap->a_fflag, ap->a_cred, NULL));
4285 }
4286
4287 static int
4288 ioflags(int ioflags)
4289 {
4290         int flags = 0;
4291
4292         if (ioflags & IO_APPEND)
4293                 flags |= O_APPEND;
4294         if (ioflags & IO_NDELAY)
4295                 flags |= O_NONBLOCK;
4296         if (ioflags & IO_DIRECT)
4297                 flags |= O_DIRECT;
4298         if (ioflags & IO_SYNC)
4299                 flags |= O_SYNC;
4300
4301         return (flags);
4302 }
4303
4304 #ifndef _SYS_SYSPROTO_H_
4305 struct vop_read_args {
4306         struct vnode *a_vp;
4307         struct uio *a_uio;
4308         int a_ioflag;
4309         struct ucred *a_cred;
4310 };
4311 #endif
4312
4313 static int
4314 zfs_freebsd_read(struct vop_read_args *ap)
4315 {
4316         zfs_uio_t uio;
4317         int error = 0;
4318         zfs_uio_init(&uio, ap->a_uio);
4319         error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4320             ap->a_cred);
4321         /*
4322          * XXX We occasionally get an EFAULT for Direct I/O reads on
4323          * FreeBSD 13. This still needs to be resolved. The EFAULT comes
4324          * from:
4325          * zfs_uio_get__dio_pages_alloc() ->
4326          * zfs_uio_get_dio_pages_impl() ->
4327          * zfs_uio_iov_step() ->
4328          * zfs_uio_get_user_pages().
4329          * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
4330          * read fails to map in the user pages (returning EFAULT) the
4331          * Direct I/O request is broken up into two separate IO requests
4332          * and issued separately using Direct I/O.
4333          */
4334 #ifdef ZFS_DEBUG
4335         if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
4336 #if 0
4337                 printf("%s(%d): Direct I/O read returning EFAULT "
4338                     "uio = %p, zfs_uio_offset(uio) = %lu "
4339                     "zfs_uio_resid(uio) = %lu\n",
4340                     __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
4341                     zfs_uio_resid(&uio));
4342 #endif
4343         }
4344
4345 #endif
4346         return (error);
4347 }
4348
4349 #ifndef _SYS_SYSPROTO_H_
4350 struct vop_write_args {
4351         struct vnode *a_vp;
4352         struct uio *a_uio;
4353         int a_ioflag;
4354         struct ucred *a_cred;
4355 };
4356 #endif
4357
4358 static int
4359 zfs_freebsd_write(struct vop_write_args *ap)
4360 {
4361         zfs_uio_t uio;
4362         zfs_uio_init(&uio, ap->a_uio);
4363         return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4364             ap->a_cred));
4365 }
4366
4367 /*
4368  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
4369  * the comment above cache_fplookup for details.
4370  */
4371 static int
4372 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
4373 {
4374         vnode_t *vp;
4375         znode_t *zp;
4376         uint64_t pflags;
4377
4378         vp = v->a_vp;
4379         zp = VTOZ_SMR(vp);
4380         if (__predict_false(zp == NULL))
4381                 return (EAGAIN);
4382         pflags = atomic_load_64(&zp->z_pflags);
4383         if (pflags & ZFS_AV_QUARANTINED)
4384                 return (EAGAIN);
4385         if (pflags & ZFS_XATTR)
4386                 return (EAGAIN);
4387         if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
4388                 return (EAGAIN);
4389         return (0);
4390 }
4391
4392 static int
4393 zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
4394 {
4395         vnode_t *vp;
4396         znode_t *zp;
4397         char *target;
4398
4399         vp = v->a_vp;
4400         zp = VTOZ_SMR(vp);
4401         if (__predict_false(zp == NULL)) {
4402                 return (EAGAIN);
4403         }
4404
4405         target = atomic_load_consume_ptr(&zp->z_cached_symlink);
4406         if (target == NULL) {
4407                 return (EAGAIN);
4408         }
4409         return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
4410 }
4411
4412 #ifndef _SYS_SYSPROTO_H_
4413 struct vop_access_args {
4414         struct vnode *a_vp;
4415         accmode_t a_accmode;
4416         struct ucred *a_cred;
4417         struct thread *a_td;
4418 };
4419 #endif
4420
4421 static int
4422 zfs_freebsd_access(struct vop_access_args *ap)
4423 {
4424         vnode_t *vp = ap->a_vp;
4425         znode_t *zp = VTOZ(vp);
4426         accmode_t accmode;
4427         int error = 0;
4428
4429
4430         if (ap->a_accmode == VEXEC) {
4431                 if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
4432                         return (0);
4433         }
4434
4435         /*
4436          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4437          */
4438         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4439         if (accmode != 0)
4440                 error = zfs_access(zp, accmode, 0, ap->a_cred);
4441
4442         /*
4443          * VADMIN has to be handled by vaccess().
4444          */
4445         if (error == 0) {
4446                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4447                 if (accmode != 0) {
4448                         error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4449                             zp->z_gid, accmode, ap->a_cred);
4450                 }
4451         }
4452
4453         /*
4454          * For VEXEC, ensure that at least one execute bit is set for
4455          * non-directories.
4456          */
4457         if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4458             (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4459                 error = EACCES;
4460         }
4461
4462         return (error);
4463 }
4464
4465 #ifndef _SYS_SYSPROTO_H_
4466 struct vop_lookup_args {
4467         struct vnode *a_dvp;
4468         struct vnode **a_vpp;
4469         struct componentname *a_cnp;
4470 };
4471 #endif
4472
4473 static int
4474 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
4475 {
4476         struct componentname *cnp = ap->a_cnp;
4477         char nm[NAME_MAX + 1];
4478
4479         ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
4480         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
4481
4482         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4483             cnp->cn_cred, 0, cached));
4484 }
4485
4486 static int
4487 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
4488 {
4489
4490         return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
4491 }
4492
4493 #ifndef _SYS_SYSPROTO_H_
4494 struct vop_lookup_args {
4495         struct vnode *a_dvp;
4496         struct vnode **a_vpp;
4497         struct componentname *a_cnp;
4498 };
4499 #endif
4500
4501 static int
4502 zfs_cache_lookup(struct vop_lookup_args *ap)
4503 {
4504         zfsvfs_t *zfsvfs;
4505
4506         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4507         if (zfsvfs->z_use_namecache)
4508                 return (vfs_cache_lookup(ap));
4509         else
4510                 return (zfs_freebsd_lookup(ap, B_FALSE));
4511 }
4512
4513 #ifndef _SYS_SYSPROTO_H_
4514 struct vop_create_args {
4515         struct vnode *a_dvp;
4516         struct vnode **a_vpp;
4517         struct componentname *a_cnp;
4518         struct vattr *a_vap;
4519 };
4520 #endif
4521
4522 static int
4523 zfs_freebsd_create(struct vop_create_args *ap)
4524 {
4525         zfsvfs_t *zfsvfs;
4526         struct componentname *cnp = ap->a_cnp;
4527         vattr_t *vap = ap->a_vap;
4528         znode_t *zp = NULL;
4529         int rc, mode;
4530
4531 #if __FreeBSD_version < 1400068
4532         ASSERT(cnp->cn_flags & SAVENAME);
4533 #endif
4534
4535         vattr_init_mask(vap);
4536         mode = vap->va_mode & ALLPERMS;
4537         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4538         *ap->a_vpp = NULL;
4539
4540         rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
4541             &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
4542         if (rc == 0)
4543                 *ap->a_vpp = ZTOV(zp);
4544         if (zfsvfs->z_use_namecache &&
4545             rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4546                 cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4547
4548         return (rc);
4549 }
4550
4551 #ifndef _SYS_SYSPROTO_H_
4552 struct vop_remove_args {
4553         struct vnode *a_dvp;
4554         struct vnode *a_vp;
4555         struct componentname *a_cnp;
4556 };
4557 #endif
4558
4559 static int
4560 zfs_freebsd_remove(struct vop_remove_args *ap)
4561 {
4562
4563 #if __FreeBSD_version < 1400068
4564         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4565 #endif
4566
4567         return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
4568             ap->a_cnp->cn_cred));
4569 }
4570
4571 #ifndef _SYS_SYSPROTO_H_
4572 struct vop_mkdir_args {
4573         struct vnode *a_dvp;
4574         struct vnode **a_vpp;
4575         struct componentname *a_cnp;
4576         struct vattr *a_vap;
4577 };
4578 #endif
4579
4580 static int
4581 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
4582 {
4583         vattr_t *vap = ap->a_vap;
4584         znode_t *zp = NULL;
4585         int rc;
4586
4587 #if __FreeBSD_version < 1400068
4588         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4589 #endif
4590
4591         vattr_init_mask(vap);
4592         *ap->a_vpp = NULL;
4593
4594         rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
4595             ap->a_cnp->cn_cred, 0, NULL, NULL);
4596
4597         if (rc == 0)
4598                 *ap->a_vpp = ZTOV(zp);
4599         return (rc);
4600 }
4601
4602 #ifndef _SYS_SYSPROTO_H_
4603 struct vop_rmdir_args {
4604         struct vnode *a_dvp;
4605         struct vnode *a_vp;
4606         struct componentname *a_cnp;
4607 };
4608 #endif
4609
4610 static int
4611 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
4612 {
4613         struct componentname *cnp = ap->a_cnp;
4614
4615 #if __FreeBSD_version < 1400068
4616         ASSERT(cnp->cn_flags & SAVENAME);
4617 #endif
4618
4619         return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
4620 }
4621
4622 #ifndef _SYS_SYSPROTO_H_
4623 struct vop_readdir_args {
4624         struct vnode *a_vp;
4625         struct uio *a_uio;
4626         struct ucred *a_cred;
4627         int *a_eofflag;
4628         int *a_ncookies;
4629         cookie_t **a_cookies;
4630 };
4631 #endif
4632
4633 static int
4634 zfs_freebsd_readdir(struct vop_readdir_args *ap)
4635 {
4636         zfs_uio_t uio;
4637         zfs_uio_init(&uio, ap->a_uio);
4638         return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
4639             ap->a_ncookies, ap->a_cookies));
4640 }
4641
4642 #ifndef _SYS_SYSPROTO_H_
4643 struct vop_fsync_args {
4644         struct vnode *a_vp;
4645         int a_waitfor;
4646         struct thread *a_td;
4647 };
4648 #endif
4649
4650 static int
4651 zfs_freebsd_fsync(struct vop_fsync_args *ap)
4652 {
4653
4654         return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
4655 }
4656
4657 #ifndef _SYS_SYSPROTO_H_
4658 struct vop_getattr_args {
4659         struct vnode *a_vp;
4660         struct vattr *a_vap;
4661         struct ucred *a_cred;
4662 };
4663 #endif
4664
4665 static int
4666 zfs_freebsd_getattr(struct vop_getattr_args *ap)
4667 {
4668         vattr_t *vap = ap->a_vap;
4669         xvattr_t xvap;
4670         ulong_t fflags = 0;
4671         int error;
4672
4673         xva_init(&xvap);
4674         xvap.xva_vattr = *vap;
4675         xvap.xva_vattr.va_mask |= AT_XVATTR;
4676
4677         /* Convert chflags into ZFS-type flags. */
4678         /* XXX: what about SF_SETTABLE?. */
4679         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4680         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4681         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4682         XVA_SET_REQ(&xvap, XAT_NODUMP);
4683         XVA_SET_REQ(&xvap, XAT_READONLY);
4684         XVA_SET_REQ(&xvap, XAT_ARCHIVE);
4685         XVA_SET_REQ(&xvap, XAT_SYSTEM);
4686         XVA_SET_REQ(&xvap, XAT_HIDDEN);
4687         XVA_SET_REQ(&xvap, XAT_REPARSE);
4688         XVA_SET_REQ(&xvap, XAT_OFFLINE);
4689         XVA_SET_REQ(&xvap, XAT_SPARSE);
4690
4691         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
4692         if (error != 0)
4693                 return (error);
4694
4695         /* Convert ZFS xattr into chflags. */
4696 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
4697         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
4698                 fflags |= (fflag);                                      \
4699 } while (0)
4700         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4701             xvap.xva_xoptattrs.xoa_immutable);
4702         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4703             xvap.xva_xoptattrs.xoa_appendonly);
4704         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4705             xvap.xva_xoptattrs.xoa_nounlink);
4706         FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
4707             xvap.xva_xoptattrs.xoa_archive);
4708         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4709             xvap.xva_xoptattrs.xoa_nodump);
4710         FLAG_CHECK(UF_READONLY, XAT_READONLY,
4711             xvap.xva_xoptattrs.xoa_readonly);
4712         FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
4713             xvap.xva_xoptattrs.xoa_system);
4714         FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
4715             xvap.xva_xoptattrs.xoa_hidden);
4716         FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
4717             xvap.xva_xoptattrs.xoa_reparse);
4718         FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
4719             xvap.xva_xoptattrs.xoa_offline);
4720         FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
4721             xvap.xva_xoptattrs.xoa_sparse);
4722
4723 #undef  FLAG_CHECK
4724         *vap = xvap.xva_vattr;
4725         vap->va_flags = fflags;
4726         return (0);
4727 }
4728
4729 #ifndef _SYS_SYSPROTO_H_
4730 struct vop_setattr_args {
4731         struct vnode *a_vp;
4732         struct vattr *a_vap;
4733         struct ucred *a_cred;
4734 };
4735 #endif
4736
4737 static int
4738 zfs_freebsd_setattr(struct vop_setattr_args *ap)
4739 {
4740         vnode_t *vp = ap->a_vp;
4741         vattr_t *vap = ap->a_vap;
4742         cred_t *cred = ap->a_cred;
4743         xvattr_t xvap;
4744         ulong_t fflags;
4745         uint64_t zflags;
4746
4747         vattr_init_mask(vap);
4748         vap->va_mask &= ~AT_NOSET;
4749
4750         xva_init(&xvap);
4751         xvap.xva_vattr = *vap;
4752
4753         zflags = VTOZ(vp)->z_pflags;
4754
4755         if (vap->va_flags != VNOVAL) {
4756                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
4757                 int error;
4758
4759                 if (zfsvfs->z_use_fuids == B_FALSE)
4760                         return (EOPNOTSUPP);
4761
4762                 fflags = vap->va_flags;
4763                 /*
4764                  * XXX KDM
4765                  * We need to figure out whether it makes sense to allow
4766                  * UF_REPARSE through, since we don't really have other
4767                  * facilities to handle reparse points and zfs_setattr()
4768                  * doesn't currently allow setting that attribute anyway.
4769                  */
4770                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
4771                     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
4772                     UF_OFFLINE|UF_SPARSE)) != 0)
4773                         return (EOPNOTSUPP);
4774                 /*
4775                  * Unprivileged processes are not permitted to unset system
4776                  * flags, or modify flags if any system flags are set.
4777                  * Privileged non-jail processes may not modify system flags
4778                  * if securelevel > 0 and any existing system flags are set.
4779                  * Privileged jail processes behave like privileged non-jail
4780                  * processes if the PR_ALLOW_CHFLAGS permission bit is set;
4781                  * otherwise, they behave like unprivileged processes.
4782                  */
4783                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
4784                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
4785                         if (zflags &
4786                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4787                                 error = securelevel_gt(cred, 0);
4788                                 if (error != 0)
4789                                         return (error);
4790                         }
4791                 } else {
4792                         /*
4793                          * Callers may only modify the file flags on
4794                          * objects they have VADMIN rights for.
4795                          */
4796                         if ((error = VOP_ACCESS(vp, VADMIN, cred,
4797                             curthread)) != 0)
4798                                 return (error);
4799                         if (zflags &
4800                             (ZFS_IMMUTABLE | ZFS_APPENDONLY |
4801                             ZFS_NOUNLINK)) {
4802                                 return (EPERM);
4803                         }
4804                         if (fflags &
4805                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4806                                 return (EPERM);
4807                         }
4808                 }
4809
4810 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
4811         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
4812             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
4813                 XVA_SET_REQ(&xvap, (xflag));                            \
4814                 (xfield) = ((fflags & (fflag)) != 0);                   \
4815         }                                                               \
4816 } while (0)
4817                 /* Convert chflags into ZFS-type flags. */
4818                 /* XXX: what about SF_SETTABLE?. */
4819                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4820                     xvap.xva_xoptattrs.xoa_immutable);
4821                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4822                     xvap.xva_xoptattrs.xoa_appendonly);
4823                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4824                     xvap.xva_xoptattrs.xoa_nounlink);
4825                 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
4826                     xvap.xva_xoptattrs.xoa_archive);
4827                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4828                     xvap.xva_xoptattrs.xoa_nodump);
4829                 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
4830                     xvap.xva_xoptattrs.xoa_readonly);
4831                 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
4832                     xvap.xva_xoptattrs.xoa_system);
4833                 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
4834                     xvap.xva_xoptattrs.xoa_hidden);
4835                 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
4836                     xvap.xva_xoptattrs.xoa_reparse);
4837                 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
4838                     xvap.xva_xoptattrs.xoa_offline);
4839                 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
4840                     xvap.xva_xoptattrs.xoa_sparse);
4841 #undef  FLAG_CHANGE
4842         }
4843         if (vap->va_birthtime.tv_sec != VNOVAL) {
4844                 xvap.xva_vattr.va_mask |= AT_XVATTR;
4845                 XVA_SET_REQ(&xvap, XAT_CREATETIME);
4846         }
4847         return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
4848 }
4849
4850 #ifndef _SYS_SYSPROTO_H_
4851 struct vop_rename_args {
4852         struct vnode *a_fdvp;
4853         struct vnode *a_fvp;
4854         struct componentname *a_fcnp;
4855         struct vnode *a_tdvp;
4856         struct vnode *a_tvp;
4857         struct componentname *a_tcnp;
4858 };
4859 #endif
4860
4861 static int
4862 zfs_freebsd_rename(struct vop_rename_args *ap)
4863 {
4864         vnode_t *fdvp = ap->a_fdvp;
4865         vnode_t *fvp = ap->a_fvp;
4866         vnode_t *tdvp = ap->a_tdvp;
4867         vnode_t *tvp = ap->a_tvp;
4868         int error;
4869
4870 #if __FreeBSD_version < 1400068
4871         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4872         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4873 #endif
4874
4875         error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
4876             ap->a_tcnp, ap->a_fcnp->cn_cred);
4877
4878         vrele(fdvp);
4879         vrele(fvp);
4880         vrele(tdvp);
4881         if (tvp != NULL)
4882                 vrele(tvp);
4883
4884         return (error);
4885 }
4886
4887 #ifndef _SYS_SYSPROTO_H_
4888 struct vop_symlink_args {
4889         struct vnode *a_dvp;
4890         struct vnode **a_vpp;
4891         struct componentname *a_cnp;
4892         struct vattr *a_vap;
4893         char *a_target;
4894 };
4895 #endif
4896
4897 static int
4898 zfs_freebsd_symlink(struct vop_symlink_args *ap)
4899 {
4900         struct componentname *cnp = ap->a_cnp;
4901         vattr_t *vap = ap->a_vap;
4902         znode_t *zp = NULL;
4903         char *symlink;
4904         size_t symlink_len;
4905         int rc;
4906
4907 #if __FreeBSD_version < 1400068
4908         ASSERT(cnp->cn_flags & SAVENAME);
4909 #endif
4910
4911         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
4912         vattr_init_mask(vap);
4913         *ap->a_vpp = NULL;
4914
4915         rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
4916             ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
4917         if (rc == 0) {
4918                 *ap->a_vpp = ZTOV(zp);
4919                 ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
4920                 MPASS(zp->z_cached_symlink == NULL);
4921                 symlink_len = strlen(ap->a_target);
4922                 symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
4923                 if (symlink != NULL) {
4924                         memcpy(symlink, ap->a_target, symlink_len);
4925                         symlink[symlink_len] = '\0';
4926                         atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
4927                             (uintptr_t)symlink);
4928                 }
4929         }
4930         return (rc);
4931 }
4932
4933 #ifndef _SYS_SYSPROTO_H_
4934 struct vop_readlink_args {
4935         struct vnode *a_vp;
4936         struct uio *a_uio;
4937         struct ucred *a_cred;
4938 };
4939 #endif
4940
4941 static int
4942 zfs_freebsd_readlink(struct vop_readlink_args *ap)
4943 {
4944         zfs_uio_t uio;
4945         int error;
4946         znode_t *zp = VTOZ(ap->a_vp);
4947         char *symlink, *base;
4948         size_t symlink_len;
4949         bool trycache;
4950
4951         zfs_uio_init(&uio, ap->a_uio);
4952         trycache = false;
4953         if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
4954             zfs_uio_iovcnt(&uio) == 1) {
4955                 base = zfs_uio_iovbase(&uio, 0);
4956                 symlink_len = zfs_uio_iovlen(&uio, 0);
4957                 trycache = true;
4958         }
4959         error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
4960         if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
4961             error != 0 || !trycache) {
4962                 return (error);
4963         }
4964         symlink_len -= zfs_uio_resid(&uio);
4965         symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
4966         if (symlink != NULL) {
4967                 memcpy(symlink, base, symlink_len);
4968                 symlink[symlink_len] = '\0';
4969                 if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
4970                     (uintptr_t)NULL, (uintptr_t)symlink)) {
4971                         cache_symlink_free(symlink, symlink_len + 1);
4972                 }
4973         }
4974         return (error);
4975 }
4976
/*
 * Argument block taken by zfs_freebsd_link().  This copy is compiled only
 * when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_link_args {
        struct vnode *a_tdvp;           /* target directory vnode */
        struct vnode *a_vp;             /* vnode being linked */
        struct componentname *a_cnp;    /* supplies the name and credentials */
};
#endif
4984
4985 static int
4986 zfs_freebsd_link(struct vop_link_args *ap)
4987 {
4988         struct componentname *cnp = ap->a_cnp;
4989         vnode_t *vp = ap->a_vp;
4990         vnode_t *tdvp = ap->a_tdvp;
4991
4992         if (tdvp->v_mount != vp->v_mount)
4993                 return (EXDEV);
4994
4995 #if __FreeBSD_version < 1400068
4996         ASSERT(cnp->cn_flags & SAVENAME);
4997 #endif
4998
4999         return (zfs_link(VTOZ(tdvp), VTOZ(vp),
5000             cnp->cn_nameptr, cnp->cn_cred, 0));
5001 }
5002
/*
 * Argument block taken by zfs_freebsd_inactive().  This copy is compiled
 * only when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_inactive_args {
        struct vnode *a_vp;     /* vnode handed to zfs_inactive() */
        struct thread *a_td;    /* not read by zfs_freebsd_inactive() */
};
#endif
5009
5010 static int
5011 zfs_freebsd_inactive(struct vop_inactive_args *ap)
5012 {
5013         vnode_t *vp = ap->a_vp;
5014
5015         zfs_inactive(vp, curthread->td_ucred, NULL);
5016         return (0);
5017 }
5018
/*
 * Argument block taken by zfs_freebsd_need_inactive().  This copy is
 * compiled only when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_need_inactive_args {
        struct vnode *a_vp;     /* vnode being queried */
        struct thread *a_td;    /* not read by zfs_freebsd_need_inactive() */
};
#endif
5025
5026 static int
5027 zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
5028 {
5029         vnode_t *vp = ap->a_vp;
5030         znode_t *zp = VTOZ(vp);
5031         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5032         int need;
5033
5034         if (vn_need_pageq_flush(vp))
5035                 return (1);
5036
5037         if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
5038                 return (1);
5039         need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
5040         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5041
5042         return (need);
5043 }
5044
/*
 * Argument block taken by zfs_freebsd_reclaim().  This copy is compiled
 * only when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_reclaim_args {
        struct vnode *a_vp;     /* vnode being reclaimed */
        struct thread *a_td;    /* not read by zfs_freebsd_reclaim() */
};
#endif
5051
5052 static int
5053 zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
5054 {
5055         vnode_t *vp = ap->a_vp;
5056         znode_t *zp = VTOZ(vp);
5057         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5058
5059         ASSERT3P(zp, !=, NULL);
5060
5061         /*
5062          * z_teardown_inactive_lock protects from a race with
5063          * zfs_znode_dmu_fini in zfsvfs_teardown during
5064          * force unmount.
5065          */
5066         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
5067         if (zp->z_sa_hdl == NULL)
5068                 zfs_znode_free(zp);
5069         else
5070                 zfs_zinactive(zp);
5071         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5072
5073         vp->v_data = NULL;
5074         return (0);
5075 }
5076
/*
 * Argument block taken by zfs_freebsd_fid().  This copy is compiled only
 * when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_fid_args {
        struct vnode *a_vp;     /* vnode passed to zfs_fid() */
        struct fid *a_fid;      /* output buffer passed to zfs_fid() */
};
#endif
5083
5084 static int
5085 zfs_freebsd_fid(struct vop_fid_args *ap)
5086 {
5087
5088         return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5089 }
5090
5091
/*
 * Argument block taken by zfs_freebsd_pathconf().  This copy is compiled
 * only when _SYS_SYSPROTO_H_ is not defined.
 *
 * NOTE(review): the trailing "*ap" also declares a pointer object named ap
 * whenever this block is compiled; the sibling argument structs do not do
 * this, so it looks unintended -- confirm.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_pathconf_args {
        struct vnode *a_vp;     /* vnode being queried */
        int a_name;             /* _PC_* selector */
        register_t *a_retval;   /* receives the value on success */
} *ap;
#endif
5099
5100 static int
5101 zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
5102 {
5103         ulong_t val;
5104         int error;
5105
5106         error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
5107             curthread->td_ucred, NULL);
5108         if (error == 0) {
5109                 *ap->a_retval = val;
5110                 return (error);
5111         }
5112         if (error != EOPNOTSUPP)
5113                 return (error);
5114
5115         switch (ap->a_name) {
5116         case _PC_NAME_MAX:
5117                 *ap->a_retval = NAME_MAX;
5118                 return (0);
5119 #if __FreeBSD_version >= 1400032
5120         case _PC_DEALLOC_PRESENT:
5121                 *ap->a_retval = 1;
5122                 return (0);
5123 #endif
5124         case _PC_PIPE_BUF:
5125                 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5126                         *ap->a_retval = PIPE_BUF;
5127                         return (0);
5128                 }
5129                 return (EINVAL);
5130         default:
5131                 return (vop_stdpathconf(ap));
5132         }
5133 }
5134
/*
 * Selects which user-namespace name encoding the extattr handlers try
 * first: the value is passed as the "compat" argument of the *_impl()
 * helpers, and its negation is used for the alternate-format pass.
 */
static int zfs_xattr_compat = 1;
5136
5137 static int
5138 zfs_check_attrname(const char *name)
5139 {
5140         /* We don't allow '/' character in attribute name. */
5141         if (strchr(name, '/') != NULL)
5142                 return (SET_ERROR(EINVAL));
5143         /* We don't allow attribute names that start with a namespace prefix. */
5144         if (ZFS_XA_NS_PREFIX_FORBIDDEN(name))
5145                 return (SET_ERROR(EINVAL));
5146         return (0);
5147 }
5148
5149 /*
5150  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5151  * extended attribute name:
5152  *
5153  *      NAMESPACE       XATTR_COMPAT    PREFIX
5154  *      system          *               freebsd:system:
5155  *      user            1               (none, can be used to access ZFS
5156  *                                      fsattr(5) attributes created on Solaris)
5157  *      user            0               user.
5158  */
5159 static int
5160 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5161     size_t size, boolean_t compat)
5162 {
5163         const char *namespace, *prefix, *suffix;
5164
5165         memset(attrname, 0, size);
5166
5167         switch (attrnamespace) {
5168         case EXTATTR_NAMESPACE_USER:
5169                 if (compat) {
5170                         /*
5171                          * This is the default namespace by which we can access
5172                          * all attributes created on Solaris.
5173                          */
5174                         prefix = namespace = suffix = "";
5175                 } else {
5176                         /*
5177                          * This is compatible with the user namespace encoding
5178                          * on Linux prior to xattr_compat, but nothing
5179                          * else.
5180                          */
5181                         prefix = "";
5182                         namespace = "user";
5183                         suffix = ".";
5184                 }
5185                 break;
5186         case EXTATTR_NAMESPACE_SYSTEM:
5187                 prefix = "freebsd:";
5188                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5189                 suffix = ":";
5190                 break;
5191         case EXTATTR_NAMESPACE_EMPTY:
5192         default:
5193                 return (SET_ERROR(EINVAL));
5194         }
5195         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5196             name) >= size) {
5197                 return (SET_ERROR(ENAMETOOLONG));
5198         }
5199         return (0);
5200 }
5201
/*
 * Make sure zp->z_xattr_cached is populated, loading it with
 * zfs_sa_get_xattr() if necessary.
 *
 * The caller must hold zp->z_xattr_lock (asserted).  The lock is returned
 * in the mode it was entered with: a writer keeps the write lock, a reader
 * ends up with a read lock again.  Note that in the reader case the lock
 * may be dropped and re-acquired along the way.
 *
 * Returns 0 when the cache is already present, otherwise the result of
 * zfs_sa_get_xattr().
 */
static int
zfs_ensure_xattr_cached(znode_t *zp)
{
        int error = 0;

        ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));

        if (zp->z_xattr_cached != NULL)
                return (0);

        /* Already a writer: load directly, no lock juggling needed. */
        if (rw_write_held(&zp->z_xattr_lock))
                return (zfs_sa_get_xattr(zp));

        /*
         * Reader: try an in-place upgrade; if that fails, drop the lock and
         * take it again as writer.
         */
        if (!rw_tryupgrade(&zp->z_xattr_lock)) {
                rw_exit(&zp->z_xattr_lock);
                rw_enter(&zp->z_xattr_lock, RW_WRITER);
        }
        /* Re-check: the lock may have been released just above. */
        if (zp->z_xattr_cached == NULL)
                error = zfs_sa_get_xattr(zp);
        /* Hand the caller back a read lock, on success or failure. */
        rw_downgrade(&zp->z_xattr_lock);
        return (error);
}
5224
/*
 * Description of the arguments consumed by the getextattr handlers.
 * Guarded by _SYS_SYSPROTO_H_ like the other argument blocks.
 *
 * NOTE(review): the tag here is vop_getextattr while the handlers take
 * struct vop_getextattr_args, and IN/INOUT/OUT are direction annotations
 * rather than C keywords -- presumably this block is descriptive only;
 * confirm it is never compiled.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_getextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        INOUT struct uio *a_uio;
        OUT size_t *a_size;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
#endif
5236
5237 static int
5238 zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
5239 {
5240         struct thread *td = ap->a_td;
5241         struct nameidata nd;
5242         struct vattr va;
5243         vnode_t *xvp = NULL, *vp;
5244         int error, flags;
5245
5246         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5247             LOOKUP_XATTR, B_FALSE);
5248         if (error != 0)
5249                 return (error);
5250
5251         flags = FREAD;
5252 #if __FreeBSD_version < 1400043
5253         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5254             xvp, td);
5255 #else
5256         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5257 #endif
5258         error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
5259         if (error != 0)
5260                 return (SET_ERROR(error));
5261         vp = nd.ni_vp;
5262         NDFREE_PNBUF(&nd);
5263
5264         if (ap->a_size != NULL) {
5265                 error = VOP_GETATTR(vp, &va, ap->a_cred);
5266                 if (error == 0)
5267                         *ap->a_size = (size_t)va.va_size;
5268         } else if (ap->a_uio != NULL)
5269                 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5270
5271         VOP_UNLOCK(vp);
5272         vn_close(vp, flags, ap->a_cred, td);
5273         return (error);
5274 }
5275
5276 static int
5277 zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
5278 {
5279         znode_t *zp = VTOZ(ap->a_vp);
5280         uchar_t *nv_value;
5281         uint_t nv_size;
5282         int error;
5283
5284         error = zfs_ensure_xattr_cached(zp);
5285         if (error != 0)
5286                 return (error);
5287
5288         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5289         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5290
5291         error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
5292             &nv_value, &nv_size);
5293         if (error != 0)
5294                 return (SET_ERROR(error));
5295
5296         if (ap->a_size != NULL)
5297                 *ap->a_size = nv_size;
5298         else if (ap->a_uio != NULL)
5299                 error = uiomove(nv_value, nv_size, ap->a_uio);
5300         if (error != 0)
5301                 return (SET_ERROR(error));
5302
5303         return (0);
5304 }
5305
5306 static int
5307 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
5308 {
5309         znode_t *zp = VTOZ(ap->a_vp);
5310         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5311         char attrname[EXTATTR_MAXNAMELEN+1];
5312         int error;
5313
5314         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5315             sizeof (attrname), compat);
5316         if (error != 0)
5317                 return (error);
5318
5319         error = ENOENT;
5320         if (zfsvfs->z_use_sa && zp->z_is_sa)
5321                 error = zfs_getextattr_sa(ap, attrname);
5322         if (error == ENOENT)
5323                 error = zfs_getextattr_dir(ap, attrname);
5324         return (error);
5325 }
5326
5327 /*
5328  * Vnode operation to retrieve a named extended attribute.
5329  */
5330 static int
5331 zfs_getextattr(struct vop_getextattr_args *ap)
5332 {
5333         znode_t *zp = VTOZ(ap->a_vp);
5334         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5335         int error;
5336
5337         /*
5338          * If the xattr property is off, refuse the request.
5339          */
5340         if (!(zfsvfs->z_flags & ZSB_XATTR))
5341                 return (SET_ERROR(EOPNOTSUPP));
5342
5343         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5344             ap->a_cred, ap->a_td, VREAD);
5345         if (error != 0)
5346                 return (SET_ERROR(error));
5347
5348         error = zfs_check_attrname(ap->a_name);
5349         if (error != 0)
5350                 return (error);
5351
5352         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5353                 return (error);
5354         error = ENOENT;
5355         rw_enter(&zp->z_xattr_lock, RW_READER);
5356
5357         error = zfs_getextattr_impl(ap, zfs_xattr_compat);
5358         if ((error == ENOENT || error == ENOATTR) &&
5359             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5360                 /*
5361                  * Fall back to the alternate namespace format if we failed to
5362                  * find a user xattr.
5363                  */
5364                 error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
5365         }
5366
5367         rw_exit(&zp->z_xattr_lock);
5368         zfs_exit(zfsvfs, FTAG);
5369         if (error == ENOENT)
5370                 error = SET_ERROR(ENOATTR);
5371         return (error);
5372 }
5373
/*
 * Description of the arguments consumed by the deleteextattr handlers.
 * Guarded by _SYS_SYSPROTO_H_ like the other argument blocks.
 *
 * NOTE(review): the tag here is vop_deleteextattr while the handlers take
 * struct vop_deleteextattr_args, and IN is a direction annotation rather
 * than a C keyword -- presumably descriptive only; confirm it is never
 * compiled.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_deleteextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
#endif
5383
/*
 * Remove an extended attribute that is stored as a file in the znode's
 * xattr directory.
 *
 * The xattr directory is located with zfs_lookup(LOOKUP_XATTR); the
 * attribute file is then resolved relative to it with a DELETE namei() and
 * removed with VOP_REMOVE().
 *
 * Returns 0 or an errno from the lookup, namei or remove step.
 */
static int
zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
{
        struct nameidata nd;
        vnode_t *xvp = NULL, *vp;
        int error;

        error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
            LOOKUP_XATTR, B_FALSE);
        if (error != 0)
                return (error);

        /* Ask namei() for both the parent directory and the leaf, locked. */
#if __FreeBSD_version < 1400043
        NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
            UIO_SYSSPACE, attrname, xvp, ap->a_td);
#else
        NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
            UIO_SYSSPACE, attrname, xvp);
#endif
        error = namei(&nd);
        if (error != 0)
                return (SET_ERROR(error));

        vp = nd.ni_vp;
        error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
        NDFREE_PNBUF(&nd);

        /*
         * Release both vnodes whether or not the remove succeeded.  When
         * the leaf is the directory itself only vrele() is used for it --
         * presumably because the vput() of ni_dvp already unlocked it.
         */
        vput(nd.ni_dvp);
        if (vp == nd.ni_dvp)
                vrele(vp);
        else
                vput(vp);

        return (error);
}
5419
5420 static int
5421 zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
5422 {
5423         znode_t *zp = VTOZ(ap->a_vp);
5424         nvlist_t *nvl;
5425         int error;
5426
5427         error = zfs_ensure_xattr_cached(zp);
5428         if (error != 0)
5429                 return (error);
5430
5431         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5432         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5433
5434         nvl = zp->z_xattr_cached;
5435         error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
5436         if (error != 0)
5437                 error = SET_ERROR(error);
5438         else
5439                 error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
5440         if (error != 0) {
5441                 zp->z_xattr_cached = NULL;
5442                 nvlist_free(nvl);
5443         }
5444         return (error);
5445 }
5446
5447 static int
5448 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
5449 {
5450         znode_t *zp = VTOZ(ap->a_vp);
5451         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5452         char attrname[EXTATTR_MAXNAMELEN+1];
5453         int error;
5454
5455         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5456             sizeof (attrname), compat);
5457         if (error != 0)
5458                 return (error);
5459
5460         error = ENOENT;
5461         if (zfsvfs->z_use_sa && zp->z_is_sa)
5462                 error = zfs_deleteextattr_sa(ap, attrname);
5463         if (error == ENOENT)
5464                 error = zfs_deleteextattr_dir(ap, attrname);
5465         return (error);
5466 }
5467
5468 /*
5469  * Vnode operation to remove a named attribute.
5470  */
5471 static int
5472 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5473 {
5474         znode_t *zp = VTOZ(ap->a_vp);
5475         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5476         int error;
5477
5478         /*
5479          * If the xattr property is off, refuse the request.
5480          */
5481         if (!(zfsvfs->z_flags & ZSB_XATTR))
5482                 return (SET_ERROR(EOPNOTSUPP));
5483
5484         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5485             ap->a_cred, ap->a_td, VWRITE);
5486         if (error != 0)
5487                 return (SET_ERROR(error));
5488
5489         error = zfs_check_attrname(ap->a_name);
5490         if (error != 0)
5491                 return (error);
5492
5493         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5494                 return (error);
5495         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5496
5497         error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
5498         if ((error == ENOENT || error == ENOATTR) &&
5499             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5500                 /*
5501                  * Fall back to the alternate namespace format if we failed to
5502                  * find a user xattr.
5503                  */
5504                 error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
5505         }
5506
5507         rw_exit(&zp->z_xattr_lock);
5508         zfs_exit(zfsvfs, FTAG);
5509         if (error == ENOENT)
5510                 error = SET_ERROR(ENOATTR);
5511         return (error);
5512 }
5513
/*
 * Description of the arguments consumed by the setextattr handlers.
 * Guarded by _SYS_SYSPROTO_H_ like the other argument blocks.
 *
 * NOTE(review): the tag here is vop_setextattr while the handlers take
 * struct vop_setextattr_args, and IN/INOUT are direction annotations rather
 * than C keywords -- presumably descriptive only; confirm it is never
 * compiled.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_setextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        IN const char *a_name;
        INOUT struct uio *a_uio;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
#endif
5524
5525 static int
5526 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
5527 {
5528         struct thread *td = ap->a_td;
5529         struct nameidata nd;
5530         struct vattr va;
5531         vnode_t *xvp = NULL, *vp;
5532         int error, flags;
5533
5534         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5535             LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
5536         if (error != 0)
5537                 return (error);
5538
5539         flags = FFLAGS(O_WRONLY | O_CREAT);
5540 #if __FreeBSD_version < 1400043
5541         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
5542 #else
5543         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5544 #endif
5545         error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
5546             NULL);
5547         if (error != 0)
5548                 return (SET_ERROR(error));
5549         vp = nd.ni_vp;
5550         NDFREE_PNBUF(&nd);
5551
5552         VATTR_NULL(&va);
5553         va.va_size = 0;
5554         error = VOP_SETATTR(vp, &va, ap->a_cred);
5555         if (error == 0)
5556                 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5557
5558         VOP_UNLOCK(vp);
5559         vn_close(vp, flags, ap->a_cred, td);
5560         return (error);
5561 }
5562
5563 static int
5564 zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
5565 {
5566         znode_t *zp = VTOZ(ap->a_vp);
5567         nvlist_t *nvl;
5568         size_t sa_size;
5569         int error;
5570
5571         error = zfs_ensure_xattr_cached(zp);
5572         if (error != 0)
5573                 return (error);
5574
5575         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5576         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5577
5578         nvl = zp->z_xattr_cached;
5579         size_t entry_size = ap->a_uio->uio_resid;
5580         if (entry_size > DXATTR_MAX_ENTRY_SIZE)
5581                 return (SET_ERROR(EFBIG));
5582         error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
5583         if (error != 0)
5584                 return (SET_ERROR(error));
5585         if (sa_size > DXATTR_MAX_SA_SIZE)
5586                 return (SET_ERROR(EFBIG));
5587         uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
5588         error = uiomove(buf, entry_size, ap->a_uio);
5589         if (error != 0) {
5590                 error = SET_ERROR(error);
5591         } else {
5592                 error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
5593                 if (error != 0)
5594                         error = SET_ERROR(error);
5595         }
5596         if (error == 0)
5597                 error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
5598         kmem_free(buf, entry_size);
5599         if (error != 0) {
5600                 zp->z_xattr_cached = NULL;
5601                 nvlist_free(nvl);
5602         }
5603         return (error);
5604 }
5605
5606 static int
5607 zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
5608 {
5609         znode_t *zp = VTOZ(ap->a_vp);
5610         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5611         char attrname[EXTATTR_MAXNAMELEN+1];
5612         int error;
5613
5614         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5615             sizeof (attrname), compat);
5616         if (error != 0)
5617                 return (error);
5618
5619         struct vop_deleteextattr_args vda = {
5620                 .a_vp = ap->a_vp,
5621                 .a_attrnamespace = ap->a_attrnamespace,
5622                 .a_name = ap->a_name,
5623                 .a_cred = ap->a_cred,
5624                 .a_td = ap->a_td,
5625         };
5626         error = ENOENT;
5627         if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
5628                 error = zfs_setextattr_sa(ap, attrname);
5629                 if (error == 0) {
5630                         /*
5631                          * Successfully put into SA, we need to clear the one
5632                          * in dir if present.
5633                          */
5634                         zfs_deleteextattr_dir(&vda, attrname);
5635                 }
5636         }
5637         if (error != 0) {
5638                 error = zfs_setextattr_dir(ap, attrname);
5639                 if (error == 0 && zp->z_is_sa) {
5640                         /*
5641                          * Successfully put into dir, we need to clear the one
5642                          * in SA if present.
5643                          */
5644                         zfs_deleteextattr_sa(&vda, attrname);
5645                 }
5646         }
5647         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5648                 /*
5649                  * Also clear all versions of the alternate compat name.
5650                  */
5651                 zfs_deleteextattr_impl(&vda, !compat);
5652         }
5653         return (error);
5654 }
5655
5656 /*
5657  * Vnode operation to set a named attribute.
5658  */
5659 static int
5660 zfs_setextattr(struct vop_setextattr_args *ap)
5661 {
5662         znode_t *zp = VTOZ(ap->a_vp);
5663         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5664         int error;
5665
5666         /*
5667          * If the xattr property is off, refuse the request.
5668          */
5669         if (!(zfsvfs->z_flags & ZSB_XATTR))
5670                 return (SET_ERROR(EOPNOTSUPP));
5671
5672         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5673             ap->a_cred, ap->a_td, VWRITE);
5674         if (error != 0)
5675                 return (SET_ERROR(error));
5676
5677         error = zfs_check_attrname(ap->a_name);
5678         if (error != 0)
5679                 return (error);
5680
5681         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5682                 return (error);
5683         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5684
5685         error = zfs_setextattr_impl(ap, zfs_xattr_compat);
5686
5687         rw_exit(&zp->z_xattr_lock);
5688         zfs_exit(zfsvfs, FTAG);
5689         return (error);
5690 }
5691
/*
 * Description of the arguments consumed by the listextattr handlers.
 * Guarded by _SYS_SYSPROTO_H_ like the other argument blocks.
 *
 * NOTE(review): the tag here is vop_listextattr while the handlers take
 * struct vop_listextattr_args, and IN/INOUT/OUT are direction annotations
 * rather than C keywords -- presumably descriptive only; confirm it is
 * never compiled.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_listextattr {
        IN struct vnode *a_vp;
        IN int a_attrnamespace;
        INOUT struct uio *a_uio;
        OUT size_t *a_size;
        IN struct ucred *a_cred;
        IN struct thread *a_td;
};
#endif
5702
/*
 * List the extended attributes stored as files in the znode's xattr
 * directory whose names start with attrprefix.
 *
 * For every matching entry either 1 + name length is added to *a_size
 * (when a_size is set) or a one-byte length followed by the name, with the
 * prefix stripped, is copied to a_uio (when a_uio is set).
 *
 * Returns 0 when the xattr directory does not exist or the listing
 * completed, otherwise an errno from the lookup, namei, readdir or
 * uiomove step.
 */
static int
zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
{
        struct thread *td = ap->a_td;
        struct nameidata nd;
        /* Scratch buffer for VOP_READDIR, the size of one struct dirent. */
        uint8_t dirbuf[sizeof (struct dirent)];
        struct iovec aiov;
        struct uio auio;
        vnode_t *xvp = NULL, *vp;
        int error, eof;

        error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
            LOOKUP_XATTR, B_FALSE);
        if (error != 0) {
                /*
                 * ENOATTR means that the EA directory does not yet exist,
                 * i.e. there are no extended attributes there.
                 */
                if (error == ENOATTR)
                        error = 0;
                return (error);
        }

        /* Resolve "." relative to the xattr directory, share-locked. */
#if __FreeBSD_version < 1400043
        NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
            UIO_SYSSPACE, ".", xvp, td);
#else
        NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
            UIO_SYSSPACE, ".", xvp);
#endif
        error = namei(&nd);
        if (error != 0)
                return (SET_ERROR(error));
        vp = nd.ni_vp;
        NDFREE_PNBUF(&nd);

        /* Kernel-space read uio over dirbuf; the offset carries over. */
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_rw = UIO_READ;
        auio.uio_offset = 0;

        size_t plen = strlen(attrprefix);

        /* Read the directory one buffer at a time until EOF or an error. */
        do {
                aiov.iov_base = (void *)dirbuf;
                aiov.iov_len = sizeof (dirbuf);
                auio.uio_resid = sizeof (dirbuf);
                error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
                if (error != 0)
                        break;
                /* Number of bytes VOP_READDIR placed in dirbuf. */
                int done = sizeof (dirbuf) - auio.uio_resid;
                for (int pos = 0; pos < done; ) {
                        struct dirent *dp = (struct dirent *)(dirbuf + pos);
                        pos += dp->d_reclen;
                        /*
                         * XXX: Temporarily we also accept DT_UNKNOWN, as this
                         * is what we get when attribute was created on Solaris.
                         */
                        if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
                                continue;
                        /*
                         * With an empty prefix, skip names that carry a
                         * namespace prefix of their own.
                         */
                        else if (plen == 0 &&
                            ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
                                continue;
                        else if (strncmp(dp->d_name, attrprefix, plen) != 0)
                                continue;
                        /* Length of the name once the prefix is stripped. */
                        uint8_t nlen = dp->d_namlen - plen;
                        if (ap->a_size != NULL) {
                                *ap->a_size += 1 + nlen;
                        } else if (ap->a_uio != NULL) {
                                /*
                                 * Format of extattr name entry is one byte for
                                 * length and the rest for name.
                                 */
                                error = uiomove(&nlen, 1, ap->a_uio);
                                if (error == 0) {
                                        char *namep = dp->d_name + plen;
                                        error = uiomove(namep, nlen, ap->a_uio);
                                }
                                if (error != 0) {
                                        error = SET_ERROR(error);
                                        /* Outer loop stops on error != 0. */
                                        break;
                                }
                        }
                }
        } while (!eof && error == 0);

        vput(vp);
        return (error);
}
5794
5795 static int
5796 zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
5797 {
5798         znode_t *zp = VTOZ(ap->a_vp);
5799         int error;
5800
5801         error = zfs_ensure_xattr_cached(zp);
5802         if (error != 0)
5803                 return (error);
5804
5805         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5806         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5807
5808         size_t plen = strlen(attrprefix);
5809         nvpair_t *nvp = NULL;
5810         while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
5811                 ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
5812
5813                 const char *name = nvpair_name(nvp);
5814                 if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
5815                         continue;
5816                 else if (strncmp(name, attrprefix, plen) != 0)
5817                         continue;
5818                 uint8_t nlen = strlen(name) - plen;
5819                 if (ap->a_size != NULL) {
5820                         *ap->a_size += 1 + nlen;
5821                 } else if (ap->a_uio != NULL) {
5822                         /*
5823                          * Format of extattr name entry is one byte for
5824                          * length and the rest for name.
5825                          */
5826                         error = uiomove(&nlen, 1, ap->a_uio);
5827                         if (error == 0) {
5828                                 char *namep = __DECONST(char *, name) + plen;
5829                                 error = uiomove(namep, nlen, ap->a_uio);
5830                         }
5831                         if (error != 0) {
5832                                 error = SET_ERROR(error);
5833                                 break;
5834                         }
5835                 }
5836         }
5837
5838         return (error);
5839 }
5840
5841 static int
5842 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
5843 {
5844         znode_t *zp = VTOZ(ap->a_vp);
5845         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5846         char attrprefix[16];
5847         int error;
5848
5849         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5850             sizeof (attrprefix), compat);
5851         if (error != 0)
5852                 return (error);
5853
5854         if (zfsvfs->z_use_sa && zp->z_is_sa)
5855                 error = zfs_listextattr_sa(ap, attrprefix);
5856         if (error == 0)
5857                 error = zfs_listextattr_dir(ap, attrprefix);
5858         return (error);
5859 }
5860
5861 /*
5862  * Vnode operation to retrieve extended attributes on a vnode.
5863  */
5864 static int
5865 zfs_listextattr(struct vop_listextattr_args *ap)
5866 {
5867         znode_t *zp = VTOZ(ap->a_vp);
5868         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5869         int error;
5870
5871         if (ap->a_size != NULL)
5872                 *ap->a_size = 0;
5873
5874         /*
5875          * If the xattr property is off, refuse the request.
5876          */
5877         if (!(zfsvfs->z_flags & ZSB_XATTR))
5878                 return (SET_ERROR(EOPNOTSUPP));
5879
5880         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5881             ap->a_cred, ap->a_td, VREAD);
5882         if (error != 0)
5883                 return (SET_ERROR(error));
5884
5885         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5886                 return (error);
5887         rw_enter(&zp->z_xattr_lock, RW_READER);
5888
5889         error = zfs_listextattr_impl(ap, zfs_xattr_compat);
5890         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5891                 /* Also list user xattrs with the alternate format. */
5892                 error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
5893         }
5894
5895         rw_exit(&zp->z_xattr_lock);
5896         zfs_exit(zfsvfs, FTAG);
5897         return (error);
5898 }
5899
5900 #ifndef _SYS_SYSPROTO_H_
5901 struct vop_getacl_args {
5902         struct vnode *vp;
5903         acl_type_t type;
5904         struct acl *aclp;
5905         struct ucred *cred;
5906         struct thread *td;
5907 };
5908 #endif
5909
5910 static int
5911 zfs_freebsd_getacl(struct vop_getacl_args *ap)
5912 {
5913         int             error;
5914         vsecattr_t      vsecattr;
5915
5916         if (ap->a_type != ACL_TYPE_NFS4)
5917                 return (EINVAL);
5918
5919         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5920         if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
5921             &vsecattr, 0, ap->a_cred)))
5922                 return (error);
5923
5924         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
5925             vsecattr.vsa_aclcnt);
5926         if (vsecattr.vsa_aclentp != NULL)
5927                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5928
5929         return (error);
5930 }
5931
5932 #ifndef _SYS_SYSPROTO_H_
5933 struct vop_setacl_args {
5934         struct vnode *vp;
5935         acl_type_t type;
5936         struct acl *aclp;
5937         struct ucred *cred;
5938         struct thread *td;
5939 };
5940 #endif
5941
5942 static int
5943 zfs_freebsd_setacl(struct vop_setacl_args *ap)
5944 {
5945         int             error;
5946         vsecattr_t vsecattr;
5947         int             aclbsize;       /* size of acl list in bytes */
5948         aclent_t        *aaclp;
5949
5950         if (ap->a_type != ACL_TYPE_NFS4)
5951                 return (EINVAL);
5952
5953         if (ap->a_aclp == NULL)
5954                 return (EINVAL);
5955
5956         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5957                 return (EINVAL);
5958
5959         /*
5960          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5961          * splitting every entry into two and appending "canonical six"
5962          * entries at the end.  Don't allow for setting an ACL that would
5963          * cause chmod(2) to run out of ACL entries.
5964          */
5965         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5966                 return (ENOSPC);
5967
5968         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5969         if (error != 0)
5970                 return (error);
5971
5972         vsecattr.vsa_mask = VSA_ACE;
5973         aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
5974         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5975         aaclp = vsecattr.vsa_aclentp;
5976         vsecattr.vsa_aclentsz = aclbsize;
5977
5978         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5979         error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
5980         kmem_free(aaclp, aclbsize);
5981
5982         return (error);
5983 }
5984
5985 #ifndef _SYS_SYSPROTO_H_
5986 struct vop_aclcheck_args {
5987         struct vnode *vp;
5988         acl_type_t type;
5989         struct acl *aclp;
5990         struct ucred *cred;
5991         struct thread *td;
5992 };
5993 #endif
5994
/*
 * VOP_ACLCHECK: not implemented; every request is answered with
 * EOPNOTSUPP and the arguments are not examined.
 */
static int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{
        (void) ap;

        return (EOPNOTSUPP);
}
6001
6002 static int
6003 zfs_vptocnp(struct vop_vptocnp_args *ap)
6004 {
6005         vnode_t *covered_vp;
6006         vnode_t *vp = ap->a_vp;
6007         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
6008         znode_t *zp = VTOZ(vp);
6009         int ltype;
6010         int error;
6011
6012         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6013                 return (error);
6014
6015         /*
6016          * If we are a snapshot mounted under .zfs, run the operation
6017          * on the covered vnode.
6018          */
6019         if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
6020                 char name[MAXNAMLEN + 1];
6021                 znode_t *dzp;
6022                 size_t len;
6023
6024                 error = zfs_znode_parent_and_name(zp, &dzp, name,
6025                     sizeof (name));
6026                 if (error == 0) {
6027                         len = strlen(name);
6028                         if (*ap->a_buflen < len)
6029                                 error = SET_ERROR(ENOMEM);
6030                 }
6031                 if (error == 0) {
6032                         *ap->a_buflen -= len;
6033                         memcpy(ap->a_buf + *ap->a_buflen, name, len);
6034                         *ap->a_vpp = ZTOV(dzp);
6035                 }
6036                 zfs_exit(zfsvfs, FTAG);
6037                 return (error);
6038         }
6039         zfs_exit(zfsvfs, FTAG);
6040
6041         covered_vp = vp->v_mount->mnt_vnodecovered;
6042         enum vgetstate vs = vget_prep(covered_vp);
6043         ltype = VOP_ISLOCKED(vp);
6044         VOP_UNLOCK(vp);
6045         error = vget_finish(covered_vp, LK_SHARED, vs);
6046         if (error == 0) {
6047                 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
6048                     ap->a_buflen);
6049                 vput(covered_vp);
6050         }
6051         vn_lock(vp, ltype | LK_RETRY);
6052         if (VN_IS_DOOMED(vp))
6053                 error = SET_ERROR(ENOENT);
6054         return (error);
6055 }
6056
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE: hand the byte range described by *ap->a_offset and
 * *ap->a_len, clamped to the current file size, to zfs_freesp().
 *
 * A request that lies entirely at or beyond end of file succeeds at once
 * with *ap->a_len set to 0.  After a successful zfs_freesp() call,
 * *ap->a_offset is advanced past the processed range and *ap->a_len is
 * set to 0; the ZIL is committed first when the dataset is sync=always
 * or IO_SYNC was requested.  Returns EROFS on a read-only filesystem.
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
        znode_t *zp = VTOZ(ap->a_vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        zilog_t *zilog;
        off_t start, nbytes, eof;
        int error;

        error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
        if (error != 0)
                return (error);

        /*
         * Callers might not be able to detect properly that we are read-only,
         * so check it explicitly here.
         */
        if (zfs_is_readonly(zfsvfs)) {
                error = SET_ERROR(EROFS);
                goto out;
        }

        zilog = zfsvfs->z_log;
        start = *ap->a_offset;
        nbytes = *ap->a_len;
        eof = zp->z_size;

        /* Never reach past the current end of file. */
        if (start + nbytes > eof)
                nbytes = eof - start;

        /* Fast path for out-of-range request; error is still 0 here. */
        if (nbytes <= 0) {
                *ap->a_len = 0;
                goto out;
        }

        error = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
        if (error == 0) {
                if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
                    (ap->a_ioflag & IO_SYNC) != 0)
                        zil_commit(zilog, zp->z_id);
                *ap->a_offset = start + nbytes;
                *ap->a_len = 0;
        }

out:
        zfs_exit(zfsvfs, FTAG);
        return (error);
}
#endif
6105
#ifndef _SYS_SYSPROTO_H_
/*
 * Reference copy of the VOP_COPY_FILE_RANGE argument structure, compiled
 * only when _SYS_SYSPROTO_H_ is not defined.
 */
struct vop_copy_file_range_args {
        struct vnode *a_invp;
        off_t *a_inoffp;
        struct vnode *a_outvp;
        off_t *a_outoffp;
        size_t *a_lenp;
        unsigned int a_flags;
        struct ucred *a_incred;
        struct ucred *a_outcred;
        struct thread *a_fsizetd;
};
#endif
/*
 * TODO: FreeBSD will only call the file system-specific copy_file_range() if
 * both files reside under the same mountpoint. In the case of ZFS we want to
 * be called even if the files are in different datasets (but in the same
 * pool, which we need to check ourselves).
 */
6125 static int
6126 zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
6127 {
6128         zfsvfs_t *outzfsvfs;
6129         struct vnode *invp = ap->a_invp;
6130         struct vnode *outvp = ap->a_outvp;
6131         struct mount *mp;
6132         int error;
6133         uint64_t len = *ap->a_lenp;
6134
6135         if (!zfs_bclone_enabled) {
6136                 mp = NULL;
6137                 goto bad_write_fallback;
6138         }
6139
6140         /*
6141          * TODO: If offset/length is not aligned to recordsize, use
6142          * vn_generic_copy_file_range() on this fragment.
6143          * It would be better to do this after we lock the vnodes, but then we
6144          * need something else than vn_generic_copy_file_range().
6145          */
6146
6147         vn_start_write(outvp, &mp, V_WAIT);
6148         if (__predict_true(mp == outvp->v_mount)) {
6149                 outzfsvfs = (zfsvfs_t *)mp->mnt_data;
6150                 if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
6151                     SPA_FEATURE_BLOCK_CLONING)) {
6152                         goto bad_write_fallback;
6153                 }
6154         }
6155         if (invp == outvp) {
6156                 if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
6157                         goto bad_write_fallback;
6158                 }
6159         } else {
6160 #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
6161         __FreeBSD_version >= 1400086
6162                 vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
6163                     LK_EXCLUSIVE);
6164 #else
6165                 vn_lock_pair(invp, false, outvp, false);
6166 #endif
6167                 if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
6168                         goto bad_locked_fallback;
6169                 }
6170         }
6171
6172 #ifdef MAC
6173         error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
6174             outvp);
6175         if (error != 0)
6176                 goto out_locked;
6177 #endif
6178
6179         error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
6180             ap->a_outoffp, &len, ap->a_outcred);
6181         if (error == EXDEV || error == EAGAIN || error == EINVAL ||
6182             error == EOPNOTSUPP)
6183                 goto bad_locked_fallback;
6184         *ap->a_lenp = (size_t)len;
6185 #ifdef MAC
6186 out_locked:
6187 #endif
6188         if (invp != outvp)
6189                 VOP_UNLOCK(invp);
6190         VOP_UNLOCK(outvp);
6191         if (mp != NULL)
6192                 vn_finished_write(mp);
6193         return (error);
6194
6195 bad_locked_fallback:
6196         if (invp != outvp)
6197                 VOP_UNLOCK(invp);
6198         VOP_UNLOCK(outvp);
6199 bad_write_fallback:
6200         if (mp != NULL)
6201                 vn_finished_write(mp);
6202         error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
6203             ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
6204             ap->a_incred, ap->a_outcred, ap->a_fsizetd);
6205         return (error);
6206 }
6207
6208 struct vop_vector zfs_vnodeops;
6209 struct vop_vector zfs_fifoops;
6210 struct vop_vector zfs_shareops;
6211
6212 struct vop_vector zfs_vnodeops = {
6213         .vop_default =          &default_vnodeops,
6214         .vop_inactive =         zfs_freebsd_inactive,
6215         .vop_need_inactive =    zfs_freebsd_need_inactive,
6216         .vop_reclaim =          zfs_freebsd_reclaim,
6217         .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
6218         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6219         .vop_access =           zfs_freebsd_access,
6220         .vop_allocate =         VOP_EINVAL,
6221 #if __FreeBSD_version >= 1400032
6222         .vop_deallocate =       zfs_deallocate,
6223 #endif
6224         .vop_lookup =           zfs_cache_lookup,
6225         .vop_cachedlookup =     zfs_freebsd_cachedlookup,
6226         .vop_getattr =          zfs_freebsd_getattr,
6227         .vop_setattr =          zfs_freebsd_setattr,
6228         .vop_create =           zfs_freebsd_create,
6229         .vop_mknod =            (vop_mknod_t *)zfs_freebsd_create,
6230         .vop_mkdir =            zfs_freebsd_mkdir,
6231         .vop_readdir =          zfs_freebsd_readdir,
6232         .vop_fsync =            zfs_freebsd_fsync,
6233         .vop_open =             zfs_freebsd_open,
6234         .vop_close =            zfs_freebsd_close,
6235         .vop_rmdir =            zfs_freebsd_rmdir,
6236         .vop_ioctl =            zfs_freebsd_ioctl,
6237         .vop_link =             zfs_freebsd_link,
6238         .vop_symlink =          zfs_freebsd_symlink,
6239         .vop_readlink =         zfs_freebsd_readlink,
6240         .vop_read =             zfs_freebsd_read,
6241         .vop_write =            zfs_freebsd_write,
6242         .vop_remove =           zfs_freebsd_remove,
6243         .vop_rename =           zfs_freebsd_rename,
6244         .vop_pathconf =         zfs_freebsd_pathconf,
6245         .vop_bmap =             zfs_freebsd_bmap,
6246         .vop_fid =              zfs_freebsd_fid,
6247         .vop_getextattr =       zfs_getextattr,
6248         .vop_deleteextattr =    zfs_deleteextattr,
6249         .vop_setextattr =       zfs_setextattr,
6250         .vop_listextattr =      zfs_listextattr,
6251         .vop_getacl =           zfs_freebsd_getacl,
6252         .vop_setacl =           zfs_freebsd_setacl,
6253         .vop_aclcheck =         zfs_freebsd_aclcheck,
6254         .vop_getpages =         zfs_freebsd_getpages,
6255         .vop_putpages =         zfs_freebsd_putpages,
6256         .vop_vptocnp =          zfs_vptocnp,
6257         .vop_lock1 =            vop_lock,
6258         .vop_unlock =           vop_unlock,
6259         .vop_islocked =         vop_islocked,
6260 #if __FreeBSD_version >= 1400043
6261         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6262 #endif
6263         .vop_copy_file_range =  zfs_freebsd_copy_file_range,
6264 };
6265 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
6266
6267 struct vop_vector zfs_fifoops = {
6268         .vop_default =          &fifo_specops,
6269         .vop_fsync =            zfs_freebsd_fsync,
6270         .vop_fplookup_vexec =   zfs_freebsd_fplookup_vexec,
6271         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6272         .vop_access =           zfs_freebsd_access,
6273         .vop_getattr =          zfs_freebsd_getattr,
6274         .vop_inactive =         zfs_freebsd_inactive,
6275         .vop_read =             VOP_PANIC,
6276         .vop_reclaim =          zfs_freebsd_reclaim,
6277         .vop_setattr =          zfs_freebsd_setattr,
6278         .vop_write =            VOP_PANIC,
6279         .vop_pathconf =         zfs_freebsd_pathconf,
6280         .vop_fid =              zfs_freebsd_fid,
6281         .vop_getacl =           zfs_freebsd_getacl,
6282         .vop_setacl =           zfs_freebsd_setacl,
6283         .vop_aclcheck =         zfs_freebsd_aclcheck,
6284 #if __FreeBSD_version >= 1400043
6285         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6286 #endif
6287 };
6288 VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
6289
6290 /*
6291  * special share hidden files vnode operations template
6292  */
6293 struct vop_vector zfs_shareops = {
6294         .vop_default =          &default_vnodeops,
6295         .vop_fplookup_vexec =   VOP_EAGAIN,
6296         .vop_fplookup_symlink = VOP_EAGAIN,
6297         .vop_access =           zfs_freebsd_access,
6298         .vop_inactive =         zfs_freebsd_inactive,
6299         .vop_reclaim =          zfs_freebsd_reclaim,
6300         .vop_fid =              zfs_freebsd_fid,
6301         .vop_pathconf =         zfs_freebsd_pathconf,
6302 #if __FreeBSD_version >= 1400043
6303         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6304 #endif
6305 };
6306 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
6307
/*
 * Module parameter for xattr_compat (prefix zfs_, i.e. the zfs_xattr_compat
 * variable that zfs_listextattr() consults to pick which naming format is
 * listed first).  NOTE(review): INT and ZMOD_RW presumably declare an
 * integer, read-write tunable -- confirm against the ZFS_MODULE_PARAM
 * definition.
 */
ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
        "Use legacy ZFS xattr naming for writing new user namespace xattrs");