module/os/freebsd/zfs/zfs_vnops_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  */
  28
  29 /* Portions Copyright 2007 Jeremy Teo */
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/param.h>
  33 #include <sys/time.h>
  34 #include <sys/systm.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/resource.h>
  37 #include <security/mac/mac_framework.h>
  38 #include <sys/vfs.h>
  39 #include <sys/endian.h>
  40 #include <sys/vm.h>
  41 #include <sys/vnode.h>
  42 #include <sys/smr.h>
  43 #include <sys/dirent.h>
  44 #include <sys/file.h>
  45 #include <sys/stat.h>
  46 #include <sys/kmem.h>
  47 #include <sys/taskq.h>
  48 #include <sys/uio.h>
  49 #include <sys/atomic.h>
  50 #include <sys/namei.h>
  51 #include <sys/mman.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/kdb.h>
  54 #include <sys/sysproto.h>
  55 #include <sys/errno.h>
  56 #include <sys/unistd.h>
  57 #include <sys/zfs_dir.h>
  58 #include <sys/zfs_ioctl.h>
  59 #include <sys/fs/zfs.h>
  60 #include <sys/dmu.h>
  61 #include <sys/dmu_objset.h>
  62 #include <sys/spa.h>
  63 #include <sys/txg.h>
  64 #include <sys/dbuf.h>
  65 #include <sys/zap.h>
  66 #include <sys/sa.h>
  67 #include <sys/policy.h>
  68 #include <sys/sunddi.h>
  69 #include <sys/filio.h>
  70 #include <sys/sid.h>
  71 #include <sys/zfs_ctldir.h>
  72 #include <sys/zfs_fuid.h>
  73 #include <sys/zfs_quota.h>
  74 #include <sys/zfs_sa.h>
  75 #include <sys/zfs_rlock.h>
  76 #include <sys/bio.h>
  77 #include <sys/buf.h>
  78 #include <sys/sched.h>
  79 #include <sys/acl.h>
  80 #include <sys/vmmeter.h>
  81 #include <vm/vm_param.h>
  82 #include <sys/zil.h>
  83 #include <sys/zfs_vnops.h>
  84 #include <sys/module.h>
  85 #include <sys/sysent.h>
  86 #include <sys/dmu_impl.h>
  87 #include <sys/brt.h>
  88 #include <sys/zfeature.h>
  89
  90 #include <vm/vm_object.h>
  91
  92 #include <sys/extattr.h>
  93 #include <sys/priv.h>
  94
   /*
    * Compatibility fallback: if the included headers do not define
    * VN_OPEN_INVFS, define it here with the value 0.
    * NOTE(review): presumably this lets the flag be OR-ed into an open-flags
    * word as a no-op where it is not defined -- confirm against the users of
    * the macro.
    */
   95 #ifndef VN_OPEN_INVFS
   96 #define VN_OPEN_INVFS   0x0
   97 #endif
  98
   /*
    * NOTE(review): VFS_SMR_DECLARE is defined outside this file; presumably
    * it declares the VFS SMR state referenced by SMR-protected vnode code --
    * confirm against the header that defines it.
    */
   99 VFS_SMR_DECLARE;
 100
  /*
   * VNCHECKREF(vp): reference-count sanity check for a vnode.
   *
   * With DEBUG_VFS_LOCKS defined it expands to a VNASSERT() that both
   * v_holdcnt and v_usecount of `vp` are greater than zero; otherwise it
   * expands to nothing.  Note that the debug expansion already ends with a
   * semicolon.
   */
  101 #ifdef DEBUG_VFS_LOCKS
  102 #define VNCHECKREF(vp)                            \
  103         VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,       \
  104             ("%s: wrong ref counts", __func__));
  105 #else
  106 #define VNCHECKREF(vp)
  107 #endif
 108
  /*
   * cookie_t is uint64_t when __FreeBSD_version is at least 1400045 and
   * ulong_t on older versions.
   * NOTE(review): presumably this tracks the type of directory-read cookies
   * expected by the kernel -- confirm against the users of cookie_t.
   */
  109 #if __FreeBSD_version >= 1400045
  110 typedef uint64_t cookie_t;
  111 #else
  112 typedef ulong_t cookie_t;
  113 #endif
 114
 115 /*
 116  * Programming rules.
 117  *
 118  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 119  * properly lock its in-core state, create a DMU transaction, do the work,
 120  * record this work in the intent log (ZIL), commit the DMU transaction,
 121  * and wait for the intent log to commit if it is a synchronous operation.
 122  * Moreover, the vnode ops must work in both normal and log replay context.
 123  * The ordering of events is important to avoid deadlocks and references
 124  * to freed memory.  The example below illustrates the following Big Rules:
 125  *
 126  *  (1) A check must be made in each zfs thread for a mounted file system.
 127  *      This is done avoiding races using zfs_enter(zfsvfs).
 128  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
 129  *      must be checked with zfs_verify_zp(zp).  Both of these macros
 130  *      can return EIO from the calling function.
 131  *
 132  *  (2) VN_RELE() should always be the last thing except for zil_commit()
 133  *      (if necessary) and zfs_exit(). This is for 3 reasons:
 134  *      First, if it's the last reference, the vnode/znode
 135  *      can be freed, so the zp may point to freed memory.  Second, the last
 136  *      reference will call zfs_zinactive(), which may induce a lot of work --
 137  *      pushing cached pages (which acquires range locks) and syncing out
 138  *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 139  *      which could deadlock the system if you were already holding one.
 140  *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 141  *
 142  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 143  *      as they can span dmu_tx_assign() calls.
 144  *
 145  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 146  *      dmu_tx_assign().  This is critical because we don't want to block
 147  *      while holding locks.
 148  *
 149  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
 150  *      reduces lock contention and CPU usage when we must wait (note that if
 151  *      throughput is constrained by the storage, nearly every transaction
 152  *      must wait).
 153  *
 154  *      Note, in particular, that if a lock is sometimes acquired before
 155  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 156  *      to use a non-blocking assign can deadlock the system.  The scenario:
 157  *
 158  *      Thread A has grabbed a lock before calling dmu_tx_assign().
 159  *      Thread B is in an already-assigned tx, and blocks for this lock.
 160  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 161  *      forever, because the previous txg can't quiesce until B's tx commits.
 162  *
 163  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 164  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 165  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
 166  *      to indicate that this operation has already called dmu_tx_wait().
 167  *      This will ensure that we don't retry forever, waiting a short bit
 168  *      each time.
 169  *
 170  *  (5) If the operation succeeded, generate the intent log entry for it
 171  *      before dropping locks.  This ensures that the ordering of events
 172  *      in the intent log matches the order in which they actually occurred.
 173  *      During ZIL replay the zfs_log_* functions will update the sequence
 174  *      number to indicate the zil transaction has replayed.
 175  *
 176  *  (6) At the end of each vnode op, the DMU tx must always commit,
 177  *      regardless of whether there were any errors.
 178  *
 179  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 180  *      to ensure that synchronous semantics are provided when necessary.
 181  *
 182  * In general, this is how things should be ordered in each vnode op:
 183  *
 184  *      zfs_enter(zfsvfs);              // exit if unmounted
 185  * top:
 186  *      zfs_dirent_lookup(&dl, ...)     // lock directory entry (may VN_HOLD())
 187  *      rw_enter(...);                  // grab any other locks you need
 188  *      tx = dmu_tx_create(...);        // get DMU tx
 189  *      dmu_tx_hold_*();                // hold each object you might modify
 190  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 191  *      if (error) {
 192  *              rw_exit(...);           // drop locks
 193  *              zfs_dirent_unlock(dl);  // unlock directory entry
 194  *              VN_RELE(...);           // release held vnodes
 195  *              if (error == ERESTART) {
 196  *                      waited = B_TRUE;
 197  *                      dmu_tx_wait(tx);
 198  *                      dmu_tx_abort(tx);
 199  *                      goto top;
 200  *              }
 201  *              dmu_tx_abort(tx);       // abort DMU tx
 202  *              zfs_exit(zfsvfs);       // finished in zfs
 203  *              return (error);         // really out of space
 204  *      }
 205  *      error = do_real_work();         // do whatever this VOP does
 206  *      if (error == 0)
 207  *              zfs_log_*(...);         // on success, make ZIL entry
 208  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 209  *      rw_exit(...);                   // drop locks
 210  *      zfs_dirent_unlock(dl);          // unlock directory entry
 211  *      VN_RELE(...);                   // release held vnodes
 212  *      zil_commit(zilog, foid);        // synchronous when necessary
 213  *      zfs_exit(zfsvfs);               // finished in zfs
 214  *      return (error);                 // done, report error
 215  */
 216 static int
 217 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 218 {
 219         (void) cr;
 220         znode_t *zp = VTOZ(*vpp);
 221         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 222         int error;
 223
 224         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 225                 return (error);
 226
 227         if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 228             ((flag & FAPPEND) == 0)) {
 229                 zfs_exit(zfsvfs, FTAG);
 230                 return (SET_ERROR(EPERM));
 231         }
 232
 233         /*
 234          * Keep a count of the synchronous opens in the znode.  On first
 235          * synchronous open we must convert all previous async transactions
 236          * into sync to keep correct ordering.
 237          */
 238         if (flag & O_SYNC) {
 239                 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
 240                         zil_async_to_sync(zfsvfs->z_log, zp->z_id);
 241         }
 242
 243         zfs_exit(zfsvfs, FTAG);
 244         return (0);
 245 }
 246
 247 static int
 248 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 249 {
 250         (void) offset, (void) cr;
 251         znode_t *zp = VTOZ(vp);
 252         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 253         int error;
 254
 255         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 256                 return (error);
 257
 258         /* Decrement the synchronous opens in the znode */
 259         if ((flag & O_SYNC) && (count == 1))
 260                 atomic_dec_32(&zp->z_sync_cnt);
 261
 262         zfs_exit(zfsvfs, FTAG);
 263         return (0);
 264 }
 265
 266 static int
 267 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
 268     int *rvalp)
 269 {
 270         (void) flag, (void) cred, (void) rvalp;
 271         loff_t off;
 272         int error;
 273
 274         switch (com) {
 275         case _FIOFFS:
 276         {
 277                 return (0);
 278
 279                 /*
 280                  * The following two ioctls are used by bfu.  Faking out,
 281                  * necessary to avoid bfu errors.
 282                  */
 283         }
 284         case _FIOGDIO:
 285         case _FIOSDIO:
 286         {
 287                 return (0);
 288         }
 289
 290         case F_SEEK_DATA:
 291         case F_SEEK_HOLE:
 292         {
 293                 off = *(offset_t *)data;
 294                 error = vn_lock(vp, LK_SHARED);
 295                 if (error)
 296                         return (error);
 297                 /* offset parameter is in/out */
 298                 error = zfs_holey(VTOZ(vp), com, &off);
 299                 VOP_UNLOCK(vp);
 300                 if (error)
 301                         return (error);
 302                 *(offset_t *)data = off;
 303                 return (0);
 304         }
 305         }
 306         return (SET_ERROR(ENOTTY));
 307 }
 308
 309 static vm_page_t
 310 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 311 {
 312         vm_object_t obj;
 313         vm_page_t pp;
 314         int64_t end;
 315
 316         /*
 317          * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 318          * aligned boundaries, if the range is not aligned.  As a result a
 319          * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 320          * It may happen that all DEV_BSIZE subranges are marked clean and thus
 321          * the whole page would be considered clean despite have some
 322          * dirty data.
 323          * For this reason we should shrink the range to DEV_BSIZE aligned
 324          * boundaries before calling vm_page_clear_dirty.
 325          */
 326         end = rounddown2(off + nbytes, DEV_BSIZE);
 327         off = roundup2(off, DEV_BSIZE);
 328         nbytes = end - off;
 329
 330         obj = vp->v_object;
 331         vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
 332             VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
 333             VM_ALLOC_IGN_SBUSY);
 334         if (pp != NULL) {
 335                 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 336                 vm_object_pip_add(obj, 1);
 337                 pmap_remove_write(pp);
 338                 if (nbytes != 0)
 339                         vm_page_clear_dirty(pp, off, nbytes);
 340         }
 341         return (pp);
 342 }
 343
 344 static void
 345 page_unbusy(vm_page_t pp)
 346 {
 347
 348         vm_page_sunbusy(pp);
 349         vm_object_pip_wakeup(pp->object);
 350 }
 351
 352 static vm_page_t
 353 page_hold(vnode_t *vp, int64_t start)
 354 {
 355         vm_object_t obj;
 356         vm_page_t m;
 357
 358         obj = vp->v_object;
 359         vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
 360             VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 361             VM_ALLOC_NOBUSY);
 362         return (m);
 363 }
 364
 365 static void
 366 page_unhold(vm_page_t pp)
 367 {
 368         vm_page_unwire(pp, PQ_ACTIVE);
 369 }
 370
 371 /*
 372  * When a file is memory mapped, we must keep the IO data synchronized
 373  * between the DMU cache and the memory mapped pages.  What this means:
 374  *
 375  * On Write:    If we find a memory mapped page, we write to *both*
 376  *              the page and the dmu buffer.
 377  */
 378 void
 379 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 380 {
 381         vm_object_t obj;
 382         struct sf_buf *sf;
 383         vnode_t *vp = ZTOV(zp);
 384         caddr_t va;
 385         int off;
 386
 387         ASSERT3P(vp->v_mount, !=, NULL);
 388         obj = vp->v_object;
 389         ASSERT3P(obj, !=, NULL);
 390
 391         off = start & PAGEOFFSET;
 392         vm_object_pip_add(obj, 1);
 393         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 394                 vm_page_t pp;
 395                 int nbytes = imin(PAGESIZE - off, len);
 396
 397                 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 398                         va = zfs_map_page(pp, &sf);
 399                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
 400                             va + off, DMU_READ_PREFETCH);
 401                         zfs_unmap_page(sf);
 402                         page_unbusy(pp);
 403                 }
 404                 len -= nbytes;
 405                 off = 0;
 406         }
 407         vm_object_pip_wakeup(obj);
 408 }
 409
 410 /*
 411  * Read with UIO_NOCOPY flag means that sendfile(2) requests
 412  * ZFS to populate a range of page cache pages with data.
 413  *
 414  * NOTE: this function could be optimized to pre-allocate
 415  * all pages in advance, drain exclusive busy on all of them,
 416  * map them into contiguous KVA region and populate them
 417  * in one single dmu_read() call.
 418  */
 419 int
 420 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
 421 {
 422         vnode_t *vp = ZTOV(zp);
 423         objset_t *os = zp->z_zfsvfs->z_os;
 424         struct sf_buf *sf;
 425         vm_object_t obj;
 426         vm_page_t pp;
 427         int64_t start;
 428         caddr_t va;
 429         int len = nbytes;
 430         int error = 0;
 431
 432         ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
 433         ASSERT3P(vp->v_mount, !=, NULL);
 434         obj = vp->v_object;
 435         ASSERT3P(obj, !=, NULL);
 436         ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
 437
 438         for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
 439                 int bytes = MIN(PAGESIZE, len);
 440
 441                 pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
 442                     VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 443                 if (vm_page_none_valid(pp)) {
 444                         va = zfs_map_page(pp, &sf);
 445                         error = dmu_read(os, zp->z_id, start, bytes, va,
 446                             DMU_READ_PREFETCH);
 447                         if (bytes != PAGESIZE && error == 0)
 448                                 memset(va + bytes, 0, PAGESIZE - bytes);
 449                         zfs_unmap_page(sf);
 450                         if (error == 0) {
 451                                 vm_page_valid(pp);
 452                                 vm_page_activate(pp);
 453                                 vm_page_sunbusy(pp);
 454                         } else {
 455                                 zfs_vmobject_wlock(obj);
 456                                 if (!vm_page_wired(pp) && pp->valid == 0 &&
 457                                     vm_page_busy_tryupgrade(pp))
 458                                         vm_page_free(pp);
 459                                 else {
 460                                         vm_page_deactivate_noreuse(pp);
 461                                         vm_page_sunbusy(pp);
 462                                 }
 463                                 zfs_vmobject_wunlock(obj);
 464                         }
 465                 } else {
 466                         ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 467                         vm_page_sunbusy(pp);
 468                 }
 469                 if (error)
 470                         break;
 471                 zfs_uio_advance(uio, bytes);
 472                 len -= bytes;
 473         }
 474         return (error);
 475 }
 476
 477 /*
 478  * When a file is memory mapped, we must keep the IO data synchronized
 479  * between the DMU cache and the memory mapped pages.  What this means:
 480  *
 481  * On Read:     We "read" preferentially from memory mapped pages,
 482  *              else we default from the dmu buffer.
 483  *
 484  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 485  *       the file is memory mapped.
 486  */
 487 int
 488 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 489 {
 490         vnode_t *vp = ZTOV(zp);
 491         vm_object_t obj;
 492         int64_t start;
 493         int len = nbytes;
 494         int off;
 495         int error = 0;
 496
 497         ASSERT3P(vp->v_mount, !=, NULL);
 498         obj = vp->v_object;
 499         ASSERT3P(obj, !=, NULL);
 500
 501         start = zfs_uio_offset(uio);
 502         off = start & PAGEOFFSET;
 503         for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 504                 vm_page_t pp;
 505                 uint64_t bytes = MIN(PAGESIZE - off, len);
 506
 507                 if ((pp = page_hold(vp, start))) {
 508                         struct sf_buf *sf;
 509                         caddr_t va;
 510
 511                         va = zfs_map_page(pp, &sf);
 512                         error = vn_io_fault_uiomove(va + off, bytes,
 513                             GET_UIO_STRUCT(uio));
 514                         zfs_unmap_page(sf);
 515                         page_unhold(pp);
 516                 } else {
 517                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 518                             uio, bytes);
 519                 }
 520                 len -= bytes;
 521                 off = 0;
 522                 if (error)
 523                         break;
 524         }
 525         return (error);
 526 }
 527
 528 int
 529 zfs_write_simple(znode_t *zp, const void *data, size_t len,
 530     loff_t pos, size_t *presid)
 531 {
 532         int error = 0;
 533         ssize_t resid;
 534
 535         error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
 536             UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
 537
 538         if (error) {
 539                 return (SET_ERROR(error));
 540         } else if (presid == NULL) {
 541                 if (resid != 0) {
 542                         error = SET_ERROR(EIO);
 543                 }
 544         } else {
 545                 *presid = resid;
 546         }
 547         return (error);
 548 }
 549
 550 void
 551 zfs_zrele_async(znode_t *zp)
 552 {
 553         vnode_t *vp = ZTOV(zp);
 554         objset_t *os = ITOZSB(vp)->z_os;
 555
 556         VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
 557 }
 558
 559 static int
 560 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 561 {
 562         int error;
 563
 564         *vpp = arg;
 565         error = vn_lock(*vpp, lkflags);
 566         if (error != 0)
 567                 vrele(*vpp);
 568         return (error);
 569 }
 570
 571 static int
 572 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 573 {
 574         znode_t *zdp = VTOZ(dvp);
 575         zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
 576         int error;
 577         int ltype;
 578
 579         if (zfsvfs->z_replay == B_FALSE)
 580                 ASSERT_VOP_LOCKED(dvp, __func__);
 581
 582         if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 583                 ASSERT3P(dvp, ==, vp);
 584                 vref(dvp);
 585                 ltype = lkflags & LK_TYPE_MASK;
 586                 if (ltype != VOP_ISLOCKED(dvp)) {
 587                         if (ltype == LK_EXCLUSIVE)
 588                                 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 589                         else /* if (ltype == LK_SHARED) */
 590                                 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 591
 592                         /*
 593                          * Relock for the "." case could leave us with
 594                          * reclaimed vnode.
 595                          */
 596                         if (VN_IS_DOOMED(dvp)) {
 597                                 vrele(dvp);
 598                                 return (SET_ERROR(ENOENT));
 599                         }
 600                 }
 601                 return (0);
 602         } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 603                 /*
 604                  * Note that in this case, dvp is the child vnode, and we
 605                  * are looking up the parent vnode - exactly reverse from
 606                  * normal operation.  Unlocking dvp requires some rather
 607                  * tricky unlock/relock dance to prevent mp from being freed;
 608                  * use vn_vget_ino_gen() which takes care of all that.
 609                  *
 610                  * XXX Note that there is a time window when both vnodes are
 611                  * unlocked.  It is possible, although highly unlikely, that
 612                  * during that window the parent-child relationship between
 613                  * the vnodes may change, for example, get reversed.
 614                  * In that case we would have a wrong lock order for the vnodes.
 615                  * All other filesystems seem to ignore this problem, so we
 616                  * do the same here.
 617                  * A potential solution could be implemented as follows:
 618                  * - using LK_NOWAIT when locking the second vnode and retrying
 619                  *   if necessary
 620                  * - checking that the parent-child relationship still holds
 621                  *   after locking both vnodes and retrying if it doesn't
 622                  */
 623                 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
 624                 return (error);
 625         } else {
 626                 error = vn_lock(vp, lkflags);
 627                 if (error != 0)
 628                         vrele(vp);
 629                 return (error);
 630         }
 631 }
 632
 633 /*
 634  * Lookup an entry in a directory, or an extended attribute directory.
 635  * If it exists, return a held vnode reference for it.
 636  *
  637  *      IN:     dvp     - vnode of directory to search.
  638  *              nm      - name of entry to lookup.
  639  *              cnp     - componentname of the lookup (flags, lock flags).
  640  *              nameiop - name operation (e.g. LOOKUP, CREATE, RENAME).
  641  *              cr      - credentials of caller.
  642  *              flags   - LOOKUP_XATTR set if looking for an attribute.
  643  *              cached  - set when coming in via VOP_CACHEDLOOKUP.
 644  *
 645  *      OUT:    vpp     - vnode of located entry, NULL if not found.
 646  *
 647  *      RETURN: 0 on success, error code on failure.
 648  *
 649  * Timestamps:
 650  *      NA
 651  */
 652 static int
 653 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
 654     struct componentname *cnp, int nameiop, cred_t *cr, int flags,
 655     boolean_t cached)
 656 {
 657         znode_t *zdp = VTOZ(dvp);
 658         znode_t *zp;
 659         zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 660         seqc_t dvp_seqc;
 661         int     error = 0;
 662
 663         /*
 664          * Fast path lookup, however we must skip DNLC lookup
 665          * for case folding or normalizing lookups because the
 666          * DNLC code only stores the passed in name.  This means
 667          * creating 'a' and removing 'A' on a case insensitive
 668          * file system would work, but DNLC still thinks 'a'
 669          * exists and won't let you create it again on the next
 670          * pass through fast path.
 671          */
 672         if (!(flags & LOOKUP_XATTR)) {
 673                 if (dvp->v_type != VDIR) {
 674                         return (SET_ERROR(ENOTDIR));
 675                 } else if (zdp->z_sa_hdl == NULL) {
 676                         return (SET_ERROR(EIO));
 677                 }
 678         }
 679
 680         DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
 681             const char *, nm);
 682
 683         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 684                 return (error);
 685
 686         dvp_seqc = vn_seqc_read_notmodify(dvp);
 687
 688         *vpp = NULL;
 689
 690         if (flags & LOOKUP_XATTR) {
 691                 /*
 692                  * If the xattr property is off, refuse the lookup request.
 693                  */
 694                 if (!(zfsvfs->z_flags & ZSB_XATTR)) {
 695                         zfs_exit(zfsvfs, FTAG);
 696                         return (SET_ERROR(EOPNOTSUPP));
 697                 }
 698
 699                 /*
 700                  * We don't allow recursive attributes..
 701                  * Maybe someday we will.
 702                  */
 703                 if (zdp->z_pflags & ZFS_XATTR) {
 704                         zfs_exit(zfsvfs, FTAG);
 705                         return (SET_ERROR(EINVAL));
 706                 }
 707
 708                 if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
 709                         zfs_exit(zfsvfs, FTAG);
 710                         return (error);
 711                 }
 712                 *vpp = ZTOV(zp);
 713
 714                 /*
 715                  * Do we have permission to get into attribute directory?
 716                  */
 717                 error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
 718                 if (error) {
 719                         vrele(ZTOV(zp));
 720                 }
 721
 722                 zfs_exit(zfsvfs, FTAG);
 723                 return (error);
 724         }
 725
 726         /*
 727          * Check accessibility of directory if we're not coming in via
 728          * VOP_CACHEDLOOKUP.
 729          */
 730         if (!cached) {
 731 #ifdef NOEXECCHECK
 732                 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 733                         cnp->cn_flags &= ~NOEXECCHECK;
 734                 } else
 735 #endif
 736                 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 737                     NULL))) {
 738                         zfs_exit(zfsvfs, FTAG);
 739                         return (error);
 740                 }
 741         }
 742
 743         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 744             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 745                 zfs_exit(zfsvfs, FTAG);
 746                 return (SET_ERROR(EILSEQ));
 747         }
 748
 749
 750         /*
 751          * First handle the special cases.
 752          */
 753         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 754                 /*
 755                  * If we are a snapshot mounted under .zfs, return
 756                  * the vp for the snapshot directory.
 757                  */
 758                 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 759                         struct componentname cn;
 760                         vnode_t *zfsctl_vp;
 761                         int ltype;
 762
 763                         zfs_exit(zfsvfs, FTAG);
 764                         ltype = VOP_ISLOCKED(dvp);
 765                         VOP_UNLOCK(dvp);
 766                         error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 767                             &zfsctl_vp);
 768                         if (error == 0) {
 769                                 cn.cn_nameptr = "snapshot";
 770                                 cn.cn_namelen = strlen(cn.cn_nameptr);
 771                                 cn.cn_nameiop = cnp->cn_nameiop;
 772                                 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 773                                 cn.cn_lkflags = cnp->cn_lkflags;
 774                                 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 775                                 vput(zfsctl_vp);
 776                         }
 777                         vn_lock(dvp, ltype | LK_RETRY);
 778                         return (error);
 779                 }
 780         }
 781         if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 782                 zfs_exit(zfsvfs, FTAG);
 783                 if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED)
 784                         return (SET_ERROR(ENOENT));
 785                 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 786                         return (SET_ERROR(ENOTSUP));
 787                 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 788                 return (error);
 789         }
 790
 791         /*
  792          * The loop retries the lookup if the parent-child relationship
 793          * changes during the dot-dot locking complexities.
 794          */
 795         for (;;) {
 796                 uint64_t parent;
 797
 798                 error = zfs_dirlook(zdp, nm, &zp);
 799                 if (error == 0)
 800                         *vpp = ZTOV(zp);
 801
 802                 zfs_exit(zfsvfs, FTAG);
 803                 if (error != 0)
 804                         break;
 805
 806                 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 807                 if (error != 0) {
 808                         /*
 809                          * If we've got a locking error, then the vnode
 810                          * got reclaimed because of a force unmount.
 811                          * We never enter doomed vnodes into the name cache.
 812                          */
 813                         *vpp = NULL;
 814                         return (error);
 815                 }
 816
 817                 if ((cnp->cn_flags & ISDOTDOT) == 0)
 818                         break;
 819
 820                 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
 821                         vput(ZTOV(zp));
 822                         *vpp = NULL;
 823                         return (error);
 824                 }
 825                 if (zdp->z_sa_hdl == NULL) {
 826                         error = SET_ERROR(EIO);
 827                 } else {
 828                         error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 829                             &parent, sizeof (parent));
 830                 }
 831                 if (error != 0) {
 832                         zfs_exit(zfsvfs, FTAG);
 833                         vput(ZTOV(zp));
 834                         break;
 835                 }
 836                 if (zp->z_id == parent) {
 837                         zfs_exit(zfsvfs, FTAG);
 838                         break;
 839                 }
 840                 vput(ZTOV(zp));
 841         }
 842
 843         if (error != 0)
 844                 *vpp = NULL;
 845
 846         /* Translate errors and add SAVENAME when needed. */
 847         if (cnp->cn_flags & ISLASTCN) {
 848                 switch (nameiop) {
 849                 case CREATE:
 850                 case RENAME:
 851                         if (error == ENOENT) {
 852                                 error = EJUSTRETURN;
 853 #if __FreeBSD_version < 1400068
 854                                 cnp->cn_flags |= SAVENAME;
 855 #endif
 856                                 break;
 857                         }
 858                         zfs_fallthrough;
 859                 case DELETE:
 860 #if __FreeBSD_version < 1400068
 861                         if (error == 0)
 862                                 cnp->cn_flags |= SAVENAME;
 863 #endif
 864                         break;
 865                 }
 866         }
 867
 868         if ((cnp->cn_flags & ISDOTDOT) != 0) {
 869                 /*
 870                  * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
 871                  * handle races. In particular different callers may end up
 872                  * with different vnodes and will try to add conflicting
 873                  * entries to the namecache.
 874                  *
 875                  * While finding different result may be acceptable in face
 876                  * of concurrent modification, adding conflicting entries
 877                  * trips over an assert in the namecache.
 878                  *
 879                  * Ultimately let an entry through once everything settles.
 880                  */
 881                 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 882                         cnp->cn_flags &= ~MAKEENTRY;
 883                 }
 884         }
 885
 886         /* Insert name into cache (as non-existent) if appropriate. */
 887         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 888             error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 889                 cache_enter(dvp, NULL, cnp);
 890
 891         /* Insert name into cache if appropriate. */
 892         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 893             error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 894                 if (!(cnp->cn_flags & ISLASTCN) ||
 895                     (nameiop != DELETE && nameiop != RENAME)) {
 896                         cache_enter(dvp, *vpp, cnp);
 897                 }
 898         }
 899
 900         return (error);
 901 }
 902
 903 static inline bool
 904 is_nametoolong(zfsvfs_t *zfsvfs, const char *name)
 905 {
 906         size_t dlen = strlen(name);
 907         return ((!zfsvfs->z_longname && dlen >= ZAP_MAXNAMELEN) ||
 908             dlen >= ZAP_MAXNAMELEN_NEW);
 909 }
 910
 911 /*
 912  * Attempt to create a new entry in a directory.  If the entry
 913  * already exists, truncate the file if permissible, else return
 914  * an error.  Return the vp of the created or trunc'd file.
 915  *
 916  *      IN:     dvp     - vnode of directory to put new file entry in.
 917  *              name    - name of new file entry.
 918  *              vap     - attributes of new file.
 919  *              excl    - flag indicating exclusive or non-exclusive mode.
 920  *              mode    - mode to open file with.
 921  *              cr      - credentials of caller.
 922  *              flag    - large file flag [UNUSED].
 923  *              ct      - caller context
 924  *              vsecp   - ACL to be set
 925  *              mnt_ns  - Unused on FreeBSD
 926  *
 927  *      OUT:    vpp     - vnode of created or trunc'd entry.
 928  *
 929  *      RETURN: 0 on success, error code on failure.
 930  *
 931  * Timestamps:
 932  *      dvp - ctime|mtime updated if new entry created
 933  *       vp - ctime|mtime always, atime if new
 934  */
 935 int
 936 zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 937     znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 938 {
 939         (void) excl, (void) mode, (void) flag;
 940         znode_t         *zp;
 941         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
 942         zilog_t         *zilog;
 943         objset_t        *os;
 944         dmu_tx_t        *tx;
 945         int             error;
 946         uid_t           uid = crgetuid(cr);
 947         gid_t           gid = crgetgid(cr);
 948         uint64_t        projid = ZFS_DEFAULT_PROJID;
 949         zfs_acl_ids_t   acl_ids;
 950         boolean_t       fuid_dirtied;
 951         uint64_t        txtype;
 952 #ifdef DEBUG_VFS_LOCKS
 953         vnode_t *dvp = ZTOV(dzp);
 954 #endif
 955
 956         if (is_nametoolong(zfsvfs, name))
 957                 return (SET_ERROR(ENAMETOOLONG));
 958
 959         /*
 960          * If we have an ephemeral id, ACL, or XVATTR then
 961          * make sure file system is at proper version
 962          */
 963         if (zfsvfs->z_use_fuids == B_FALSE &&
 964             (vsecp || (vap->va_mask & AT_XVATTR) ||
 965             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 966                 return (SET_ERROR(EINVAL));
 967
 968         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 969                 return (error);
 970         os = zfsvfs->z_os;
 971         zilog = zfsvfs->z_log;
 972
 973         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 974             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 975                 zfs_exit(zfsvfs, FTAG);
 976                 return (SET_ERROR(EILSEQ));
 977         }
 978
 979         if (vap->va_mask & AT_XVATTR) {
 980                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
 981                     crgetuid(cr), cr, vap->va_type)) != 0) {
 982                         zfs_exit(zfsvfs, FTAG);
 983                         return (error);
 984                 }
 985         }
 986
 987         *zpp = NULL;
 988
 989         if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 990                 vap->va_mode &= ~S_ISVTX;
 991
 992         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 993         if (error) {
 994                 zfs_exit(zfsvfs, FTAG);
 995                 return (error);
 996         }
 997         ASSERT3P(zp, ==, NULL);
 998
 999         /*
1000          * Create a new file object and update the directory
1001          * to reference it.
1002          */
1003         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
1004                 goto out;
1005         }
1006
1007         /*
1008          * We only support the creation of regular files in
1009          * extended attribute directories.
1010          */
1011
1012         if ((dzp->z_pflags & ZFS_XATTR) &&
1013             (vap->va_type != VREG)) {
1014                 error = SET_ERROR(EINVAL);
1015                 goto out;
1016         }
1017
1018         if ((error = zfs_acl_ids_create(dzp, 0, vap,
1019             cr, vsecp, &acl_ids, NULL)) != 0)
1020                 goto out;
1021
1022         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1023                 projid = zfs_inherit_projid(dzp);
1024         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1025                 zfs_acl_ids_free(&acl_ids);
1026                 error = SET_ERROR(EDQUOT);
1027                 goto out;
1028         }
1029
1030         getnewvnode_reserve();
1031
1032         tx = dmu_tx_create(os);
1033
1034         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1035             ZFS_SA_BASE_ATTR_SIZE);
1036
1037         fuid_dirtied = zfsvfs->z_fuid_dirty;
1038         if (fuid_dirtied)
1039                 zfs_fuid_txhold(zfsvfs, tx);
1040         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1041         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1042         if (!zfsvfs->z_use_sa &&
1043             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1044                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1045                     0, acl_ids.z_aclp->z_acl_bytes);
1046         }
1047         error = dmu_tx_assign(tx, TXG_WAIT);
1048         if (error) {
1049                 zfs_acl_ids_free(&acl_ids);
1050                 dmu_tx_abort(tx);
1051                 getnewvnode_drop_reserve();
1052                 zfs_exit(zfsvfs, FTAG);
1053                 return (error);
1054         }
1055         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1056
1057         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
1058         if (error != 0) {
1059                 /*
1060                  * Since, we failed to add the directory entry for it,
1061                  * delete the newly created dnode.
1062                  */
1063                 zfs_znode_delete(zp, tx);
1064                 VOP_UNLOCK(ZTOV(zp));
1065                 zrele(zp);
1066                 zfs_acl_ids_free(&acl_ids);
1067                 dmu_tx_commit(tx);
1068                 getnewvnode_drop_reserve();
1069                 goto out;
1070         }
1071
1072         if (fuid_dirtied)
1073                 zfs_fuid_sync(zfsvfs, tx);
1074
1075         txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1076         zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1077             vsecp, acl_ids.z_fuidp, vap);
1078         zfs_acl_ids_free(&acl_ids);
1079         dmu_tx_commit(tx);
1080
1081         getnewvnode_drop_reserve();
1082
1083 out:
1084         VNCHECKREF(dvp);
1085         if (error == 0) {
1086                 *zpp = zp;
1087         }
1088
1089         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1090                 zil_commit(zilog, 0);
1091
1092         zfs_exit(zfsvfs, FTAG);
1093         return (error);
1094 }
1095
1096 /*
1097  * Remove an entry from a directory.
1098  *
1099  *      IN:     dvp     - vnode of directory to remove entry from.
1100  *              name    - name of entry to remove.
1101  *              cr      - credentials of caller.
1102  *              ct      - caller context
1103  *              flags   - case flags
1104  *
1105  *      RETURN: 0 on success, error code on failure.
1106  *
1107  * Timestamps:
1108  *      dvp - ctime|mtime
1109  *       vp - ctime (if nlink > 0)
1110  */
1111 static int
1112 zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1113 {
1114         znode_t         *dzp = VTOZ(dvp);
1115         znode_t         *zp;
1116         znode_t         *xzp;
1117         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1118         zilog_t         *zilog;
1119         uint64_t        xattr_obj;
1120         uint64_t        obj = 0;
1121         dmu_tx_t        *tx;
1122         boolean_t       unlinked;
1123         uint64_t        txtype;
1124         int             error;
1125
1126
1127         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1128                 return (error);
1129         zp = VTOZ(vp);
1130         if ((error = zfs_verify_zp(zp)) != 0) {
1131                 zfs_exit(zfsvfs, FTAG);
1132                 return (error);
1133         }
1134         zilog = zfsvfs->z_log;
1135
1136         xattr_obj = 0;
1137         xzp = NULL;
1138
1139         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1140                 goto out;
1141         }
1142
1143         /*
1144          * Need to use rmdir for removing directories.
1145          */
1146         if (vp->v_type == VDIR) {
1147                 error = SET_ERROR(EPERM);
1148                 goto out;
1149         }
1150
1151         vnevent_remove(vp, dvp, name, ct);
1152
1153         obj = zp->z_id;
1154
1155         /* are there any extended attributes? */
1156         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1157             &xattr_obj, sizeof (xattr_obj));
1158         if (error == 0 && xattr_obj) {
1159                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1160                 ASSERT0(error);
1161         }
1162
1163         /*
1164          * We may delete the znode now, or we may put it in the unlinked set;
1165          * it depends on whether we're the last link, and on whether there are
1166          * other holds on the vnode.  So we dmu_tx_hold() the right things to
1167          * allow for either case.
1168          */
1169         tx = dmu_tx_create(zfsvfs->z_os);
1170         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1171         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1172         zfs_sa_upgrade_txholds(tx, zp);
1173         zfs_sa_upgrade_txholds(tx, dzp);
1174
1175         if (xzp) {
1176                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1177                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1178         }
1179
1180         /* charge as an update -- would be nice not to charge at all */
1181         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1182
1183         /*
1184          * Mark this transaction as typically resulting in a net free of space
1185          */
1186         dmu_tx_mark_netfree(tx);
1187
1188         error = dmu_tx_assign(tx, TXG_WAIT);
1189         if (error) {
1190                 dmu_tx_abort(tx);
1191                 zfs_exit(zfsvfs, FTAG);
1192                 return (error);
1193         }
1194
1195         /*
1196          * Remove the directory entry.
1197          */
1198         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1199
1200         if (error) {
1201                 dmu_tx_commit(tx);
1202                 goto out;
1203         }
1204
1205         if (unlinked) {
1206                 zfs_unlinked_add(zp, tx);
1207                 vp->v_vflag |= VV_NOSYNC;
1208         }
1209         /* XXX check changes to linux vnops */
1210         txtype = TX_REMOVE;
1211         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1212
1213         dmu_tx_commit(tx);
1214 out:
1215
1216         if (xzp)
1217                 vrele(ZTOV(xzp));
1218
1219         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1220                 zil_commit(zilog, 0);
1221
1222
1223         zfs_exit(zfsvfs, FTAG);
1224         return (error);
1225 }
1226
1227
1228 static int
1229 zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
1230     struct componentname *cnp, int nameiop)
1231 {
1232         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1233         int error;
1234
1235         cnp->cn_nameptr = __DECONST(char *, name);
1236         cnp->cn_namelen = strlen(name);
1237         cnp->cn_nameiop = nameiop;
1238         cnp->cn_flags = ISLASTCN;
1239 #if __FreeBSD_version < 1400068
1240         cnp->cn_flags |= SAVENAME;
1241 #endif
1242         cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
1243         cnp->cn_cred = kcred;
1244 #if __FreeBSD_version < 1400037
1245         cnp->cn_thread = curthread;
1246 #endif
1247
1248         if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
1249                 struct vop_lookup_args a;
1250
1251                 a.a_gen.a_desc = &vop_lookup_desc;
1252                 a.a_dvp = ZTOV(dzp);
1253                 a.a_vpp = vpp;
1254                 a.a_cnp = cnp;
1255                 error = vfs_cache_lookup(&a);
1256         } else {
1257                 error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
1258                     B_FALSE);
1259         }
1260 #ifdef ZFS_DEBUG
1261         if (error) {
1262                 printf("got error %d on name %s on op %d\n", error, name,
1263                     nameiop);
1264                 kdb_backtrace();
1265         }
1266 #endif
1267         return (error);
1268 }
1269
1270 int
1271 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
1272 {
1273         vnode_t *vp;
1274         int error;
1275         struct componentname cn;
1276
1277         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1278                 return (error);
1279
1280         error = zfs_remove_(ZTOV(dzp), vp, name, cr);
1281         vput(vp);
1282         return (error);
1283 }
1284 /*
1285  * Create a new directory and insert it into dvp using the name
1286  * provided.  Return a pointer to the inserted directory.
1287  *
1288  *      IN:     dvp     - vnode of directory to add subdir to.
1289  *              dirname - name of new directory.
1290  *              vap     - attributes of new directory.
1291  *              cr      - credentials of caller.
1292  *              ct      - caller context
1293  *              flags   - case flags
1294  *              vsecp   - ACL to be set
1295  *              mnt_ns  - Unused on FreeBSD
1296  *
1297  *      OUT:    vpp     - vnode of created directory.
1298  *
1299  *      RETURN: 0 on success, error code on failure.
1300  *
1301  * Timestamps:
1302  *      dvp - ctime|mtime updated
1303  *       vp - ctime|mtime|atime updated
1304  */
1305 int
1306 zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
1307     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1308 {
1309         (void) flags, (void) vsecp;
1310         znode_t         *zp;
1311         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1312         zilog_t         *zilog;
1313         uint64_t        txtype;
1314         dmu_tx_t        *tx;
1315         int             error;
1316         uid_t           uid = crgetuid(cr);
1317         gid_t           gid = crgetgid(cr);
1318         zfs_acl_ids_t   acl_ids;
1319         boolean_t       fuid_dirtied;
1320
1321         ASSERT3U(vap->va_type, ==, VDIR);
1322
1323         if (is_nametoolong(zfsvfs, dirname))
1324                 return (SET_ERROR(ENAMETOOLONG));
1325
1326         /*
1327          * If we have an ephemeral id, ACL, or XVATTR then
1328          * make sure file system is at proper version
1329          */
1330         if (zfsvfs->z_use_fuids == B_FALSE &&
1331             ((vap->va_mask & AT_XVATTR) ||
1332             IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1333                 return (SET_ERROR(EINVAL));
1334
1335         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1336                 return (error);
1337         zilog = zfsvfs->z_log;
1338
1339         if (dzp->z_pflags & ZFS_XATTR) {
1340                 zfs_exit(zfsvfs, FTAG);
1341                 return (SET_ERROR(EINVAL));
1342         }
1343
1344         if (zfsvfs->z_utf8 && u8_validate(dirname,
1345             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1346                 zfs_exit(zfsvfs, FTAG);
1347                 return (SET_ERROR(EILSEQ));
1348         }
1349
1350         if (vap->va_mask & AT_XVATTR) {
1351                 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
1352                     crgetuid(cr), cr, vap->va_type)) != 0) {
1353                         zfs_exit(zfsvfs, FTAG);
1354                         return (error);
1355                 }
1356         }
1357
1358         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1359             NULL, &acl_ids, NULL)) != 0) {
1360                 zfs_exit(zfsvfs, FTAG);
1361                 return (error);
1362         }
1363
1364         /*
1365          * First make sure the new directory doesn't exist.
1366          *
1367          * Existence is checked first to make sure we don't return
1368          * EACCES instead of EEXIST which can cause some applications
1369          * to fail.
1370          */
1371         *zpp = NULL;
1372
1373         if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
1374                 zfs_acl_ids_free(&acl_ids);
1375                 zfs_exit(zfsvfs, FTAG);
1376                 return (error);
1377         }
1378         ASSERT3P(zp, ==, NULL);
1379
1380         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1381             mnt_ns))) {
1382                 zfs_acl_ids_free(&acl_ids);
1383                 zfs_exit(zfsvfs, FTAG);
1384                 return (error);
1385         }
1386
1387         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1388                 zfs_acl_ids_free(&acl_ids);
1389                 zfs_exit(zfsvfs, FTAG);
1390                 return (SET_ERROR(EDQUOT));
1391         }
1392
1393         /*
1394          * Add a new entry to the directory.
1395          */
1396         getnewvnode_reserve();
1397         tx = dmu_tx_create(zfsvfs->z_os);
1398         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1399         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1400         fuid_dirtied = zfsvfs->z_fuid_dirty;
1401         if (fuid_dirtied)
1402                 zfs_fuid_txhold(zfsvfs, tx);
1403         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1404                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1405                     acl_ids.z_aclp->z_acl_bytes);
1406         }
1407
1408         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1409             ZFS_SA_BASE_ATTR_SIZE);
1410
1411         error = dmu_tx_assign(tx, TXG_WAIT);
1412         if (error) {
1413                 zfs_acl_ids_free(&acl_ids);
1414                 dmu_tx_abort(tx);
1415                 getnewvnode_drop_reserve();
1416                 zfs_exit(zfsvfs, FTAG);
1417                 return (error);
1418         }
1419
1420         /*
1421          * Create new node.
1422          */
1423         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1424
1425         /*
1426          * Now put new name in parent dir.
1427          */
1428         error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
1429         if (error != 0) {
1430                 zfs_znode_delete(zp, tx);
1431                 VOP_UNLOCK(ZTOV(zp));
1432                 zrele(zp);
1433                 goto out;
1434         }
1435
1436         if (fuid_dirtied)
1437                 zfs_fuid_sync(zfsvfs, tx);
1438
1439         *zpp = zp;
1440
1441         txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
1442         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
1443             acl_ids.z_fuidp, vap);
1444
1445 out:
1446         zfs_acl_ids_free(&acl_ids);
1447
1448         dmu_tx_commit(tx);
1449
1450         getnewvnode_drop_reserve();
1451
1452         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1453                 zil_commit(zilog, 0);
1454
1455         zfs_exit(zfsvfs, FTAG);
1456         return (error);
1457 }
1458
1459 /*
1460  * Remove a directory subdir entry.  If the current working
1461  * directory is the same as the subdir to be removed, the
1462  * remove will fail.
1463  *
1464  *      IN:     dvp     - vnode of directory to remove from.
1465  *              name    - name of directory to be removed.
1466  *              cwd     - vnode of current working directory.
1467  *              cr      - credentials of caller.
1468  *              ct      - caller context
1469  *              flags   - case flags
1470  *
1471  *      RETURN: 0 on success, error code on failure.
1472  *
1473  * Timestamps:
1474  *      dvp - ctime|mtime updated
1475  */
1476 static int
1477 zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1478 {
1479         znode_t         *dzp = VTOZ(dvp);
1480         znode_t         *zp = VTOZ(vp);
1481         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1482         zilog_t         *zilog;
1483         dmu_tx_t        *tx;
1484         int             error;
1485
1486         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1487                 return (error);
1488         if ((error = zfs_verify_zp(zp)) != 0) {
1489                 zfs_exit(zfsvfs, FTAG);
1490                 return (error);
1491         }
1492         zilog = zfsvfs->z_log;
1493
1494
1495         if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1496                 goto out;
1497         }
1498
1499         if (vp->v_type != VDIR) {
1500                 error = SET_ERROR(ENOTDIR);
1501                 goto out;
1502         }
1503
1504         vnevent_rmdir(vp, dvp, name, ct);
1505
1506         tx = dmu_tx_create(zfsvfs->z_os);
1507         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1508         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1509         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1510         zfs_sa_upgrade_txholds(tx, zp);
1511         zfs_sa_upgrade_txholds(tx, dzp);
1512         dmu_tx_mark_netfree(tx);
1513         error = dmu_tx_assign(tx, TXG_WAIT);
1514         if (error) {
1515                 dmu_tx_abort(tx);
1516                 zfs_exit(zfsvfs, FTAG);
1517                 return (error);
1518         }
1519
1520         error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
1521
1522         if (error == 0) {
1523                 uint64_t txtype = TX_RMDIR;
1524                 zfs_log_remove(zilog, tx, txtype, dzp, name,
1525                     ZFS_NO_OBJECT, B_FALSE);
1526         }
1527
1528         dmu_tx_commit(tx);
1529
1530         if (zfsvfs->z_use_namecache)
1531                 cache_vop_rmdir(dvp, vp);
1532 out:
1533         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1534                 zil_commit(zilog, 0);
1535
1536         zfs_exit(zfsvfs, FTAG);
1537         return (error);
1538 }
1539
1540 int
1541 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
1542 {
1543         struct componentname cn;
1544         vnode_t *vp;
1545         int error;
1546
1547         if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1548                 return (error);
1549
1550         error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
1551         vput(vp);
1552         return (error);
1553 }
1554
1555 /*
1556  * Read as many directory entries as will fit into the provided
1557  * buffer from the given directory cursor position (specified in
1558  * the uio structure).
1559  *
1560  *      IN:     vp      - vnode of directory to read.
1561  *              uio     - structure supplying read location, range info,
1562  *                        and return buffer.
1563  *              cr      - credentials of caller.
1564  *              ct      - caller context
1565  *
1566  *      OUT:    uio     - updated offset and range, buffer filled.
1567  *              eofp    - set to true if end-of-file detected.
1568  *              ncookies- number of entries in cookies
1569  *              cookies - offsets to directory entries
1570  *
1571  *      RETURN: 0 on success, error code on failure.
1572  *
1573  * Timestamps:
1574  *      vp - atime updated
1575  *
1576  * Note that the low 4 bits of the cookie returned by zap is always zero.
1577  * This allows us to use the low range for "special" directory entries:
1578  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1579  * we use the offset 2 for the '.zfs' directory.
1580  */
1581 static int
1582 zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
1583     int *ncookies, cookie_t **cookies)
1584 {
1585         znode_t         *zp = VTOZ(vp);
1586         iovec_t         *iovp;
1587         dirent64_t      *odp;
1588         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
1589         objset_t        *os;
1590         caddr_t         outbuf;
1591         size_t          bufsize;
1592         zap_cursor_t    zc;
1593         zap_attribute_t *zap;
1594         uint_t          bytes_wanted;
1595         uint64_t        offset; /* must be unsigned; checks for < 1 */
1596         uint64_t        parent;
1597         int             local_eof;
1598         int             outcount;
1599         int             error;
1600         uint8_t         prefetch;
1601         uint8_t         type;
1602         int             ncooks;
1603         cookie_t        *cooks = NULL;
1604
1605         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1606                 return (error);
1607
1608         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1609             &parent, sizeof (parent))) != 0) {
1610                 zfs_exit(zfsvfs, FTAG);
1611                 return (error);
1612         }
1613
1614         /*
1615          * If we are not given an eof variable,
1616          * use a local one.
1617          */
1618         if (eofp == NULL)
1619                 eofp = &local_eof;
1620
1621         /*
1622          * Check for valid iov_len.
1623          */
1624         if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
1625                 zfs_exit(zfsvfs, FTAG);
1626                 return (SET_ERROR(EINVAL));
1627         }
1628
1629         /*
1630          * Quit if directory has been removed (posix)
1631          */
1632         if ((*eofp = zp->z_unlinked) != 0) {
1633                 zfs_exit(zfsvfs, FTAG);
1634                 return (0);
1635         }
1636
1637         error = 0;
1638         os = zfsvfs->z_os;
1639         offset = zfs_uio_offset(uio);
1640         prefetch = zp->z_zn_prefetch;
1641         zap = zap_attribute_long_alloc();
1642
1643         /*
1644          * Initialize the iterator cursor.
1645          */
1646         if (offset <= 3) {
1647                 /*
1648                  * Start iteration from the beginning of the directory.
1649                  */
1650                 zap_cursor_init(&zc, os, zp->z_id);
1651         } else {
1652                 /*
1653                  * The offset is a serialized cursor.
1654                  */
1655                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1656         }
1657
1658         /*
1659          * Get space to change directory entries into fs independent format.
1660          */
1661         iovp = GET_UIO_STRUCT(uio)->uio_iov;
1662         bytes_wanted = iovp->iov_len;
1663         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
1664                 bufsize = bytes_wanted;
1665                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
1666                 odp = (struct dirent64 *)outbuf;
1667         } else {
1668                 bufsize = bytes_wanted;
1669                 outbuf = NULL;
1670                 odp = (struct dirent64 *)iovp->iov_base;
1671         }
1672
1673         if (ncookies != NULL) {
1674                 /*
1675                  * Minimum entry size is dirent size and 1 byte for a file name.
1676                  */
1677                 ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
1678                     sizeof (((struct dirent *)NULL)->d_name) + 1);
1679                 cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
1680                 *cookies = cooks;
1681                 *ncookies = ncooks;
1682         }
1683
1684         /*
1685          * Transform to file-system independent format
1686          */
1687         outcount = 0;
1688         while (outcount < bytes_wanted) {
1689                 ino64_t objnum;
1690                 ushort_t reclen;
1691                 off64_t *next = NULL;
1692
1693                 /*
1694                  * Special case `.', `..', and `.zfs'.
1695                  */
1696                 if (offset == 0) {
1697                         (void) strcpy(zap->za_name, ".");
1698                         zap->za_normalization_conflict = 0;
1699                         objnum = zp->z_id;
1700                         type = DT_DIR;
1701                 } else if (offset == 1) {
1702                         (void) strcpy(zap->za_name, "..");
1703                         zap->za_normalization_conflict = 0;
1704                         objnum = parent;
1705                         type = DT_DIR;
1706                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1707                         (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1708                         zap->za_normalization_conflict = 0;
1709                         objnum = ZFSCTL_INO_ROOT;
1710                         type = DT_DIR;
1711                 } else {
1712                         /*
1713                          * Grab next entry.
1714                          */
1715                         if ((error = zap_cursor_retrieve(&zc, zap))) {
1716                                 if ((*eofp = (error == ENOENT)) != 0)
1717                                         break;
1718                                 else
1719                                         goto update;
1720                         }
1721
1722                         if (zap->za_integer_length != 8 ||
1723                             zap->za_num_integers != 1) {
1724                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
1725                                     "entry, obj = %lld, offset = %lld\n",
1726                                     (u_longlong_t)zp->z_id,
1727                                     (u_longlong_t)offset);
1728                                 error = SET_ERROR(ENXIO);
1729                                 goto update;
1730                         }
1731
1732                         objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1733                         /*
1734                          * MacOS X can extract the object type here such as:
1735                          * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1736                          */
1737                         type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1738                 }
1739
1740                 reclen = DIRENT64_RECLEN(strlen(zap->za_name));
1741
1742                 /*
1743                  * Will this entry fit in the buffer?
1744                  */
1745                 if (outcount + reclen > bufsize) {
1746                         /*
1747                          * Did we manage to fit anything in the buffer?
1748                          */
1749                         if (!outcount) {
1750                                 error = SET_ERROR(EINVAL);
1751                                 goto update;
1752                         }
1753                         break;
1754                 }
1755                 /*
1756                  * Add normal entry:
1757                  */
1758                 odp->d_ino = objnum;
1759                 odp->d_reclen = reclen;
1760                 odp->d_namlen = strlen(zap->za_name);
1761                 /* NOTE: d_off is the offset for the *next* entry. */
1762                 next = &odp->d_off;
1763                 strlcpy(odp->d_name, zap->za_name, odp->d_namlen + 1);
1764                 odp->d_type = type;
1765                 dirent_terminate(odp);
1766                 odp = (dirent64_t *)((intptr_t)odp + reclen);
1767
1768                 outcount += reclen;
1769
1770                 ASSERT3S(outcount, <=, bufsize);
1771
1772                 if (prefetch)
1773                         dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1774
1775                 /*
1776                  * Move to the next entry, fill in the previous offset.
1777                  */
1778                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1779                         zap_cursor_advance(&zc);
1780                         offset = zap_cursor_serialize(&zc);
1781                 } else {
1782                         offset += 1;
1783                 }
1784
1785                 /* Fill the offset right after advancing the cursor. */
1786                 if (next != NULL)
1787                         *next = offset;
1788                 if (cooks != NULL) {
1789                         *cooks++ = offset;
1790                         ncooks--;
1791                         KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1792                 }
1793         }
1794         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1795
1796         /* Subtract unused cookies */
1797         if (ncookies != NULL)
1798                 *ncookies -= ncooks;
1799
1800         if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
1801                 iovp->iov_base += outcount;
1802                 iovp->iov_len -= outcount;
1803                 zfs_uio_resid(uio) -= outcount;
1804         } else if ((error =
1805             zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
1806                 /*
1807                  * Reset the pointer.
1808                  */
1809                 offset = zfs_uio_offset(uio);
1810         }
1811
1812 update:
1813         zap_cursor_fini(&zc);
1814         zap_attribute_free(zap);
1815         if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
1816                 kmem_free(outbuf, bufsize);
1817
1818         if (error == ENOENT)
1819                 error = 0;
1820
1821         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1822
1823         zfs_uio_setoffset(uio, offset);
1824         zfs_exit(zfsvfs, FTAG);
1825         if (error != 0 && cookies != NULL) {
1826                 free(*cookies, M_TEMP);
1827                 *cookies = NULL;
1828                 *ncookies = 0;
1829         }
1830         return (error);
1831 }
1832
1833 /*
1834  * Get the requested file attributes and place them in the provided
1835  * vattr structure.
1836  *
1837  *      IN:     vp      - vnode of file.
1838  *              vap     - va_mask identifies requested attributes.
1839  *                        If AT_XVATTR set, then optional attrs are requested
1840  *              flags   - ATTR_NOACLCHECK (CIFS server context)
1841  *              cr      - credentials of caller.
1842  *
1843  *      OUT:    vap     - attribute values.
1844  *
1845  *      RETURN: 0 (always succeeds).
1846  */
1847 static int
1848 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1849 {
1850         znode_t *zp = VTOZ(vp);
1851         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1852         int     error = 0;
1853         uint32_t blksize;
1854         u_longlong_t nblocks;
1855         uint64_t mtime[2], ctime[2], crtime[2], rdev;
1856         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
1857         xoptattr_t *xoap = NULL;
1858         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1859         sa_bulk_attr_t bulk[4];
1860         int count = 0;
1861
1862         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1863                 return (error);
1864
1865         zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
1866
1867         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1868         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1869         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
1870         if (vp->v_type == VBLK || vp->v_type == VCHR)
1871                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1872                     &rdev, 8);
1873
1874         if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
1875                 zfs_exit(zfsvfs, FTAG);
1876                 return (error);
1877         }
1878
1879         /*
1880          * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
1881          * Also, if we are the owner don't bother, since owner should
1882          * always be allowed to read basic attributes of file.
1883          */
1884         if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
1885             (vap->va_uid != crgetuid(cr))) {
1886                 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
1887                     skipaclchk, cr, NULL))) {
1888                         zfs_exit(zfsvfs, FTAG);
1889                         return (error);
1890                 }
1891         }
1892
1893         /*
1894          * Return all attributes.  It's cheaper to provide the answer
1895          * than to determine whether we were asked the question.
1896          */
1897
1898         vap->va_type = IFTOVT(zp->z_mode);
1899         vap->va_mode = zp->z_mode & ~S_IFMT;
1900         vn_fsid(vp, vap);
1901         vap->va_nodeid = zp->z_id;
1902         vap->va_nlink = zp->z_links;
1903         if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
1904             zp->z_links < ZFS_LINK_MAX)
1905                 vap->va_nlink++;
1906         vap->va_size = zp->z_size;
1907         if (vp->v_type == VBLK || vp->v_type == VCHR)
1908                 vap->va_rdev = zfs_cmpldev(rdev);
1909         else
1910                 vap->va_rdev = 0;
1911         vap->va_gen = zp->z_gen;
1912         vap->va_flags = 0;      /* FreeBSD: Reset chflags(2) flags. */
1913         vap->va_filerev = zp->z_seq;
1914
1915         /*
1916          * Add in any requested optional attributes and the create time.
1917          * Also set the corresponding bits in the returned attribute bitmap.
1918          */
1919         if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
1920                 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1921                         xoap->xoa_archive =
1922                             ((zp->z_pflags & ZFS_ARCHIVE) != 0);
1923                         XVA_SET_RTN(xvap, XAT_ARCHIVE);
1924                 }
1925
1926                 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1927                         xoap->xoa_readonly =
1928                             ((zp->z_pflags & ZFS_READONLY) != 0);
1929                         XVA_SET_RTN(xvap, XAT_READONLY);
1930                 }
1931
1932                 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1933                         xoap->xoa_system =
1934                             ((zp->z_pflags & ZFS_SYSTEM) != 0);
1935                         XVA_SET_RTN(xvap, XAT_SYSTEM);
1936                 }
1937
1938                 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1939                         xoap->xoa_hidden =
1940                             ((zp->z_pflags & ZFS_HIDDEN) != 0);
1941                         XVA_SET_RTN(xvap, XAT_HIDDEN);
1942                 }
1943
1944                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1945                         xoap->xoa_nounlink =
1946                             ((zp->z_pflags & ZFS_NOUNLINK) != 0);
1947                         XVA_SET_RTN(xvap, XAT_NOUNLINK);
1948                 }
1949
1950                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1951                         xoap->xoa_immutable =
1952                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
1953                         XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1954                 }
1955
1956                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1957                         xoap->xoa_appendonly =
1958                             ((zp->z_pflags & ZFS_APPENDONLY) != 0);
1959                         XVA_SET_RTN(xvap, XAT_APPENDONLY);
1960                 }
1961
1962                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1963                         xoap->xoa_nodump =
1964                             ((zp->z_pflags & ZFS_NODUMP) != 0);
1965                         XVA_SET_RTN(xvap, XAT_NODUMP);
1966                 }
1967
1968                 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1969                         xoap->xoa_opaque =
1970                             ((zp->z_pflags & ZFS_OPAQUE) != 0);
1971                         XVA_SET_RTN(xvap, XAT_OPAQUE);
1972                 }
1973
1974                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1975                         xoap->xoa_av_quarantined =
1976                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
1977                         XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1978                 }
1979
1980                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1981                         xoap->xoa_av_modified =
1982                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
1983                         XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1984                 }
1985
1986                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
1987                     vp->v_type == VREG) {
1988                         zfs_sa_get_scanstamp(zp, xvap);
1989                 }
1990
1991                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1992                         xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
1993                         XVA_SET_RTN(xvap, XAT_REPARSE);
1994                 }
1995                 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
1996                         xoap->xoa_generation = zp->z_gen;
1997                         XVA_SET_RTN(xvap, XAT_GEN);
1998                 }
1999
2000                 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2001                         xoap->xoa_offline =
2002                             ((zp->z_pflags & ZFS_OFFLINE) != 0);
2003                         XVA_SET_RTN(xvap, XAT_OFFLINE);
2004                 }
2005
2006                 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2007                         xoap->xoa_sparse =
2008                             ((zp->z_pflags & ZFS_SPARSE) != 0);
2009                         XVA_SET_RTN(xvap, XAT_SPARSE);
2010                 }
2011
2012                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2013                         xoap->xoa_projinherit =
2014                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
2015                         XVA_SET_RTN(xvap, XAT_PROJINHERIT);
2016                 }
2017
2018                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2019                         xoap->xoa_projid = zp->z_projid;
2020                         XVA_SET_RTN(xvap, XAT_PROJID);
2021                 }
2022         }
2023
2024         ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2025         ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2026         ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2027         ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2028
2029
2030         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2031         vap->va_blksize = blksize;
2032         vap->va_bytes = nblocks << 9;   /* nblocks * 512 */
2033
2034         if (zp->z_blksz == 0) {
2035                 /*
2036                  * Block size hasn't been set; suggest maximal I/O transfers.
2037                  */
2038                 vap->va_blksize = zfsvfs->z_max_blksz;
2039         }
2040
2041         zfs_exit(zfsvfs, FTAG);
2042         return (0);
2043 }
2044
2045 /*
2046  * Set the file attributes to the values contained in the
2047  * vattr structure.
2048  *
2049  *      IN:     zp      - znode of file to be modified.
2050  *              vap     - new attribute values.
2051  *                        If AT_XVATTR set, then optional attrs are being set
2052  *              flags   - ATTR_UTIME set if non-default time values provided.
2053  *                      - ATTR_NOACLCHECK (CIFS context only).
2054  *              cr      - credentials of caller.
2055  *              mnt_ns  - Unused on FreeBSD
2056  *
2057  *      RETURN: 0 on success, error code on failure.
2058  *
2059  * Timestamps:
2060  *      vp - ctime updated, mtime updated if size changed.
2061  */
2062 int
2063 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
2064 {
2065         vnode_t         *vp = ZTOV(zp);
2066         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2067         objset_t        *os;
2068         zilog_t         *zilog;
2069         dmu_tx_t        *tx;
2070         vattr_t         oldva;
2071         xvattr_t        tmpxvattr;
2072         uint_t          mask = vap->va_mask;
2073         uint_t          saved_mask = 0;
2074         uint64_t        saved_mode;
2075         int             trim_mask = 0;
2076         uint64_t        new_mode;
2077         uint64_t        new_uid, new_gid;
2078         uint64_t        xattr_obj;
2079         uint64_t        mtime[2], ctime[2];
2080         uint64_t        projid = ZFS_INVALID_PROJID;
2081         znode_t         *attrzp;
2082         int             need_policy = FALSE;
2083         int             err, err2;
2084         zfs_fuid_info_t *fuidp = NULL;
2085         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2086         xoptattr_t      *xoap;
2087         zfs_acl_t       *aclp;
2088         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2089         boolean_t       fuid_dirtied = B_FALSE;
2090         sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2091         int             count = 0, xattr_count = 0;
2092
2093         if (mask == 0)
2094                 return (0);
2095
2096         if (mask & AT_NOSET)
2097                 return (SET_ERROR(EINVAL));
2098
2099         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
2100                 return (err);
2101
2102         os = zfsvfs->z_os;
2103         zilog = zfsvfs->z_log;
2104
2105         /*
2106          * Make sure that if we have ephemeral uid/gid or xvattr specified
2107          * that file system is at proper version level
2108          */
2109
2110         if (zfsvfs->z_use_fuids == B_FALSE &&
2111             (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2112             ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2113             (mask & AT_XVATTR))) {
2114                 zfs_exit(zfsvfs, FTAG);
2115                 return (SET_ERROR(EINVAL));
2116         }
2117
2118         if (mask & AT_SIZE && vp->v_type == VDIR) {
2119                 zfs_exit(zfsvfs, FTAG);
2120                 return (SET_ERROR(EISDIR));
2121         }
2122
2123         if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2124                 zfs_exit(zfsvfs, FTAG);
2125                 return (SET_ERROR(EINVAL));
2126         }
2127
2128         /*
2129          * If this is an xvattr_t, then get a pointer to the structure of
2130          * optional attributes.  If this is NULL, then we have a vattr_t.
2131          */
2132         xoap = xva_getxoptattr(xvap);
2133
2134         xva_init(&tmpxvattr);
2135
2136         /*
2137          * Immutable files can only alter immutable bit and atime
2138          */
2139         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2140             ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2141             ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2142                 zfs_exit(zfsvfs, FTAG);
2143                 return (SET_ERROR(EPERM));
2144         }
2145
2146         /*
2147          * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2148          */
2149
2150         /*
2151          * Verify that timestamps don't overflow 32 bits.
2152          * ZFS can handle large timestamps, but 32bit syscalls can't
2153          * handle times greater than 2039.  This check should be removed
2154          * once large timestamps are fully supported.
2155          */
2156         if (mask & (AT_ATIME | AT_MTIME)) {
2157                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2158                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2159                         zfs_exit(zfsvfs, FTAG);
2160                         return (SET_ERROR(EOVERFLOW));
2161                 }
2162         }
2163         if (xoap != NULL && (mask & AT_XVATTR)) {
2164                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2165                     TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2166                         zfs_exit(zfsvfs, FTAG);
2167                         return (SET_ERROR(EOVERFLOW));
2168                 }
2169
2170                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2171                         if (!dmu_objset_projectquota_enabled(os) ||
2172                             (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
2173                                 zfs_exit(zfsvfs, FTAG);
2174                                 return (SET_ERROR(EOPNOTSUPP));
2175                         }
2176
2177                         projid = xoap->xoa_projid;
2178                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
2179                                 zfs_exit(zfsvfs, FTAG);
2180                                 return (SET_ERROR(EINVAL));
2181                         }
2182
2183                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2184                                 projid = ZFS_INVALID_PROJID;
2185                         else
2186                                 need_policy = TRUE;
2187                 }
2188
2189                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2190                     (xoap->xoa_projinherit !=
2191                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2192                     (!dmu_objset_projectquota_enabled(os) ||
2193                     (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
2194                         zfs_exit(zfsvfs, FTAG);
2195                         return (SET_ERROR(EOPNOTSUPP));
2196                 }
2197         }
2198
2199         attrzp = NULL;
2200         aclp = NULL;
2201
2202         if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2203                 zfs_exit(zfsvfs, FTAG);
2204                 return (SET_ERROR(EROFS));
2205         }
2206
2207         /*
2208          * First validate permissions
2209          */
2210
2211         if (mask & AT_SIZE) {
2212                 /*
2213                  * XXX - Note, we are not providing any open
2214                  * mode flags here (like FNDELAY), so we may
2215                  * block if there are locks present... this
2216                  * should be addressed in openat().
2217                  */
2218                 /* XXX - would it be OK to generate a log record here? */
2219                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2220                 if (err) {
2221                         zfs_exit(zfsvfs, FTAG);
2222                         return (err);
2223                 }
2224         }
2225
2226         if (mask & (AT_ATIME|AT_MTIME) ||
2227             ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2228             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2229             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2230             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2231             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2232             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2233             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2234                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2235                     skipaclchk, cr, mnt_ns);
2236         }
2237
2238         if (mask & (AT_UID|AT_GID)) {
2239                 int     idmask = (mask & (AT_UID|AT_GID));
2240                 int     take_owner;
2241                 int     take_group;
2242
2243                 /*
2244                  * NOTE: even if a new mode is being set,
2245                  * we may clear S_ISUID/S_ISGID bits.
2246                  */
2247
2248                 if (!(mask & AT_MODE))
2249                         vap->va_mode = zp->z_mode;
2250
2251                 /*
2252                  * Take ownership or chgrp to group we are a member of
2253                  */
2254
2255                 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2256                 take_group = (mask & AT_GID) &&
2257                     zfs_groupmember(zfsvfs, vap->va_gid, cr);
2258
2259                 /*
2260                  * If both AT_UID and AT_GID are set then take_owner and
2261                  * take_group must both be set in order to allow taking
2262                  * ownership.
2263                  *
2264                  * Otherwise, send the check through secpolicy_vnode_setattr()
2265                  *
2266                  */
2267
2268                 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2269                     ((idmask == AT_UID) && take_owner) ||
2270                     ((idmask == AT_GID) && take_group)) {
2271                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2272                             skipaclchk, cr, mnt_ns) == 0) {
2273                                 /*
2274                                  * Remove setuid/setgid for non-privileged users
2275                                  */
2276                                 secpolicy_setid_clear(vap, vp, cr);
2277                                 trim_mask = (mask & (AT_UID|AT_GID));
2278                         } else {
2279                                 need_policy =  TRUE;
2280                         }
2281                 } else {
2282                         need_policy =  TRUE;
2283                 }
2284         }
2285
2286         oldva.va_mode = zp->z_mode;
2287         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2288         if (mask & AT_XVATTR) {
2289                 /*
2290                  * Update xvattr mask to include only those attributes
2291                  * that are actually changing.
2292                  *
2293                  * the bits will be restored prior to actually setting
2294                  * the attributes so the caller thinks they were set.
2295                  */
2296                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2297                         if (xoap->xoa_appendonly !=
2298                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2299                                 need_policy = TRUE;
2300                         } else {
2301                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2302                                 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2303                         }
2304                 }
2305
2306                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2307                         if (xoap->xoa_projinherit !=
2308                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2309                                 need_policy = TRUE;
2310                         } else {
2311                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2312                                 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
2313                         }
2314                 }
2315
2316                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2317                         if (xoap->xoa_nounlink !=
2318                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2319                                 need_policy = TRUE;
2320                         } else {
2321                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2322                                 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2323                         }
2324                 }
2325
2326                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2327                         if (xoap->xoa_immutable !=
2328                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2329                                 need_policy = TRUE;
2330                         } else {
2331                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2332                                 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2333                         }
2334                 }
2335
2336                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2337                         if (xoap->xoa_nodump !=
2338                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2339                                 need_policy = TRUE;
2340                         } else {
2341                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
2342                                 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2343                         }
2344                 }
2345
2346                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2347                         if (xoap->xoa_av_modified !=
2348                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2349                                 need_policy = TRUE;
2350                         } else {
2351                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2352                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2353                         }
2354                 }
2355
2356                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2357                         if ((vp->v_type != VREG &&
2358                             xoap->xoa_av_quarantined) ||
2359                             xoap->xoa_av_quarantined !=
2360                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2361                                 need_policy = TRUE;
2362                         } else {
2363                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2364                                 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2365                         }
2366                 }
2367
2368                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2369                         zfs_exit(zfsvfs, FTAG);
2370                         return (SET_ERROR(EPERM));
2371                 }
2372
2373                 if (need_policy == FALSE &&
2374                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2375                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2376                         need_policy = TRUE;
2377                 }
2378         }
2379
2380         if (mask & AT_MODE) {
2381                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2382                     mnt_ns) == 0) {
2383                         err = secpolicy_setid_setsticky_clear(vp, vap,
2384                             &oldva, cr);
2385                         if (err) {
2386                                 zfs_exit(zfsvfs, FTAG);
2387                                 return (err);
2388                         }
2389                         trim_mask |= AT_MODE;
2390                 } else {
2391                         need_policy = TRUE;
2392                 }
2393         }
2394
2395         if (need_policy) {
2396                 /*
2397                  * If trim_mask is set then take ownership
2398                  * has been granted or write_acl is present and user
2399                  * has the ability to modify mode.  In that case remove
2400                  * UID|GID and or MODE from mask so that
2401                  * secpolicy_vnode_setattr() doesn't revoke it.
2402                  */
2403
2404                 if (trim_mask) {
2405                         saved_mask = vap->va_mask;
2406                         vap->va_mask &= ~trim_mask;
2407                         if (trim_mask & AT_MODE) {
2408                                 /*
2409                                  * Save the mode, as secpolicy_vnode_setattr()
2410                                  * will overwrite it with ova.va_mode.
2411                                  */
2412                                 saved_mode = vap->va_mode;
2413                         }
2414                 }
2415                 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2416                     (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2417                 if (err) {
2418                         zfs_exit(zfsvfs, FTAG);
2419                         return (err);
2420                 }
2421
2422                 if (trim_mask) {
2423                         vap->va_mask |= saved_mask;
2424                         if (trim_mask & AT_MODE) {
2425                                 /*
2426                                  * Recover the mode after
2427                                  * secpolicy_vnode_setattr().
2428                                  */
2429                                 vap->va_mode = saved_mode;
2430                         }
2431                 }
2432         }
2433
2434         /*
2435          * secpolicy_vnode_setattr, or take ownership may have
2436          * changed va_mask
2437          */
2438         mask = vap->va_mask;
2439
2440         if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
2441                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2442                     &xattr_obj, sizeof (xattr_obj));
2443
2444                 if (err == 0 && xattr_obj) {
2445                         err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2446                         if (err == 0) {
2447                                 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
2448                                 if (err != 0)
2449                                         vrele(ZTOV(attrzp));
2450                         }
2451                         if (err)
2452                                 goto out2;
2453                 }
2454                 if (mask & AT_UID) {
2455                         new_uid = zfs_fuid_create(zfsvfs,
2456                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2457                         if (new_uid != zp->z_uid &&
2458                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2459                             new_uid)) {
2460                                 if (attrzp)
2461                                         vput(ZTOV(attrzp));
2462                                 err = SET_ERROR(EDQUOT);
2463                                 goto out2;
2464                         }
2465                 }
2466
2467                 if (mask & AT_GID) {
2468                         new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2469                             cr, ZFS_GROUP, &fuidp);
2470                         if (new_gid != zp->z_gid &&
2471                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2472                             new_gid)) {
2473                                 if (attrzp)
2474                                         vput(ZTOV(attrzp));
2475                                 err = SET_ERROR(EDQUOT);
2476                                 goto out2;
2477                         }
2478                 }
2479
2480                 if (projid != ZFS_INVALID_PROJID &&
2481                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2482                         if (attrzp)
2483                                 vput(ZTOV(attrzp));
2484                         err = SET_ERROR(EDQUOT);
2485                         goto out2;
2486                 }
2487         }
2488         tx = dmu_tx_create(os);
2489
2490         if (mask & AT_MODE) {
2491                 uint64_t pmode = zp->z_mode;
2492                 uint64_t acl_obj;
2493                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2494
2495                 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
2496                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2497                         err = SET_ERROR(EPERM);
2498                         goto out;
2499                 }
2500
2501                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2502                         goto out;
2503
2504                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2505                         /*
2506                          * Are we upgrading ACL from old V0 format
2507                          * to V1 format?
2508                          */
2509                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2510                             zfs_znode_acl_version(zp) ==
2511                             ZFS_ACL_VERSION_INITIAL) {
2512                                 dmu_tx_hold_free(tx, acl_obj, 0,
2513                                     DMU_OBJECT_END);
2514                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2515                                     0, aclp->z_acl_bytes);
2516                         } else {
2517                                 dmu_tx_hold_write(tx, acl_obj, 0,
2518                                     aclp->z_acl_bytes);
2519                         }
2520                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2521                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2522                             0, aclp->z_acl_bytes);
2523                 }
2524                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2525         } else {
2526                 if (((mask & AT_XVATTR) &&
2527                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2528                     (projid != ZFS_INVALID_PROJID &&
2529                     !(zp->z_pflags & ZFS_PROJID)))
2530                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2531                 else
2532                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2533         }
2534
2535         if (attrzp) {
2536                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2537         }
2538
2539         fuid_dirtied = zfsvfs->z_fuid_dirty;
2540         if (fuid_dirtied)
2541                 zfs_fuid_txhold(zfsvfs, tx);
2542
2543         zfs_sa_upgrade_txholds(tx, zp);
2544
2545         err = dmu_tx_assign(tx, TXG_WAIT);
2546         if (err)
2547                 goto out;
2548
2549         count = 0;
2550         /*
2551          * Set each attribute requested.
2552          * We group settings according to the locks they need to acquire.
2553          *
2554          * Note: you cannot set ctime directly, although it will be
2555          * updated as a side-effect of calling this function.
2556          */
2557
2558         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2559                 /*
2560                  * For an existing object that was upgraded from an old system,
2561                  * its on-disk layout has no slot for the project ID attribute.
2562                  * But quota accounting logic needs to access related slots by
2563                  * offset directly. So we need to adjust old objects' layout
2564                  * to place the project ID at a unified and fixed offset.
2565                  */
2566                 if (attrzp)
2567                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2568                 if (err == 0)
2569                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2570
2571                 if (unlikely(err == EEXIST))
2572                         err = 0;
2573                 else if (err != 0)
2574                         goto out;
2575                 else
2576                         projid = ZFS_INVALID_PROJID;
2577         }
2578
2579         if (mask & (AT_UID|AT_GID|AT_MODE))
2580                 mutex_enter(&zp->z_acl_lock);
2581
2582         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2583             &zp->z_pflags, sizeof (zp->z_pflags));
2584
2585         if (attrzp) {
2586                 if (mask & (AT_UID|AT_GID|AT_MODE))
2587                         mutex_enter(&attrzp->z_acl_lock);
2588                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2589                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2590                     sizeof (attrzp->z_pflags));
2591                 if (projid != ZFS_INVALID_PROJID) {
2592                         attrzp->z_projid = projid;
2593                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2594                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2595                             sizeof (attrzp->z_projid));
2596                 }
2597         }
2598
2599         if (mask & (AT_UID|AT_GID)) {
2600
2601                 if (mask & AT_UID) {
2602                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2603                             &new_uid, sizeof (new_uid));
2604                         zp->z_uid = new_uid;
2605                         if (attrzp) {
2606                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2607                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2608                                     sizeof (new_uid));
2609                                 attrzp->z_uid = new_uid;
2610                         }
2611                 }
2612
2613                 if (mask & AT_GID) {
2614                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2615                             NULL, &new_gid, sizeof (new_gid));
2616                         zp->z_gid = new_gid;
2617                         if (attrzp) {
2618                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2619                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2620                                     sizeof (new_gid));
2621                                 attrzp->z_gid = new_gid;
2622                         }
2623                 }
2624                 if (!(mask & AT_MODE)) {
2625                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2626                             NULL, &new_mode, sizeof (new_mode));
2627                         new_mode = zp->z_mode;
2628                 }
2629                 err = zfs_acl_chown_setattr(zp);
2630                 ASSERT0(err);
2631                 if (attrzp) {
2632                         vn_seqc_write_begin(ZTOV(attrzp));
2633                         err = zfs_acl_chown_setattr(attrzp);
2634                         vn_seqc_write_end(ZTOV(attrzp));
2635                         ASSERT0(err);
2636                 }
2637         }
2638
2639         if (mask & AT_MODE) {
2640                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2641                     &new_mode, sizeof (new_mode));
2642                 zp->z_mode = new_mode;
2643                 ASSERT3P(aclp, !=, NULL);
2644                 err = zfs_aclset_common(zp, aclp, cr, tx);
2645                 ASSERT0(err);
2646                 if (zp->z_acl_cached)
2647                         zfs_acl_free(zp->z_acl_cached);
2648                 zp->z_acl_cached = aclp;
2649                 aclp = NULL;
2650         }
2651
2652
2653         if (mask & AT_ATIME) {
2654                 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
2655                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2656                     &zp->z_atime, sizeof (zp->z_atime));
2657         }
2658
2659         if (mask & AT_MTIME) {
2660                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2661                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2662                     mtime, sizeof (mtime));
2663         }
2664
2665         if (projid != ZFS_INVALID_PROJID) {
2666                 zp->z_projid = projid;
2667                 SA_ADD_BULK_ATTR(bulk, count,
2668                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2669                     sizeof (zp->z_projid));
2670         }
2671
2672         /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2673         if (mask & AT_SIZE && !(mask & AT_MTIME)) {
2674                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
2675                     NULL, mtime, sizeof (mtime));
2676                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2677                     &ctime, sizeof (ctime));
2678                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
2679         } else if (mask != 0) {
2680                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2681                     &ctime, sizeof (ctime));
2682                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
2683                 if (attrzp) {
2684                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2685                             SA_ZPL_CTIME(zfsvfs), NULL,
2686                             &ctime, sizeof (ctime));
2687                         zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2688                             mtime, ctime);
2689                 }
2690         }
2691
2692         /*
2693          * Do this after setting timestamps to prevent timestamp
2694          * update from toggling bit
2695          */
2696
2697         if (xoap && (mask & AT_XVATTR)) {
2698
2699                 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
2700                         xoap->xoa_createtime = vap->va_birthtime;
2701                 /*
2702                  * restore trimmed off masks
2703                  * so that return masks can be set for caller.
2704                  */
2705
2706                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
2707                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
2708                 }
2709                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
2710                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
2711                 }
2712                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
2713                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2714                 }
2715                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
2716                         XVA_SET_REQ(xvap, XAT_NODUMP);
2717                 }
2718                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
2719                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2720                 }
2721                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
2722                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2723                 }
2724                 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
2725                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2726                 }
2727
2728                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2729                         ASSERT3S(vp->v_type, ==, VREG);
2730
2731                 zfs_xvattr_set(zp, xvap, tx);
2732         }
2733
2734         if (fuid_dirtied)
2735                 zfs_fuid_sync(zfsvfs, tx);
2736
2737         if (mask != 0)
2738                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2739
2740         if (mask & (AT_UID|AT_GID|AT_MODE))
2741                 mutex_exit(&zp->z_acl_lock);
2742
2743         if (attrzp) {
2744                 if (mask & (AT_UID|AT_GID|AT_MODE))
2745                         mutex_exit(&attrzp->z_acl_lock);
2746         }
2747 out:
2748         if (err == 0 && attrzp) {
2749                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2750                     xattr_count, tx);
2751                 ASSERT0(err2);
2752         }
2753
2754         if (attrzp)
2755                 vput(ZTOV(attrzp));
2756
2757         if (aclp)
2758                 zfs_acl_free(aclp);
2759
2760         if (fuidp) {
2761                 zfs_fuid_info_free(fuidp);
2762                 fuidp = NULL;
2763         }
2764
2765         if (err) {
2766                 dmu_tx_abort(tx);
2767         } else {
2768                 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2769                 dmu_tx_commit(tx);
2770         }
2771
2772 out2:
2773         if (os->os_sync == ZFS_SYNC_ALWAYS)
2774                 zil_commit(zilog, 0);
2775
2776         zfs_exit(zfsvfs, FTAG);
2777         return (err);
2778 }
2779
2780 /*
2781  * Look up the directory entries corresponding to the source and target
2782  * directory/name pairs.
      *
      *      IN:     sdzp    - Source directory znode.
      *              scnp    - Name of the source entry within sdzp.
      *              tdzp    - Target directory znode.
      *              tcnp    - Name of the target entry within tdzp.
      *      OUT:    szpp    - Receives the source znode once its lookup
      *                        succeeds.
      *              tzpp    - Receives the target lookup result once both
      *                        lookups succeed.  The caller tests it against
      *                        NULL, so a missing target is presumably
      *                        reported as NULL -- confirm in
      *                        zfs_dirent_lookup().
      *
      *      RETURN: 0 on success, error code on failure.
      *
      * The lookups run between zfs_enter_verify_zp() and zfs_exit().  When
      * the target lookup fails, the source znode is released with vrele()
      * although *szpp has already been written, so neither *szpp nor *tzpp
      * may be used after an error return.
2783  */
2784 static int
2785 zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
2786     znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
2787     znode_t **tzpp)
2788 {
2789         zfsvfs_t *zfsvfs;
2790         znode_t *szp, *tzp;
2791         int error;
2792
2793         /*
2794          * Before using sdzp and tdzp we must ensure that they are live.
2795          * As a porting legacy from illumos we have two things to worry
2796          * about.  One is typical for FreeBSD and it is that the vnode is
2797          * not reclaimed (doomed).  The other is that the znode is live.
2798          * The current code can invalidate the znode without acquiring the
2799          * corresponding vnode lock if the object represented by the znode
2800          * and vnode is no longer valid after a rollback or receive operation.
2801          * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock
2802          * that protects the znodes from the invalidation.
2803          */
2804         zfsvfs = sdzp->z_zfsvfs;
2805         ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
2806         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
2807                 return (error);
2808         if ((error = zfs_verify_zp(tdzp)) != 0) {
2809                 zfs_exit(zfsvfs, FTAG);
2810                 return (error);
2811         }
2812
2813         /*
2814          * Re-resolve svp to be certain it still exists and fetch the
2815          * correct vnode.
2816          */
2817         error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS);
2818         if (error != 0) {
2819                 /* Source entry invalid or not there. */
                      /* A source name of ".." or "." is reported as EINVAL. */
2820                 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
2821                     (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
2822                         error = SET_ERROR(EINVAL);
2823                 goto out;
2824         }
2825         *szpp = szp;
2826
2827         /*
2828          * Re-resolve tvp, if it disappeared we just carry on.
              * NOTE(review): this lookup passes 0 instead of ZEXISTS, so a
              * missing target presumably returns success with tzp == NULL --
              * confirm against zfs_dirent_lookup().
2829          */
2830         error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0);
2831         if (error != 0) {
                      /* Release the source znode looked up above. */
2832                 vrele(ZTOV(szp));
                      /* A target name of ".." is reported as EINVAL. */
2833                 if ((tcnp->cn_flags & ISDOTDOT) != 0)
2834                         error = SET_ERROR(EINVAL);
2835                 goto out;
2836         }
2837         *tzpp = tzp;
2838 out:
2839         zfs_exit(zfsvfs, FTAG);
2840         return (error);
2841 }
2842
2843 /*
2844  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
2845  * fail to acquire any lock in the path we will drop all held locks,
2846  * acquire the new lock in a blocking fashion, and then release it and
2847  * restart the rename.  This acquire/release step ensures that we do not
2848  * spin on a lock waiting for release.  On error release all vnode locks
2849  * and decrement references the way tmpfs_rename() would do.
      *
      * The one lock taken with a blocking vn_lock() up front, called fdvp
      * above, is sdvp in this function.
      *
      * On entry the caller holds the lock on tdvp and, if it is non-NULL, on
      * *tvpp; both are dropped first.  On success (return 0) sdvp, tdvp,
      * *svpp and, if non-NULL, *tvpp are locked with LK_EXCLUSIVE, and *svpp
      * and *tvpp have been replaced by the freshly looked-up vnodes (the
      * references previously stored there are released with vrele()).  On
      * failure every lock taken here has been released again.
2850  */
2851 static int
2852 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
2853     struct vnode *tdvp, struct vnode **tvpp,
2854     const struct componentname *scnp, const struct componentname *tcnp)
2855 {
2856         struct vnode    *nvp, *svp, *tvp;
2857         znode_t         *sdzp, *tdzp, *szp, *tzp;
2858         int             error;
2859
              /*
               * Drop the locks held by the caller so that all locks can be
               * re-taken below, beginning with sdvp.
               */
2860         VOP_UNLOCK(tdvp);
2861         if (*tvpp != NULL && *tvpp != tdvp)
2862                 VOP_UNLOCK(*tvpp);
2863
2864 relock:
              /* sdvp is the only lock we are willing to sleep on up front. */
2865         error = vn_lock(sdvp, LK_EXCLUSIVE);
2866         if (error)
2867                 goto out;
2868         error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
2869         if (error != 0) {
2870                 VOP_UNLOCK(sdvp);
2871                 if (error != EBUSY)
2872                         goto out;
                      /*
                       * tdvp was busy: wait for it with nothing else held,
                       * then let go of it and start over.
                       */
2873                 error = vn_lock(tdvp, LK_EXCLUSIVE);
2874                 if (error)
2875                         goto out;
2876                 VOP_UNLOCK(tdvp);
2877                 goto relock;
2878         }
2879         tdzp = VTOZ(tdvp);
2880         sdzp = VTOZ(sdvp);
2881
              /* Both directories are locked; look the two entries up again. */
2882         error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
2883         if (error != 0) {
2884                 VOP_UNLOCK(sdvp);
2885                 VOP_UNLOCK(tdvp);
2886                 goto out;
2887         }
2888         svp = ZTOV(szp);
2889         tvp = tzp != NULL ? ZTOV(tzp) : NULL;
2890
2891         /*
2892          * Now try to acquire locks on svp and tvp.
2893          */
2894         nvp = svp;
2895         error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
2896         if (error != 0) {
2897                 VOP_UNLOCK(sdvp);
2898                 VOP_UNLOCK(tdvp);
                      /*
                       * Every path below either fails or restarts at relock,
                       * where the target is looked up again, so its
                       * reference is dropped now.
                       */
2899                 if (tvp != NULL)
2900                         vrele(tvp);
2901                 if (error != EBUSY) {
2902                         vrele(nvp);
2903                         goto out;
2904                 }
                      /* svp was busy: wait for it, then release it again. */
2905                 error = vn_lock(nvp, LK_EXCLUSIVE);
2906                 if (error != 0) {
2907                         vrele(nvp);
2908                         goto out;
2909                 }
2910                 VOP_UNLOCK(nvp);
2911                 /*
2912                  * Concurrent rename race.
2913                  * XXX ?
2914                  */
2915                 if (nvp == tdvp) {
2916                         vrele(nvp);
2917                         error = SET_ERROR(EINVAL);
2918                         goto out;
2919                 }
                      /*
                       * Store the new source vnode in the caller's slot,
                       * releasing the one kept there before, and retry.
                       */
2920                 vrele(*svpp);
2921                 *svpp = nvp;
2922                 goto relock;
2923         }
              /* svp is locked: it replaces the caller's previous source vnode. */
2924         vrele(*svpp);
2925         *svpp = nvp;
2926
              /*
               * Release the caller's previous target vnode; *tvpp stays NULL
               * unless the lookup found a target that we manage to lock.
               */
2927         if (*tvpp != NULL)
2928                 vrele(*tvpp);
2929         *tvpp = NULL;
2930         if (tvp != NULL) {
2931                 nvp = tvp;
2932                 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
2933                 if (error != 0) {
2934                         VOP_UNLOCK(sdvp);
2935                         VOP_UNLOCK(tdvp);
2936                         VOP_UNLOCK(*svpp);
2937                         if (error != EBUSY) {
2938                                 vrele(nvp);
2939                                 goto out;
2940                         }
                              /* tvp was busy: wait for it before retrying. */
2941                         error = vn_lock(nvp, LK_EXCLUSIVE);
2942                         if (error != 0) {
2943                                 vrele(nvp);
2944                                 goto out;
2945                         }
                              /*
                               * vput() drops both the lock just taken and the
                               * lookup reference; relock looks tvp up again.
                               */
2946                         vput(nvp);
2947                         goto relock;
2948                 }
2949                 *tvpp = nvp;
2950         }
2951
2952         return (0);
2953
2954 out:
2955         return (error);
2956 }
2957
2958 /*
      * Check that moving szp into tdzp would not place a directory underneath
      * itself.  Starting at tdzp, follow the SA_ZPL_PARENT attribute upwards
      * and fail with EINVAL if szp is encountered.  The walk stops without
      * error when the next parent is the filesystem root or sdzp.
      *
      *      IN:     szp     - znode being renamed.
      *              sdzp    - Source directory znode.
      *              tdzp    - Target directory znode.
      *
      *      RETURN: 0 if the rename is acceptable, EINVAL if tdzp is szp or
      *              has szp as an ancestor, otherwise the error returned by
      *              sa_lookup() or zfs_zget().
      *
2959  * Note that we must use VN_RELE_ASYNC in this function as it walks
2960  * up the directory tree and vrele may need to acquire an exclusive
2961  * lock if a last reference to a vnode is dropped.
2962  */
2963 static int
2964 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
2965 {
2966         zfsvfs_t        *zfsvfs;
2967         znode_t         *zp, *zp1;
2968         uint64_t        parent;
2969         int             error;
2970
2971         zfsvfs = tdzp->z_zfsvfs;
              /* Renaming a directory into itself is never valid. */
2972         if (tdzp == szp)
2973                 return (SET_ERROR(EINVAL));
              /* Source and target directory are the same: nothing to walk. */
2974         if (tdzp == sdzp)
2975                 return (0);
              /* The target directory is the filesystem root: nothing to walk. */
2976         if (tdzp->z_id == zfsvfs->z_root)
2977                 return (0);
2978         zp = tdzp;
2979         for (;;) {
2980                 ASSERT(!zp->z_unlinked);
2981                 if ((error = sa_lookup(zp->z_sa_hdl,
2982                     SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
2983                         break;
2984
                      /* szp is an ancestor of tdzp: reject the rename. */
2985                 if (parent == szp->z_id) {
2986                         error = SET_ERROR(EINVAL);
2987                         break;
2988                 }
                      /* Reached the root or the source directory: done. */
2989                 if (parent == zfsvfs->z_root)
2990                         break;
2991                 if (parent == sdzp->z_id)
2992                         break;
2993
2994                 error = zfs_zget(zfsvfs, parent, &zp1);
2995                 if (error != 0)
2996                         break;
2997
                      /*
                       * Release the znode we are stepping away from, unless
                       * it is tdzp, which belongs to the caller.
                       */
2998                 if (zp != tdzp)
2999                         VN_RELE_ASYNC(ZTOV(zp),
3000                             dsl_pool_zrele_taskq(
3001                             dmu_objset_pool(zfsvfs->z_os)));
3002                 zp = zp1;
3003         }
3004
3005         if (error == ENOTDIR)
3006                 panic("checkpath: .. not a directory\n");
              /* Release the last intermediate znode, if the walk left tdzp. */
3007         if (zp != tdzp)
3008                 VN_RELE_ASYNC(ZTOV(zp),
3009                     dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3010         return (error);
3011 }
3012
     /*
      * Forward declaration: zfs_do_rename() calls zfs_do_rename_impl(), which
      * is defined after it in this file.
      */
3013 static int
3014 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3015     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3016     cred_t *cr);
3017
3018 /*
3019  * Move an entry from the provided source directory to the target
3020  * directory.  Change the entry name as indicated.
3021  *
3022  *      IN:     sdvp    - Source directory containing the "old entry".
3023  *              scnp    - Old entry name.
3024  *              tdvp    - Target directory to contain the "new entry".
3025  *              tcnp    - New entry name.
3026  *              cr      - credentials of caller.
3027  *      INOUT:  svpp    - Source file
3028  *              tvpp    - Target file, may point to NULL initially
3029  *
3030  *      RETURN: 0 on success, error code on failure.
3031  *
3032  * Timestamps:
3033  *      sdvp,tdvp - ctime|mtime updated
      *
      * Locking:
      *      On entry tdvp and, if non-NULL, *tvpp must be exclusively locked
      *      (asserted below).  zfs_rename_relock() re-acquires the locks on
      *      sdvp, tdvp, *svpp and *tvpp, and may replace *svpp and *tvpp.
      *      On return the vnodes that were locked at that point have been
      *      unlocked again.
3034  */
3035 static int
3036 zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3037     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3038     cred_t *cr)
3039 {
3040         int     error;
3041
3042         ASSERT_VOP_ELOCKED(tdvp, __func__);
3043         if (*tvpp != NULL)
3044                 ASSERT_VOP_ELOCKED(*tvpp, __func__);
3045
3046         /* Reject renames across filesystems. */
3047         if ((*svpp)->v_mount != tdvp->v_mount ||
3048             ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3049                 error = SET_ERROR(EXDEV);
3050                 goto out;
3051         }
3052
              /* A target directory for which zfsctl_is_node() holds is refused. */
3053         if (zfsctl_is_node(tdvp)) {
3054                 error = SET_ERROR(EXDEV);
3055                 goto out;
3056         }
3057
3058         /*
3059          * Lock all four vnodes to ensure safety and semantics of renaming.
3060          */
3061         error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3062         if (error != 0) {
3063                 /* no vnodes are locked in the case of error here */
3064                 return (error);
3065         }
3066
3067         error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr);
              /* sdvp and *svpp were locked by zfs_rename_relock() above. */
3068         VOP_UNLOCK(sdvp);
3069         VOP_UNLOCK(*svpp);
3070 out:
              /*
               * Unlock the target side; when *tvpp is tdvp itself the vnode
               * is unlocked only once.
               */
3071         if (*tvpp != NULL)
3072                 VOP_UNLOCK(*tvpp);
3073         if (tdvp != *tvpp)
3074                 VOP_UNLOCK(tdvp);
3075
3076         return (error);
3077 }
3078
3079 static int
3080 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3081     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3082     cred_t *cr)
3083 {
3084         dmu_tx_t        *tx;
3085         zfsvfs_t        *zfsvfs;
3086         zilog_t         *zilog;
3087         znode_t         *tdzp, *sdzp, *tzp, *szp;
3088         const char      *snm = scnp->cn_nameptr;
3089         const char      *tnm = tcnp->cn_nameptr;
3090         int             error;
3091
3092         tdzp = VTOZ(tdvp);
3093         sdzp = VTOZ(sdvp);
3094         zfsvfs = tdzp->z_zfsvfs;
3095
3096         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3097                 return (error);
3098         if ((error = zfs_verify_zp(sdzp)) != 0) {
3099                 zfs_exit(zfsvfs, FTAG);
3100                 return (error);
3101         }
3102         zilog = zfsvfs->z_log;
3103
3104         if (zfsvfs->z_utf8 && u8_validate(tnm,
3105             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3106                 error = SET_ERROR(EILSEQ);
3107                 goto out;
3108         }
3109
3110         /* If source and target are the same file, there is nothing to do. */
3111         if ((*svpp) == (*tvpp)) {
3112                 error = 0;
3113                 goto out;
3114         }
3115
3116         if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3117             ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3118             (*tvpp)->v_mountedhere != NULL)) {
3119                 error = SET_ERROR(EXDEV);
3120                 goto out;
3121         }
3122
3123         szp = VTOZ(*svpp);
3124         if ((error = zfs_verify_zp(szp)) != 0) {
3125                 zfs_exit(zfsvfs, FTAG);
3126                 return (error);
3127         }
3128         tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3129         if (tzp != NULL) {
3130                 if ((error = zfs_verify_zp(tzp)) != 0) {
3131                         zfs_exit(zfsvfs, FTAG);
3132                         return (error);
3133                 }
3134         }
3135
3136         /*
3137          * This is to prevent the creation of links into attribute space
3138          * by renaming a linked file into/outof an attribute directory.
3139          * See the comment in zfs_link() for why this is considered bad.
3140          */
3141         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3142                 error = SET_ERROR(EINVAL);
3143                 goto out;
3144         }
3145
3146         /*
3147          * If we are using project inheritance, i.e. the directory has
3148          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3149          * not only the project ID, but also the ZFS_PROJINHERIT flag. In
3150          * that case, we only allow renames into our tree when the project
3151          * IDs are the same.
3152          */
3153         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3154             tdzp->z_projid != szp->z_projid) {
3155                 error = SET_ERROR(EXDEV);
3156                 goto out;
3157         }
3158
3159         /*
3160          * Must have write access at the source to remove the old entry
3161          * and write access at the target to create the new entry.
3162          * Note that if target and source are the same, this can be
3163          * done in a single check.
3164          */
3165         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
3166                 goto out;
3167
3168         if ((*svpp)->v_type == VDIR) {
3169                 /*
3170                  * Avoid ".", "..", and aliases of "." for obvious reasons.
3171                  */
3172                 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3173                     sdzp == szp ||
3174                     (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3175                         error = EINVAL;
3176                         goto out;
3177                 }
3178
3179                 /*
3180                  * Check to make sure rename is valid.
3181                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3182                  */
3183                 if ((error = zfs_rename_check(szp, sdzp, tdzp)))
3184                         goto out;
3185         }
3186
3187         /*
3188          * Does target exist?
3189          */
3190         if (tzp) {
3191                 /*
3192                  * Source and target must be the same type.
3193                  */
3194                 if ((*svpp)->v_type == VDIR) {
3195                         if ((*tvpp)->v_type != VDIR) {
3196                                 error = SET_ERROR(ENOTDIR);
3197                                 goto out;
3198                         } else {
3199                                 cache_purge(tdvp);
3200                                 if (sdvp != tdvp)
3201                                         cache_purge(sdvp);
3202                         }
3203                 } else {
3204                         if ((*tvpp)->v_type == VDIR) {
3205                                 error = SET_ERROR(EISDIR);
3206                                 goto out;
3207                         }
3208                 }
3209         }
3210
3211         vn_seqc_write_begin(*svpp);
3212         vn_seqc_write_begin(sdvp);
3213         if (*tvpp != NULL)
3214                 vn_seqc_write_begin(*tvpp);
3215         if (tdvp != *tvpp)
3216                 vn_seqc_write_begin(tdvp);
3217
3218         vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3219         if (tzp)
3220                 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3221
3222         /*
3223          * notify the target directory if it is not the same
3224          * as source directory.
3225          */
3226         if (tdvp != sdvp) {
3227                 vnevent_rename_dest_dir(tdvp, ct);
3228         }
3229
3230         tx = dmu_tx_create(zfsvfs->z_os);
3231         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3232         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3233         dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3234         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3235         if (sdzp != tdzp) {
3236                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3237                 zfs_sa_upgrade_txholds(tx, tdzp);
3238         }
3239         if (tzp) {
3240                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3241                 zfs_sa_upgrade_txholds(tx, tzp);
3242         }
3243
3244         zfs_sa_upgrade_txholds(tx, szp);
3245         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3246         error = dmu_tx_assign(tx, TXG_WAIT);
3247         if (error) {
3248                 dmu_tx_abort(tx);
3249                 goto out_seq;
3250         }
3251
3252         if (tzp)        /* Attempt to remove the existing target */
3253                 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3254
3255         if (error == 0) {
3256                 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3257                 if (error == 0) {
3258                         szp->z_pflags |= ZFS_AV_MODIFIED;
3259
3260                         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3261                             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3262                         ASSERT0(error);
3263
3264                         error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3265                             NULL);
3266                         if (error == 0) {
3267                                 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3268                                     snm, tdzp, tnm, szp);
3269                         } else {
3270                                 /*
3271                                  * At this point, we have successfully created
3272                                  * the target name, but have failed to remove
3273                                  * the source name.  Since the create was done
3274                                  * with the ZRENAMING flag, there are
3275                                  * complications; for one, the link count is
3276                                  * wrong.  The easiest way to deal with this
3277                                  * is to remove the newly created target, and
3278                                  * return the original error.  This must
3279                                  * succeed; fortunately, it is very unlikely to
3280                                  * fail, since we just created it.
3281                                  */
3282                                 VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
3283                                     ZRENAMING, NULL));
3284                         }
3285                 }
3286                 if (error == 0) {
3287                         cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
3288                 }
3289         }
3290
3291         dmu_tx_commit(tx);
3292
3293 out_seq:
3294         vn_seqc_write_end(*svpp);
3295         vn_seqc_write_end(sdvp);
3296         if (*tvpp != NULL)
3297                 vn_seqc_write_end(*tvpp);
3298         if (tdvp != *tvpp)
3299                 vn_seqc_write_end(tdvp);
3300
3301 out:
3302         if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3303                 zil_commit(zilog, 0);
3304         zfs_exit(zfsvfs, FTAG);
3305
3306         return (error);
3307 }
3308
3309 int
3310 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
3311     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
3312 {
3313         struct componentname scn, tcn;
3314         vnode_t *sdvp, *tdvp;
3315         vnode_t *svp, *tvp;
3316         int error;
3317         svp = tvp = NULL;
3318
3319         if (is_nametoolong(tdzp->z_zfsvfs, tname))
3320                 return (SET_ERROR(ENAMETOOLONG));
3321
3322         if (rflags != 0 || wo_vap != NULL)
3323                 return (SET_ERROR(EINVAL));
3324
3325         sdvp = ZTOV(sdzp);
3326         tdvp = ZTOV(tdzp);
3327         error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
3328         if (sdzp->z_zfsvfs->z_replay == B_FALSE)
3329                 VOP_UNLOCK(sdvp);
3330         if (error != 0)
3331                 goto fail;
3332         VOP_UNLOCK(svp);
3333
3334         vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
3335         error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
3336         if (error == EJUSTRETURN)
3337                 tvp = NULL;
3338         else if (error != 0) {
3339                 VOP_UNLOCK(tdvp);
3340                 goto fail;
3341         }
3342
3343         error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr);
3344 fail:
3345         if (svp != NULL)
3346                 vrele(svp);
3347         if (tvp != NULL)
3348                 vrele(tvp);
3349
3350         return (error);
3351 }
3352
3353 /*
3354  * Insert the indicated symbolic reference entry into the directory.
3355  *
3356  *      IN:     dvp     - Directory to contain new symbolic link.
3357  *              link    - Name for new symlink entry.
3358  *              vap     - Attributes of new entry.
3359  *              cr      - credentials of caller.
3360  *              ct      - caller context
3361  *              flags   - case flags
3362  *              mnt_ns  - Unused on FreeBSD
3363  *
3364  *      RETURN: 0 on success, error code on failure.
3365  *
3366  * Timestamps:
3367  *      dvp - ctime|mtime updated
3368  */
3369 int
3370 zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
3371     const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3372 {
3373         (void) flags;
3374         znode_t         *zp;
3375         dmu_tx_t        *tx;
3376         zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3377         zilog_t         *zilog;
3378         uint64_t        len = strlen(link);
3379         int             error;
3380         zfs_acl_ids_t   acl_ids;
3381         boolean_t       fuid_dirtied;
3382         uint64_t        txtype = TX_SYMLINK;
3383
3384         ASSERT3S(vap->va_type, ==, VLNK);
3385
3386         if (is_nametoolong(zfsvfs, name))
3387                 return (SET_ERROR(ENAMETOOLONG));
3388
3389         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3390                 return (error);
3391         zilog = zfsvfs->z_log;
3392
3393         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3394             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3395                 zfs_exit(zfsvfs, FTAG);
3396                 return (SET_ERROR(EILSEQ));
3397         }
3398
3399         if (len > MAXPATHLEN) {
3400                 zfs_exit(zfsvfs, FTAG);
3401                 return (SET_ERROR(ENAMETOOLONG));
3402         }
3403
3404         if ((error = zfs_acl_ids_create(dzp, 0,
3405             vap, cr, NULL, &acl_ids, NULL)) != 0) {
3406                 zfs_exit(zfsvfs, FTAG);
3407                 return (error);
3408         }
3409
3410         /*
3411          * Attempt to lock directory; fail if entry already exists.
3412          */
3413         error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
3414         if (error) {
3415                 zfs_acl_ids_free(&acl_ids);
3416                 zfs_exit(zfsvfs, FTAG);
3417                 return (error);
3418         }
3419
3420         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3421                 zfs_acl_ids_free(&acl_ids);
3422                 zfs_exit(zfsvfs, FTAG);
3423                 return (error);
3424         }
3425
3426         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
3427             0 /* projid */)) {
3428                 zfs_acl_ids_free(&acl_ids);
3429                 zfs_exit(zfsvfs, FTAG);
3430                 return (SET_ERROR(EDQUOT));
3431         }
3432
3433         getnewvnode_reserve();
3434         tx = dmu_tx_create(zfsvfs->z_os);
3435         fuid_dirtied = zfsvfs->z_fuid_dirty;
3436         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3437         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3438         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3439             ZFS_SA_BASE_ATTR_SIZE + len);
3440         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3441         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3442                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3443                     acl_ids.z_aclp->z_acl_bytes);
3444         }
3445         if (fuid_dirtied)
3446                 zfs_fuid_txhold(zfsvfs, tx);
3447         error = dmu_tx_assign(tx, TXG_WAIT);
3448         if (error) {
3449                 zfs_acl_ids_free(&acl_ids);
3450                 dmu_tx_abort(tx);
3451                 getnewvnode_drop_reserve();
3452                 zfs_exit(zfsvfs, FTAG);
3453                 return (error);
3454         }
3455
3456         /*
3457          * Create a new object for the symlink.
3458          * for version 4 ZPL datasets the symlink will be an SA attribute
3459          */
3460         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3461
3462         if (fuid_dirtied)
3463                 zfs_fuid_sync(zfsvfs, tx);
3464
3465         if (zp->z_is_sa)
3466                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3467                     __DECONST(void *, link), len, tx);
3468         else
3469                 zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
3470
3471         zp->z_size = len;
3472         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3473             &zp->z_size, sizeof (zp->z_size), tx);
3474         /*
3475          * Insert the new object into the directory.
3476          */
3477         error = zfs_link_create(dzp, name, zp, tx, ZNEW);
3478         if (error != 0) {
3479                 zfs_znode_delete(zp, tx);
3480                 VOP_UNLOCK(ZTOV(zp));
3481                 zrele(zp);
3482         } else {
3483                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3484         }
3485
3486         zfs_acl_ids_free(&acl_ids);
3487
3488         dmu_tx_commit(tx);
3489
3490         getnewvnode_drop_reserve();
3491
3492         if (error == 0) {
3493                 *zpp = zp;
3494
3495                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3496                         zil_commit(zilog, 0);
3497         }
3498
3499         zfs_exit(zfsvfs, FTAG);
3500         return (error);
3501 }
3502
3503 /*
3504  * Return, in the buffer contained in the provided uio structure,
3505  * the symbolic path referred to by vp.
3506  *
3507  *      IN:     vp      - vnode of symbolic link.
3508  *              uio     - structure to contain the link path.
3509  *              cr      - credentials of caller.
3510  *              ct      - caller context
3511  *
3512  *      OUT:    uio     - structure containing the link path.
3513  *
3514  *      RETURN: 0 on success, error code on failure.
3515  *
3516  * Timestamps:
3517  *      vp - atime updated
3518  */
3519 static int
3520 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
3521 {
3522         (void) cr, (void) ct;
3523         znode_t         *zp = VTOZ(vp);
3524         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3525         int             error;
3526
3527         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3528                 return (error);
3529
3530         if (zp->z_is_sa)
3531                 error = sa_lookup_uio(zp->z_sa_hdl,
3532                     SA_ZPL_SYMLINK(zfsvfs), uio);
3533         else
3534                 error = zfs_sa_readlink(zp, uio);
3535
3536         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3537
3538         zfs_exit(zfsvfs, FTAG);
3539         return (error);
3540 }
3541
3542 /*
3543  * Insert a new entry into directory tdvp referencing svp.
3544  *
3545  *      IN:     tdvp    - Directory to contain new entry.
3546  *              svp     - vnode of new entry.
3547  *              name    - name of new entry.
3548  *              cr      - credentials of caller.
3549  *
3550  *      RETURN: 0 on success, error code on failure.
3551  *
3552  * Timestamps:
3553  *      tdvp - ctime|mtime updated
3554  *       svp - ctime updated
3555  */
3556 int
3557 zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
3558     int flags)
3559 {
3560         (void) flags;
3561         znode_t         *tzp;
3562         zfsvfs_t        *zfsvfs = tdzp->z_zfsvfs;
3563         zilog_t         *zilog;
3564         dmu_tx_t        *tx;
3565         int             error;
3566         uint64_t        parent;
3567         uid_t           owner;
3568
3569         ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);
3570
3571         if (is_nametoolong(zfsvfs, name))
3572                 return (SET_ERROR(ENAMETOOLONG));
3573
3574         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3575                 return (error);
3576         zilog = zfsvfs->z_log;
3577
3578         /*
3579          * POSIX dictates that we return EPERM here.
3580          * Better choices include ENOTSUP or EISDIR.
3581          */
3582         if (ZTOV(szp)->v_type == VDIR) {
3583                 zfs_exit(zfsvfs, FTAG);
3584                 return (SET_ERROR(EPERM));
3585         }
3586
3587         if ((error = zfs_verify_zp(szp)) != 0) {
3588                 zfs_exit(zfsvfs, FTAG);
3589                 return (error);
3590         }
3591
3592         /*
3593          * If we are using project inheritance, means if the directory has
3594          * ZFS_PROJINHERIT set, then its descendant directories will inherit
3595          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3596          * such case, we only allow hard link creation in our tree when the
3597          * project IDs are the same.
3598          */
3599         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3600             tdzp->z_projid != szp->z_projid) {
3601                 zfs_exit(zfsvfs, FTAG);
3602                 return (SET_ERROR(EXDEV));
3603         }
3604
3605         if (szp->z_pflags & (ZFS_APPENDONLY |
3606             ZFS_IMMUTABLE | ZFS_READONLY)) {
3607                 zfs_exit(zfsvfs, FTAG);
3608                 return (SET_ERROR(EPERM));
3609         }
3610
3611         /* Prevent links to .zfs/shares files */
3612
3613         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3614             &parent, sizeof (uint64_t))) != 0) {
3615                 zfs_exit(zfsvfs, FTAG);
3616                 return (error);
3617         }
3618         if (parent == zfsvfs->z_shares_dir) {
3619                 zfs_exit(zfsvfs, FTAG);
3620                 return (SET_ERROR(EPERM));
3621         }
3622
3623         if (zfsvfs->z_utf8 && u8_validate(name,
3624             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3625                 zfs_exit(zfsvfs, FTAG);
3626                 return (SET_ERROR(EILSEQ));
3627         }
3628
3629         /*
3630          * We do not support links between attributes and non-attributes
3631          * because of the potential security risk of creating links
3632          * into "normal" file space in order to circumvent restrictions
3633          * imposed in attribute space.
3634          */
3635         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3636                 zfs_exit(zfsvfs, FTAG);
3637                 return (SET_ERROR(EINVAL));
3638         }
3639
3640
3641         owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3642         if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
3643                 zfs_exit(zfsvfs, FTAG);
3644                 return (SET_ERROR(EPERM));
3645         }
3646
3647         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) {
3648                 zfs_exit(zfsvfs, FTAG);
3649                 return (error);
3650         }
3651
3652         /*
3653          * Attempt to lock directory; fail if entry already exists.
3654          */
3655         error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
3656         if (error) {
3657                 zfs_exit(zfsvfs, FTAG);
3658                 return (error);
3659         }
3660
3661         tx = dmu_tx_create(zfsvfs->z_os);
3662         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3663         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3664         zfs_sa_upgrade_txholds(tx, szp);
3665         zfs_sa_upgrade_txholds(tx, tdzp);
3666         error = dmu_tx_assign(tx, TXG_WAIT);
3667         if (error) {
3668                 dmu_tx_abort(tx);
3669                 zfs_exit(zfsvfs, FTAG);
3670                 return (error);
3671         }
3672
3673         error = zfs_link_create(tdzp, name, szp, tx, 0);
3674
3675         if (error == 0) {
3676                 uint64_t txtype = TX_LINK;
3677                 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3678         }
3679
3680         dmu_tx_commit(tx);
3681
3682         if (error == 0) {
3683                 vnevent_link(ZTOV(szp), ct);
3684         }
3685
3686         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3687                 zil_commit(zilog, 0);
3688
3689         zfs_exit(zfsvfs, FTAG);
3690         return (error);
3691 }
3692
3693 /*
3694  * Free or allocate space in a file.  Currently, this function only
3695  * supports the `F_FREESP' command.  However, this command is somewhat
3696  * misnamed, as its functionality includes the ability to allocate as
3697  * well as free space.
3698  *
3699  *      IN:     ip      - inode of file to free data in.
3700  *              cmd     - action to take (only F_FREESP supported).
3701  *              bfp     - section of file to free/alloc.
3702  *              flag    - current file open mode flags.
3703  *              offset  - current file offset.
3704  *              cr      - credentials of caller.
3705  *
3706  *      RETURN: 0 on success, error code on failure.
3707  *
3708  * Timestamps:
3709  *      ip - ctime|mtime updated
3710  */
3711 int
3712 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3713     offset_t offset, cred_t *cr)
3714 {
3715         (void) offset;
3716         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
3717         uint64_t        off, len;
3718         int             error;
3719
3720         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3721                 return (error);
3722
3723         if (cmd != F_FREESP) {
3724                 zfs_exit(zfsvfs, FTAG);
3725                 return (SET_ERROR(EINVAL));
3726         }
3727
3728         /*
3729          * Callers might not be able to detect properly that we are read-only,
3730          * so check it explicitly here.
3731          */
3732         if (zfs_is_readonly(zfsvfs)) {
3733                 zfs_exit(zfsvfs, FTAG);
3734                 return (SET_ERROR(EROFS));
3735         }
3736
3737         if (bfp->l_len < 0) {
3738                 zfs_exit(zfsvfs, FTAG);
3739                 return (SET_ERROR(EINVAL));
3740         }
3741
3742         /*
3743          * Permissions aren't checked on Solaris because on this OS
3744          * zfs_space() can only be called with an opened file handle.
3745          * On Linux we can get here through truncate_range() which
3746          * operates directly on inodes, so we need to check access rights.
3747          */
3748         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) {
3749                 zfs_exit(zfsvfs, FTAG);
3750                 return (error);
3751         }
3752
3753         off = bfp->l_start;
3754         len = bfp->l_len; /* 0 means from off to end of file */
3755
3756         error = zfs_freesp(zp, off, len, flag, TRUE);
3757
3758         zfs_exit(zfsvfs, FTAG);
3759         return (error);
3760 }
3761
3762 static void
3763 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3764 {
3765         (void) cr, (void) ct;
3766         znode_t *zp = VTOZ(vp);
3767         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3768         int error;
3769
3770         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
3771         if (zp->z_sa_hdl == NULL) {
3772                 /*
3773                  * The fs has been unmounted, or we did a
3774                  * suspend/resume and this file no longer exists.
3775                  */
3776                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3777                 vrecycle(vp);
3778                 return;
3779         }
3780
3781         if (zp->z_unlinked) {
3782                 /*
3783                  * Fast path to recycle a vnode of a removed file.
3784                  */
3785                 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3786                 vrecycle(vp);
3787                 return;
3788         }
3789
3790         if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3791                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3792
3793                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3794                 zfs_sa_upgrade_txholds(tx, zp);
3795                 error = dmu_tx_assign(tx, TXG_WAIT);
3796                 if (error) {
3797                         dmu_tx_abort(tx);
3798                 } else {
3799                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
3800                             (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
3801                         zp->z_atime_dirty = 0;
3802                         dmu_tx_commit(tx);
3803                 }
3804         }
3805         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
3806 }
3807
3808
/*
 * zfs_fid() fills in a zfid_short_t or zfid_long_t through a pointer cast
 * from its fid_t argument, so both layouts have to fit inside the generic
 * file-handle structure (assumes fid_t is struct fid -- TODO confirm).
 */
_Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
        "struct zfid_short bigger than struct fid");
_Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
        "struct zfid_long bigger than struct fid");
3813
3814 static int
3815 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3816 {
3817         (void) ct;
3818         znode_t         *zp = VTOZ(vp);
3819         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3820         uint32_t        gen;
3821         uint64_t        gen64;
3822         uint64_t        object = zp->z_id;
3823         zfid_short_t    *zfid;
3824         int             size, i, error;
3825
3826         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3827                 return (error);
3828
3829         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
3830             &gen64, sizeof (uint64_t))) != 0) {
3831                 zfs_exit(zfsvfs, FTAG);
3832                 return (error);
3833         }
3834
3835         gen = (uint32_t)gen64;
3836
3837         size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3838         fidp->fid_len = size;
3839
3840         zfid = (zfid_short_t *)fidp;
3841
3842         zfid->zf_len = size;
3843
3844         for (i = 0; i < sizeof (zfid->zf_object); i++)
3845                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3846
3847         /* Must have a non-zero generation number to distinguish from .zfs */
3848         if (gen == 0)
3849                 gen = 1;
3850         for (i = 0; i < sizeof (zfid->zf_gen); i++)
3851                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3852
3853         if (size == LONG_FID_LEN) {
3854                 uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
3855                 zfid_long_t     *zlfid;
3856
3857                 zlfid = (zfid_long_t *)fidp;
3858
3859                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3860                         zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3861
3862                 /* XXX - this should be the generation number for the objset */
3863                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3864                         zlfid->zf_setgen[i] = 0;
3865         }
3866
3867         zfs_exit(zfsvfs, FTAG);
3868         return (0);
3869 }
3870
3871 static int
3872 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3873     caller_context_t *ct)
3874 {
3875         znode_t *zp;
3876         zfsvfs_t *zfsvfs;
3877         int error;
3878
3879         switch (cmd) {
3880         case _PC_LINK_MAX:
3881                 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
3882                 return (0);
3883
3884         case _PC_FILESIZEBITS:
3885                 *valp = 64;
3886                 return (0);
3887         case _PC_MIN_HOLE_SIZE:
3888                 *valp = (int)SPA_MINBLOCKSIZE;
3889                 return (0);
3890         case _PC_ACL_EXTENDED:
3891 #if 0           /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
3892                 zp = VTOZ(vp);
3893                 zfsvfs = zp->z_zfsvfs;
3894                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3895                         return (error);
3896                 *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
3897                 zfs_exit(zfsvfs, FTAG);
3898 #else
3899                 *valp = 0;
3900 #endif
3901                 return (0);
3902
3903         case _PC_ACL_NFS4:
3904                 zp = VTOZ(vp);
3905                 zfsvfs = zp->z_zfsvfs;
3906                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3907                         return (error);
3908                 *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
3909                 zfs_exit(zfsvfs, FTAG);
3910                 return (0);
3911
3912         case _PC_ACL_PATH_MAX:
3913                 *valp = ACL_MAX_ENTRIES;
3914                 return (0);
3915
3916         default:
3917                 return (EOPNOTSUPP);
3918         }
3919 }
3920
3921 static int
3922 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
3923     int *rahead)
3924 {
3925         znode_t *zp = VTOZ(vp);
3926         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3927         zfs_locked_range_t *lr;
3928         vm_object_t object;
3929         off_t start, end, obj_size;
3930         uint_t blksz;
3931         int pgsin_b, pgsin_a;
3932         int error;
3933
3934         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
3935                 return (zfs_vm_pagerret_error);
3936
3937         object = ma[0]->object;
3938         start = IDX_TO_OFF(ma[0]->pindex);
3939         end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
3940
3941         /*
3942          * Lock a range covering all required and optional pages.
3943          * Note that we need to handle the case of the block size growing.
3944          */
3945         for (;;) {
3946                 uint64_t len;
3947
3948                 blksz = zp->z_blksz;
3949                 len = roundup(end, blksz) - rounddown(start, blksz);
3950
3951                 lr = zfs_rangelock_tryenter(&zp->z_rangelock,
3952                     rounddown(start, blksz), len, RL_READER);
3953                 if (lr == NULL) {
3954                         /*
3955                          * Avoid a deadlock with update_pages().  We need to
3956                          * hold the range lock when copying from the DMU, so
3957                          * give up the busy lock to allow update_pages() to
3958                          * proceed.  We might need to allocate new pages, which
3959                          * isn't quite right since this allocation isn't subject
3960                          * to the page fault handler's OOM logic, but this is
3961                          * the best we can do for now.
3962                          */
3963                         for (int i = 0; i < count; i++) {
3964                                 ASSERT(vm_page_none_valid(ma[i]));
3965                                 vm_page_xunbusy(ma[i]);
3966                         }
3967
3968                         lr = zfs_rangelock_enter(&zp->z_rangelock,
3969                             rounddown(start, blksz), len, RL_READER);
3970
3971                         zfs_vmobject_wlock(object);
3972                         (void) vm_page_grab_pages(object, OFF_TO_IDX(start),
3973                             VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO,
3974                             ma, count);
3975                         zfs_vmobject_wunlock(object);
3976                 }
3977                 if (blksz == zp->z_blksz)
3978                         break;
3979                 zfs_rangelock_exit(lr);
3980         }
3981
3982         zfs_vmobject_wlock(object);
3983         obj_size = object->un_pager.vnp.vnp_size;
3984         zfs_vmobject_wunlock(object);
3985         if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
3986                 zfs_rangelock_exit(lr);
3987                 zfs_exit(zfsvfs, FTAG);
3988                 return (zfs_vm_pagerret_bad);
3989         }
3990
3991         pgsin_b = 0;
3992         if (rbehind != NULL) {
3993                 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
3994                 pgsin_b = MIN(*rbehind, pgsin_b);
3995         }
3996
3997         pgsin_a = 0;
3998         if (rahead != NULL) {
3999                 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
4000                 if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
4001                         pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
4002                 pgsin_a = MIN(*rahead, pgsin_a);
4003         }
4004
4005         /*
4006          * NB: we need to pass the exact byte size of the data that we expect
4007          * to read after accounting for the file size.  This is required because
4008          * ZFS will panic if we request DMU to read beyond the end of the last
4009          * allocated block.
4010          */
4011         for (int i = 0; i < count; i++) {
4012                 int dummypgsin, count1, j, last_size;
4013
4014                 if (vm_page_any_valid(ma[i])) {
4015                         ASSERT(vm_page_all_valid(ma[i]));
4016                         continue;
4017                 }
4018                 for (j = i + 1; j < count; j++) {
4019                         if (vm_page_any_valid(ma[j])) {
4020                                 ASSERT(vm_page_all_valid(ma[j]));
4021                                 break;
4022                         }
4023                 }
4024                 count1 = j - i;
4025                 dummypgsin = 0;
4026                 last_size = j == count ?
4027                     MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE;
4028                 error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1,
4029                     i == 0 ? &pgsin_b : &dummypgsin,
4030                     j == count ? &pgsin_a : &dummypgsin,
4031                     last_size);
4032                 if (error != 0)
4033                         break;
4034                 i += count1 - 1;
4035         }
4036
4037         zfs_rangelock_exit(lr);
4038         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4039
4040         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
4041
4042         zfs_exit(zfsvfs, FTAG);
4043
4044         if (error != 0)
4045                 return (zfs_vm_pagerret_error);
4046
4047         VM_CNT_INC(v_vnodein);
4048         VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
4049         if (rbehind != NULL)
4050                 *rbehind = pgsin_b;
4051         if (rahead != NULL)
4052                 *rahead = pgsin_a;
4053         return (zfs_vm_pagerret_ok);
4054 }
4055
4056 #ifndef _SYS_SYSPROTO_H_
4057 struct vop_getpages_args {
4058         struct vnode *a_vp;
4059         vm_page_t *a_m;
4060         int a_count;
4061         int *a_rbehind;
4062         int *a_rahead;
4063 };
4064 #endif
4065
4066 static int
4067 zfs_freebsd_getpages(struct vop_getpages_args *ap)
4068 {
4069
4070         return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4071             ap->a_rahead));
4072 }
4073
4074 static int
4075 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4076     int *rtvals)
4077 {
4078         znode_t         *zp = VTOZ(vp);
4079         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4080         zfs_locked_range_t              *lr;
4081         dmu_tx_t        *tx;
4082         struct sf_buf   *sf;
4083         vm_object_t     object;
4084         vm_page_t       m;
4085         caddr_t         va;
4086         size_t          tocopy;
4087         size_t          lo_len;
4088         vm_ooffset_t    lo_off;
4089         vm_ooffset_t    off;
4090         uint_t          blksz;
4091         int             ncount;
4092         int             pcount;
4093         int             err;
4094         int             i;
4095
4096         object = vp->v_object;
4097         KASSERT(ma[0]->object == object, ("mismatching object"));
4098         KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4099
4100         pcount = btoc(len);
4101         ncount = pcount;
4102         for (i = 0; i < pcount; i++)
4103                 rtvals[i] = zfs_vm_pagerret_error;
4104
4105         if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
4106                 return (zfs_vm_pagerret_error);
4107
4108         off = IDX_TO_OFF(ma[0]->pindex);
4109         blksz = zp->z_blksz;
4110         lo_off = rounddown(off, blksz);
4111         lo_len = roundup(len + (off - lo_off), blksz);
4112         lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
4113
4114         zfs_vmobject_wlock(object);
4115         if (len + off > object->un_pager.vnp.vnp_size) {
4116                 if (object->un_pager.vnp.vnp_size > off) {
4117                         int pgoff;
4118
4119                         len = object->un_pager.vnp.vnp_size - off;
4120                         ncount = btoc(len);
4121                         if ((pgoff = (int)len & PAGE_MASK) != 0) {
4122                                 /*
4123                                  * If the object is locked and the following
4124                                  * conditions hold, then the page's dirty
4125                                  * field cannot be concurrently changed by a
4126                                  * pmap operation.
4127                                  */
4128                                 m = ma[ncount - 1];
4129                                 vm_page_assert_sbusied(m);
4130                                 KASSERT(!pmap_page_is_write_mapped(m),
4131                                     ("zfs_putpages: page %p is not read-only",
4132                                     m));
4133                                 vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4134                                     pgoff);
4135                         }
4136                 } else {
4137                         len = 0;
4138                         ncount = 0;
4139                 }
4140                 if (ncount < pcount) {
4141                         for (i = ncount; i < pcount; i++) {
4142                                 rtvals[i] = zfs_vm_pagerret_bad;
4143                         }
4144                 }
4145         }
4146         zfs_vmobject_wunlock(object);
4147
4148         boolean_t commit = (flags & (zfs_vm_pagerput_sync |
4149             zfs_vm_pagerput_inval)) != 0 ||
4150             zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;
4151
4152         if (ncount == 0)
4153                 goto out;
4154
4155         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
4156             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
4157             (zp->z_projid != ZFS_DEFAULT_PROJID &&
4158             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
4159             zp->z_projid))) {
4160                 goto out;
4161         }
4162
4163         tx = dmu_tx_create(zfsvfs->z_os);
4164         dmu_tx_hold_write(tx, zp->z_id, off, len);
4165
4166         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4167         zfs_sa_upgrade_txholds(tx, zp);
4168         err = dmu_tx_assign(tx, TXG_WAIT);
4169         if (err != 0) {
4170                 dmu_tx_abort(tx);
4171                 goto out;
4172         }
4173
4174         if (zp->z_blksz < PAGE_SIZE) {
4175                 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4176                         tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4177                         va = zfs_map_page(ma[i], &sf);
4178                         dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4179                         zfs_unmap_page(sf);
4180                 }
4181         } else {
4182                 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4183         }
4184
4185         if (err == 0) {
4186                 uint64_t mtime[2], ctime[2];
4187                 sa_bulk_attr_t bulk[3];
4188                 int count = 0;
4189
4190                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4191                     &mtime, 16);
4192                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4193                     &ctime, 16);
4194                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4195                     &zp->z_pflags, 8);
4196                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
4197                 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4198                 ASSERT0(err);
4199                 /*
4200                  * XXX we should be passing a callback to undirty
4201                  * but that would make the locking messier
4202                  */
4203                 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
4204                     len, commit, B_FALSE, NULL, NULL);
4205
4206                 zfs_vmobject_wlock(object);
4207                 for (i = 0; i < ncount; i++) {
4208                         rtvals[i] = zfs_vm_pagerret_ok;
4209                         vm_page_undirty(ma[i]);
4210                 }
4211                 zfs_vmobject_wunlock(object);
4212                 VM_CNT_INC(v_vnodeout);
4213                 VM_CNT_ADD(v_vnodepgsout, ncount);
4214         }
4215         dmu_tx_commit(tx);
4216
4217 out:
4218         zfs_rangelock_exit(lr);
4219         if (commit)
4220                 zil_commit(zfsvfs->z_log, zp->z_id);
4221
4222         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
4223
4224         zfs_exit(zfsvfs, FTAG);
4225         return (rtvals[0]);
4226 }
4227
4228 #ifndef _SYS_SYSPROTO_H_
4229 struct vop_putpages_args {
4230         struct vnode *a_vp;
4231         vm_page_t *a_m;
4232         int a_count;
4233         int a_sync;
4234         int *a_rtvals;
4235 };
4236 #endif
4237
4238 static int
4239 zfs_freebsd_putpages(struct vop_putpages_args *ap)
4240 {
4241
4242         return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4243             ap->a_rtvals));
4244 }
4245
4246 #ifndef _SYS_SYSPROTO_H_
4247 struct vop_bmap_args {
4248         struct vnode *a_vp;
4249         daddr_t  a_bn;
4250         struct bufobj **a_bop;
4251         daddr_t *a_bnp;
4252         int *a_runp;
4253         int *a_runb;
4254 };
4255 #endif
4256
4257 static int
4258 zfs_freebsd_bmap(struct vop_bmap_args *ap)
4259 {
4260
4261         if (ap->a_bop != NULL)
4262                 *ap->a_bop = &ap->a_vp->v_bufobj;
4263         if (ap->a_bnp != NULL)
4264                 *ap->a_bnp = ap->a_bn;
4265         if (ap->a_runp != NULL)
4266                 *ap->a_runp = 0;
4267         if (ap->a_runb != NULL)
4268                 *ap->a_runb = 0;
4269
4270         return (0);
4271 }
4272
4273 #ifndef _SYS_SYSPROTO_H_
4274 struct vop_open_args {
4275         struct vnode *a_vp;
4276         int a_mode;
4277         struct ucred *a_cred;
4278         struct thread *a_td;
4279 };
4280 #endif
4281
4282 static int
4283 zfs_freebsd_open(struct vop_open_args *ap)
4284 {
4285         vnode_t *vp = ap->a_vp;
4286         znode_t *zp = VTOZ(vp);
4287         int error;
4288
4289         error = zfs_open(&vp, ap->a_mode, ap->a_cred);
4290         if (error == 0)
4291                 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4292         return (error);
4293 }
4294
#ifndef _SYS_SYSPROTO_H_
struct vop_close_args {
	struct vnode *a_vp;
	int  a_fflag;
	struct ucred *a_cred;
	struct thread *a_td;
};
#endif

/*
 * VOP_CLOSE entry point: forward the vnode, file flags and credentials to
 * zfs_close() and return its result.  The thread argument is not used.
 */
static int
zfs_freebsd_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct ucred *cred = ap->a_cred;

	return (zfs_close(vp, ap->a_fflag, 1, 0, cred));
}
4310
4311 #ifndef _SYS_SYSPROTO_H_
4312 struct vop_ioctl_args {
4313         struct vnode *a_vp;
4314         ulong_t a_command;
4315         caddr_t a_data;
4316         int a_fflag;
4317         struct ucred *cred;
4318         struct thread *td;
4319 };
4320 #endif
4321
4322 static int
4323 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
4324 {
4325
4326         return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4327             ap->a_fflag, ap->a_cred, NULL));
4328 }
4329
4330 static int
4331 ioflags(int ioflags)
4332 {
4333         int flags = 0;
4334
4335         if (ioflags & IO_APPEND)
4336                 flags |= O_APPEND;
4337         if (ioflags & IO_NDELAY)
4338                 flags |= O_NONBLOCK;
4339         if (ioflags & IO_DIRECT)
4340                 flags |= O_DIRECT;
4341         if (ioflags & IO_SYNC)
4342                 flags |= O_SYNC;
4343
4344         return (flags);
4345 }
4346
4347 #ifndef _SYS_SYSPROTO_H_
4348 struct vop_read_args {
4349         struct vnode *a_vp;
4350         struct uio *a_uio;
4351         int a_ioflag;
4352         struct ucred *a_cred;
4353 };
4354 #endif
4355
4356 static int
4357 zfs_freebsd_read(struct vop_read_args *ap)
4358 {
4359         zfs_uio_t uio;
4360         int error = 0;
4361         zfs_uio_init(&uio, ap->a_uio);
4362         error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4363             ap->a_cred);
4364         /*
4365          * XXX We occasionally get an EFAULT for Direct I/O reads on
4366          * FreeBSD 13. This still needs to be resolved. The EFAULT comes
4367          * from:
4368          * zfs_uio_get__dio_pages_alloc() ->
4369          * zfs_uio_get_dio_pages_impl() ->
4370          * zfs_uio_iov_step() ->
4371          * zfs_uio_get_user_pages().
4372          * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
4373          * read fails to map in the user pages (returning EFAULT) the
4374          * Direct I/O request is broken up into two separate IO requests
4375          * and issued separately using Direct I/O.
4376          */
4377 #ifdef ZFS_DEBUG
4378         if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
4379 #if 0
4380                 printf("%s(%d): Direct I/O read returning EFAULT "
4381                     "uio = %p, zfs_uio_offset(uio) = %lu "
4382                     "zfs_uio_resid(uio) = %lu\n",
4383                     __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
4384                     zfs_uio_resid(&uio));
4385 #endif
4386         }
4387
4388 #endif
4389         return (error);
4390 }
4391
4392 #ifndef _SYS_SYSPROTO_H_
4393 struct vop_write_args {
4394         struct vnode *a_vp;
4395         struct uio *a_uio;
4396         int a_ioflag;
4397         struct ucred *a_cred;
4398 };
4399 #endif
4400
4401 static int
4402 zfs_freebsd_write(struct vop_write_args *ap)
4403 {
4404         zfs_uio_t uio;
4405         zfs_uio_init(&uio, ap->a_uio);
4406         return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4407             ap->a_cred));
4408 }
4409
4410 /*
4411  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
4412  * the comment above cache_fplookup for details.
4413  */
4414 static int
4415 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
4416 {
4417         vnode_t *vp;
4418         znode_t *zp;
4419         uint64_t pflags;
4420
4421         vp = v->a_vp;
4422         zp = VTOZ_SMR(vp);
4423         if (__predict_false(zp == NULL))
4424                 return (EAGAIN);
4425         pflags = atomic_load_64(&zp->z_pflags);
4426         if (pflags & ZFS_AV_QUARANTINED)
4427                 return (EAGAIN);
4428         if (pflags & ZFS_XATTR)
4429                 return (EAGAIN);
4430         if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
4431                 return (EAGAIN);
4432         return (0);
4433 }
4434
4435 static int
4436 zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
4437 {
4438         vnode_t *vp;
4439         znode_t *zp;
4440         char *target;
4441
4442         vp = v->a_vp;
4443         zp = VTOZ_SMR(vp);
4444         if (__predict_false(zp == NULL)) {
4445                 return (EAGAIN);
4446         }
4447
4448         target = atomic_load_consume_ptr(&zp->z_cached_symlink);
4449         if (target == NULL) {
4450                 return (EAGAIN);
4451         }
4452         return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
4453 }
4454
4455 #ifndef _SYS_SYSPROTO_H_
4456 struct vop_access_args {
4457         struct vnode *a_vp;
4458         accmode_t a_accmode;
4459         struct ucred *a_cred;
4460         struct thread *a_td;
4461 };
4462 #endif
4463
4464 static int
4465 zfs_freebsd_access(struct vop_access_args *ap)
4466 {
4467         vnode_t *vp = ap->a_vp;
4468         znode_t *zp = VTOZ(vp);
4469         accmode_t accmode;
4470         int error = 0;
4471
4472
4473         if (ap->a_accmode == VEXEC) {
4474                 if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
4475                         return (0);
4476         }
4477
4478         /*
4479          * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4480          */
4481         accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4482         if (accmode != 0)
4483                 error = zfs_access(zp, accmode, 0, ap->a_cred);
4484
4485         /*
4486          * VADMIN has to be handled by vaccess().
4487          */
4488         if (error == 0) {
4489                 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4490                 if (accmode != 0) {
4491                         error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4492                             zp->z_gid, accmode, ap->a_cred);
4493                 }
4494         }
4495
4496         /*
4497          * For VEXEC, ensure that at least one execute bit is set for
4498          * non-directories.
4499          */
4500         if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4501             (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4502                 error = EACCES;
4503         }
4504
4505         return (error);
4506 }
4507
4508 #ifndef _SYS_SYSPROTO_H_
4509 struct vop_lookup_args {
4510         struct vnode *a_dvp;
4511         struct vnode **a_vpp;
4512         struct componentname *a_cnp;
4513 };
4514 #endif
4515
4516 static int
4517 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
4518 {
4519         struct componentname *cnp = ap->a_cnp;
4520         char nm[NAME_MAX + 1];
4521
4522         ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
4523         strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
4524
4525         return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4526             cnp->cn_cred, 0, cached));
4527 }
4528
4529 static int
4530 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
4531 {
4532
4533         return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
4534 }
4535
4536 #ifndef _SYS_SYSPROTO_H_
4537 struct vop_lookup_args {
4538         struct vnode *a_dvp;
4539         struct vnode **a_vpp;
4540         struct componentname *a_cnp;
4541 };
4542 #endif
4543
4544 static int
4545 zfs_cache_lookup(struct vop_lookup_args *ap)
4546 {
4547         zfsvfs_t *zfsvfs;
4548
4549         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4550         if (zfsvfs->z_use_namecache)
4551                 return (vfs_cache_lookup(ap));
4552         else
4553                 return (zfs_freebsd_lookup(ap, B_FALSE));
4554 }
4555
4556 #ifndef _SYS_SYSPROTO_H_
4557 struct vop_create_args {
4558         struct vnode *a_dvp;
4559         struct vnode **a_vpp;
4560         struct componentname *a_cnp;
4561         struct vattr *a_vap;
4562 };
4563 #endif
4564
4565 static int
4566 zfs_freebsd_create(struct vop_create_args *ap)
4567 {
4568         zfsvfs_t *zfsvfs;
4569         struct componentname *cnp = ap->a_cnp;
4570         vattr_t *vap = ap->a_vap;
4571         znode_t *zp = NULL;
4572         int rc, mode;
4573
4574 #if __FreeBSD_version < 1400068
4575         ASSERT(cnp->cn_flags & SAVENAME);
4576 #endif
4577
4578         vattr_init_mask(vap);
4579         mode = vap->va_mode & ALLPERMS;
4580         zfsvfs = ap->a_dvp->v_mount->mnt_data;
4581         *ap->a_vpp = NULL;
4582
4583         rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
4584             &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
4585         if (rc == 0)
4586                 *ap->a_vpp = ZTOV(zp);
4587         if (zfsvfs->z_use_namecache &&
4588             rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4589                 cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4590
4591         return (rc);
4592 }
4593
4594 #ifndef _SYS_SYSPROTO_H_
4595 struct vop_remove_args {
4596         struct vnode *a_dvp;
4597         struct vnode *a_vp;
4598         struct componentname *a_cnp;
4599 };
4600 #endif
4601
4602 static int
4603 zfs_freebsd_remove(struct vop_remove_args *ap)
4604 {
4605
4606 #if __FreeBSD_version < 1400068
4607         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4608 #endif
4609
4610         return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
4611             ap->a_cnp->cn_cred));
4612 }
4613
4614 #ifndef _SYS_SYSPROTO_H_
4615 struct vop_mkdir_args {
4616         struct vnode *a_dvp;
4617         struct vnode **a_vpp;
4618         struct componentname *a_cnp;
4619         struct vattr *a_vap;
4620 };
4621 #endif
4622
4623 static int
4624 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
4625 {
4626         vattr_t *vap = ap->a_vap;
4627         znode_t *zp = NULL;
4628         int rc;
4629
4630 #if __FreeBSD_version < 1400068
4631         ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4632 #endif
4633
4634         vattr_init_mask(vap);
4635         *ap->a_vpp = NULL;
4636
4637         rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
4638             ap->a_cnp->cn_cred, 0, NULL, NULL);
4639
4640         if (rc == 0)
4641                 *ap->a_vpp = ZTOV(zp);
4642         return (rc);
4643 }
4644
4645 #ifndef _SYS_SYSPROTO_H_
4646 struct vop_rmdir_args {
4647         struct vnode *a_dvp;
4648         struct vnode *a_vp;
4649         struct componentname *a_cnp;
4650 };
4651 #endif
4652
4653 static int
4654 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
4655 {
4656         struct componentname *cnp = ap->a_cnp;
4657
4658 #if __FreeBSD_version < 1400068
4659         ASSERT(cnp->cn_flags & SAVENAME);
4660 #endif
4661
4662         return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
4663 }
4664
4665 #ifndef _SYS_SYSPROTO_H_
4666 struct vop_readdir_args {
4667         struct vnode *a_vp;
4668         struct uio *a_uio;
4669         struct ucred *a_cred;
4670         int *a_eofflag;
4671         int *a_ncookies;
4672         cookie_t **a_cookies;
4673 };
4674 #endif
4675
4676 static int
4677 zfs_freebsd_readdir(struct vop_readdir_args *ap)
4678 {
4679         zfs_uio_t uio;
4680         zfs_uio_init(&uio, ap->a_uio);
4681         return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
4682             ap->a_ncookies, ap->a_cookies));
4683 }
4684
4685 #ifndef _SYS_SYSPROTO_H_
4686 struct vop_fsync_args {
4687         struct vnode *a_vp;
4688         int a_waitfor;
4689         struct thread *a_td;
4690 };
4691 #endif
4692
4693 static int
4694 zfs_freebsd_fsync(struct vop_fsync_args *ap)
4695 {
4696
4697         return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
4698 }
4699
4700 #ifndef _SYS_SYSPROTO_H_
4701 struct vop_getattr_args {
4702         struct vnode *a_vp;
4703         struct vattr *a_vap;
4704         struct ucred *a_cred;
4705 };
4706 #endif
4707
4708 static int
4709 zfs_freebsd_getattr(struct vop_getattr_args *ap)
4710 {
4711         vattr_t *vap = ap->a_vap;
4712         xvattr_t xvap;
4713         ulong_t fflags = 0;
4714         int error;
4715
4716         xva_init(&xvap);
4717         xvap.xva_vattr = *vap;
4718         xvap.xva_vattr.va_mask |= AT_XVATTR;
4719
4720         /* Convert chflags into ZFS-type flags. */
4721         /* XXX: what about SF_SETTABLE?. */
4722         XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4723         XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4724         XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4725         XVA_SET_REQ(&xvap, XAT_NODUMP);
4726         XVA_SET_REQ(&xvap, XAT_READONLY);
4727         XVA_SET_REQ(&xvap, XAT_ARCHIVE);
4728         XVA_SET_REQ(&xvap, XAT_SYSTEM);
4729         XVA_SET_REQ(&xvap, XAT_HIDDEN);
4730         XVA_SET_REQ(&xvap, XAT_REPARSE);
4731         XVA_SET_REQ(&xvap, XAT_OFFLINE);
4732         XVA_SET_REQ(&xvap, XAT_SPARSE);
4733
4734         error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
4735         if (error != 0)
4736                 return (error);
4737
4738         /* Convert ZFS xattr into chflags. */
4739 #define FLAG_CHECK(fflag, xflag, xfield)        do {                    \
4740         if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)             \
4741                 fflags |= (fflag);                                      \
4742 } while (0)
4743         FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4744             xvap.xva_xoptattrs.xoa_immutable);
4745         FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4746             xvap.xva_xoptattrs.xoa_appendonly);
4747         FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4748             xvap.xva_xoptattrs.xoa_nounlink);
4749         FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
4750             xvap.xva_xoptattrs.xoa_archive);
4751         FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4752             xvap.xva_xoptattrs.xoa_nodump);
4753         FLAG_CHECK(UF_READONLY, XAT_READONLY,
4754             xvap.xva_xoptattrs.xoa_readonly);
4755         FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
4756             xvap.xva_xoptattrs.xoa_system);
4757         FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
4758             xvap.xva_xoptattrs.xoa_hidden);
4759         FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
4760             xvap.xva_xoptattrs.xoa_reparse);
4761         FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
4762             xvap.xva_xoptattrs.xoa_offline);
4763         FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
4764             xvap.xva_xoptattrs.xoa_sparse);
4765
4766 #undef  FLAG_CHECK
4767         *vap = xvap.xva_vattr;
4768         vap->va_flags = fflags;
4769         return (0);
4770 }
4771
4772 #ifndef _SYS_SYSPROTO_H_
4773 struct vop_setattr_args {
4774         struct vnode *a_vp;
4775         struct vattr *a_vap;
4776         struct ucred *a_cred;
4777 };
4778 #endif
4779
4780 static int
4781 zfs_freebsd_setattr(struct vop_setattr_args *ap)
4782 {
4783         vnode_t *vp = ap->a_vp;
4784         vattr_t *vap = ap->a_vap;
4785         cred_t *cred = ap->a_cred;
4786         xvattr_t xvap;
4787         ulong_t fflags;
4788         uint64_t zflags;
4789
4790         vattr_init_mask(vap);
4791         vap->va_mask &= ~AT_NOSET;
4792
4793         xva_init(&xvap);
4794         xvap.xva_vattr = *vap;
4795
4796         zflags = VTOZ(vp)->z_pflags;
4797
4798         if (vap->va_flags != VNOVAL) {
4799                 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
4800                 int error;
4801
4802                 if (zfsvfs->z_use_fuids == B_FALSE)
4803                         return (EOPNOTSUPP);
4804
4805                 fflags = vap->va_flags;
4806                 /*
4807                  * XXX KDM
4808                  * We need to figure out whether it makes sense to allow
4809                  * UF_REPARSE through, since we don't really have other
4810                  * facilities to handle reparse points and zfs_setattr()
4811                  * doesn't currently allow setting that attribute anyway.
4812                  */
4813                 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
4814                     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
4815                     UF_OFFLINE|UF_SPARSE)) != 0)
4816                         return (EOPNOTSUPP);
4817                 /*
4818                  * Unprivileged processes are not permitted to unset system
4819                  * flags, or modify flags if any system flags are set.
4820                  * Privileged non-jail processes may not modify system flags
4821                  * if securelevel > 0 and any existing system flags are set.
4822                  * Privileged jail processes behave like privileged non-jail
4823                  * processes if the PR_ALLOW_CHFLAGS permission bit is set;
4824                  * otherwise, they behave like unprivileged processes.
4825                  */
4826                 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
4827                     priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
4828                         if (zflags &
4829                             (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4830                                 error = securelevel_gt(cred, 0);
4831                                 if (error != 0)
4832                                         return (error);
4833                         }
4834                 } else {
4835                         /*
4836                          * Callers may only modify the file flags on
4837                          * objects they have VADMIN rights for.
4838                          */
4839                         if ((error = VOP_ACCESS(vp, VADMIN, cred,
4840                             curthread)) != 0)
4841                                 return (error);
4842                         if (zflags &
4843                             (ZFS_IMMUTABLE | ZFS_APPENDONLY |
4844                             ZFS_NOUNLINK)) {
4845                                 return (EPERM);
4846                         }
4847                         if (fflags &
4848                             (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4849                                 return (EPERM);
4850                         }
4851                 }
4852
4853 #define FLAG_CHANGE(fflag, zflag, xflag, xfield)        do {            \
4854         if (((fflags & (fflag)) && !(zflags & (zflag))) ||              \
4855             ((zflags & (zflag)) && !(fflags & (fflag)))) {              \
4856                 XVA_SET_REQ(&xvap, (xflag));                            \
4857                 (xfield) = ((fflags & (fflag)) != 0);                   \
4858         }                                                               \
4859 } while (0)
4860                 /* Convert chflags into ZFS-type flags. */
4861                 /* XXX: what about SF_SETTABLE?. */
4862                 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4863                     xvap.xva_xoptattrs.xoa_immutable);
4864                 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4865                     xvap.xva_xoptattrs.xoa_appendonly);
4866                 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4867                     xvap.xva_xoptattrs.xoa_nounlink);
4868                 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
4869                     xvap.xva_xoptattrs.xoa_archive);
4870                 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4871                     xvap.xva_xoptattrs.xoa_nodump);
4872                 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
4873                     xvap.xva_xoptattrs.xoa_readonly);
4874                 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
4875                     xvap.xva_xoptattrs.xoa_system);
4876                 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
4877                     xvap.xva_xoptattrs.xoa_hidden);
4878                 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
4879                     xvap.xva_xoptattrs.xoa_reparse);
4880                 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
4881                     xvap.xva_xoptattrs.xoa_offline);
4882                 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
4883                     xvap.xva_xoptattrs.xoa_sparse);
4884 #undef  FLAG_CHANGE
4885         }
4886         if (vap->va_birthtime.tv_sec != VNOVAL) {
4887                 xvap.xva_vattr.va_mask |= AT_XVATTR;
4888                 XVA_SET_REQ(&xvap, XAT_CREATETIME);
4889         }
4890         return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
4891 }
4892
4893 #ifndef _SYS_SYSPROTO_H_
4894 struct vop_rename_args {
4895         struct vnode *a_fdvp;
4896         struct vnode *a_fvp;
4897         struct componentname *a_fcnp;
4898         struct vnode *a_tdvp;
4899         struct vnode *a_tvp;
4900         struct componentname *a_tcnp;
4901 };
4902 #endif
4903
4904 static int
4905 zfs_freebsd_rename(struct vop_rename_args *ap)
4906 {
4907         vnode_t *fdvp = ap->a_fdvp;
4908         vnode_t *fvp = ap->a_fvp;
4909         vnode_t *tdvp = ap->a_tdvp;
4910         vnode_t *tvp = ap->a_tvp;
4911         int error;
4912
4913 #if __FreeBSD_version < 1400068
4914         ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4915         ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4916 #endif
4917
4918         error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
4919             ap->a_tcnp, ap->a_fcnp->cn_cred);
4920
4921         vrele(fdvp);
4922         vrele(fvp);
4923         vrele(tdvp);
4924         if (tvp != NULL)
4925                 vrele(tvp);
4926
4927         return (error);
4928 }
4929
4930 #ifndef _SYS_SYSPROTO_H_
4931 struct vop_symlink_args {
4932         struct vnode *a_dvp;
4933         struct vnode **a_vpp;
4934         struct componentname *a_cnp;
4935         struct vattr *a_vap;
4936         char *a_target;
4937 };
4938 #endif
4939
4940 static int
4941 zfs_freebsd_symlink(struct vop_symlink_args *ap)
4942 {
4943         struct componentname *cnp = ap->a_cnp;
4944         vattr_t *vap = ap->a_vap;
4945         znode_t *zp = NULL;
4946         char *symlink;
4947         size_t symlink_len;
4948         int rc;
4949
4950 #if __FreeBSD_version < 1400068
4951         ASSERT(cnp->cn_flags & SAVENAME);
4952 #endif
4953
4954         vap->va_type = VLNK;    /* FreeBSD: Syscall only sets va_mode. */
4955         vattr_init_mask(vap);
4956         *ap->a_vpp = NULL;
4957
4958         rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
4959             ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
4960         if (rc == 0) {
4961                 *ap->a_vpp = ZTOV(zp);
4962                 ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
4963                 MPASS(zp->z_cached_symlink == NULL);
4964                 symlink_len = strlen(ap->a_target);
4965                 symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
4966                 if (symlink != NULL) {
4967                         memcpy(symlink, ap->a_target, symlink_len);
4968                         symlink[symlink_len] = '\0';
4969                         atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
4970                             (uintptr_t)symlink);
4971                 }
4972         }
4973         return (rc);
4974 }
4975
4976 #ifndef _SYS_SYSPROTO_H_
4977 struct vop_readlink_args {
4978         struct vnode *a_vp;
4979         struct uio *a_uio;
4980         struct ucred *a_cred;
4981 };
4982 #endif
4983
4984 static int
4985 zfs_freebsd_readlink(struct vop_readlink_args *ap)
4986 {
4987         zfs_uio_t uio;
4988         int error;
4989         znode_t *zp = VTOZ(ap->a_vp);
4990         char *symlink, *base;
4991         size_t symlink_len;
4992         bool trycache;
4993
4994         zfs_uio_init(&uio, ap->a_uio);
4995         trycache = false;
4996         if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
4997             zfs_uio_iovcnt(&uio) == 1) {
4998                 base = zfs_uio_iovbase(&uio, 0);
4999                 symlink_len = zfs_uio_iovlen(&uio, 0);
5000                 trycache = true;
5001         }
5002         error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
5003         if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
5004             error != 0 || !trycache) {
5005                 return (error);
5006         }
5007         symlink_len -= zfs_uio_resid(&uio);
5008         symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
5009         if (symlink != NULL) {
5010                 memcpy(symlink, base, symlink_len);
5011                 symlink[symlink_len] = '\0';
5012                 if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
5013                     (uintptr_t)NULL, (uintptr_t)symlink)) {
5014                         cache_symlink_free(symlink, symlink_len + 1);
5015                 }
5016         }
5017         return (error);
5018 }
5019
5020 #ifndef _SYS_SYSPROTO_H_
5021 struct vop_link_args {
5022         struct vnode *a_tdvp;
5023         struct vnode *a_vp;
5024         struct componentname *a_cnp;
5025 };
5026 #endif
5027
5028 static int
5029 zfs_freebsd_link(struct vop_link_args *ap)
5030 {
5031         struct componentname *cnp = ap->a_cnp;
5032         vnode_t *vp = ap->a_vp;
5033         vnode_t *tdvp = ap->a_tdvp;
5034
5035         if (tdvp->v_mount != vp->v_mount)
5036                 return (EXDEV);
5037
5038 #if __FreeBSD_version < 1400068
5039         ASSERT(cnp->cn_flags & SAVENAME);
5040 #endif
5041
5042         return (zfs_link(VTOZ(tdvp), VTOZ(vp),
5043             cnp->cn_nameptr, cnp->cn_cred, 0));
5044 }
5045
5046 #ifndef _SYS_SYSPROTO_H_
5047 struct vop_inactive_args {
5048         struct vnode *a_vp;
5049         struct thread *a_td;
5050 };
5051 #endif
5052
5053 static int
5054 zfs_freebsd_inactive(struct vop_inactive_args *ap)
5055 {
5056         vnode_t *vp = ap->a_vp;
5057
5058         zfs_inactive(vp, curthread->td_ucred, NULL);
5059         return (0);
5060 }
5061
5062 #ifndef _SYS_SYSPROTO_H_
5063 struct vop_need_inactive_args {
5064         struct vnode *a_vp;
5065         struct thread *a_td;
5066 };
5067 #endif
5068
5069 static int
5070 zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
5071 {
5072         vnode_t *vp = ap->a_vp;
5073         znode_t *zp = VTOZ(vp);
5074         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5075         int need;
5076
5077         if (vn_need_pageq_flush(vp))
5078                 return (1);
5079
5080         if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
5081                 return (1);
5082         need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
5083         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5084
5085         return (need);
5086 }
5087
5088 #ifndef _SYS_SYSPROTO_H_
5089 struct vop_reclaim_args {
5090         struct vnode *a_vp;
5091         struct thread *a_td;
5092 };
5093 #endif
5094
5095 static int
5096 zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
5097 {
5098         vnode_t *vp = ap->a_vp;
5099         znode_t *zp = VTOZ(vp);
5100         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5101
5102         ASSERT3P(zp, !=, NULL);
5103
5104         /*
5105          * z_teardown_inactive_lock protects from a race with
5106          * zfs_znode_dmu_fini in zfsvfs_teardown during
5107          * force unmount.
5108          */
5109         ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
5110         if (zp->z_sa_hdl == NULL)
5111                 zfs_znode_free(zp);
5112         else
5113                 zfs_zinactive(zp);
5114         ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5115
5116         vp->v_data = NULL;
5117         return (0);
5118 }
5119
#ifndef _SYS_SYSPROTO_H_
struct vop_fid_args {
	struct vnode *a_vp;
	struct fid *a_fid;
};
#endif

/*
 * VOP_FID entry point.
 *
 * Passes the vnode and the caller's fid buffer straight to zfs_fid() and
 * returns its result.
 */
static int
zfs_freebsd_fid(struct vop_fid_args *ap)
{
	struct fid *fidp = ap->a_fid;

	return (zfs_fid(ap->a_vp, (void *)fidp, NULL));
}
5133
5134
5135 #ifndef _SYS_SYSPROTO_H_
5136 struct vop_pathconf_args {
5137         struct vnode *a_vp;
5138         int a_name;
5139         register_t *a_retval;
5140 } *ap;
5141 #endif
5142
5143 static int
5144 zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
5145 {
5146         ulong_t val;
5147         int error;
5148
5149         error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
5150             curthread->td_ucred, NULL);
5151         if (error == 0) {
5152                 *ap->a_retval = val;
5153                 return (error);
5154         }
5155         if (error != EOPNOTSUPP)
5156                 return (error);
5157
5158         switch (ap->a_name) {
5159         case _PC_NAME_MAX:
5160                 *ap->a_retval = NAME_MAX;
5161                 return (0);
5162 #if __FreeBSD_version >= 1400032
5163         case _PC_DEALLOC_PRESENT:
5164                 *ap->a_retval = 1;
5165                 return (0);
5166 #endif
5167         case _PC_PIPE_BUF:
5168                 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5169                         *ap->a_retval = PIPE_BUF;
5170                         return (0);
5171                 }
5172                 return (EINVAL);
5173         default:
5174                 return (vop_stdpathconf(ap));
5175         }
5176 }
5177
/*
 * Preferred name encoding for extended attributes; handed to the
 * *_impl() helpers as their "compat" argument.
 */
static int zfs_xattr_compat = 1;

/*
 * Validate a caller-supplied extended attribute name.
 *
 * Returns 0 when the name is acceptable, or EINVAL when it contains a '/'
 * or when ZFS_XA_NS_PREFIX_FORBIDDEN() matches it.
 */
static int
zfs_check_attrname(const char *name)
{
	const char *slash = strchr(name, '/');

	/* A '/' anywhere in the attribute name is rejected. */
	if (slash != NULL)
		return (SET_ERROR(EINVAL));

	/* So is a name that begins with a namespace prefix. */
	if (ZFS_XA_NS_PREFIX_FORBIDDEN(name))
		return (SET_ERROR(EINVAL));

	return (0);
}
5191
5192 /*
5193  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5194  * extended attribute name:
5195  *
5196  *      NAMESPACE       XATTR_COMPAT    PREFIX
5197  *      system          *               freebsd:system:
5198  *      user            1               (none, can be used to access ZFS
5199  *                                      fsattr(5) attributes created on Solaris)
5200  *      user            0               user.
5201  */
5202 static int
5203 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5204     size_t size, boolean_t compat)
5205 {
5206         const char *namespace, *prefix, *suffix;
5207
5208         memset(attrname, 0, size);
5209
5210         switch (attrnamespace) {
5211         case EXTATTR_NAMESPACE_USER:
5212                 if (compat) {
5213                         /*
5214                          * This is the default namespace by which we can access
5215                          * all attributes created on Solaris.
5216                          */
5217                         prefix = namespace = suffix = "";
5218                 } else {
5219                         /*
5220                          * This is compatible with the user namespace encoding
5221                          * on Linux prior to xattr_compat, but nothing
5222                          * else.
5223                          */
5224                         prefix = "";
5225                         namespace = "user";
5226                         suffix = ".";
5227                 }
5228                 break;
5229         case EXTATTR_NAMESPACE_SYSTEM:
5230                 prefix = "freebsd:";
5231                 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5232                 suffix = ":";
5233                 break;
5234         case EXTATTR_NAMESPACE_EMPTY:
5235         default:
5236                 return (SET_ERROR(EINVAL));
5237         }
5238         if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5239             name) >= size) {
5240                 return (SET_ERROR(ENAMETOOLONG));
5241         }
5242         return (0);
5243 }
5244
5245 static int
5246 zfs_ensure_xattr_cached(znode_t *zp)
5247 {
5248         int error = 0;
5249
5250         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5251
5252         if (zp->z_xattr_cached != NULL)
5253                 return (0);
5254
5255         if (rw_write_held(&zp->z_xattr_lock))
5256                 return (zfs_sa_get_xattr(zp));
5257
5258         if (!rw_tryupgrade(&zp->z_xattr_lock)) {
5259                 rw_exit(&zp->z_xattr_lock);
5260                 rw_enter(&zp->z_xattr_lock, RW_WRITER);
5261         }
5262         if (zp->z_xattr_cached == NULL)
5263                 error = zfs_sa_get_xattr(zp);
5264         rw_downgrade(&zp->z_xattr_lock);
5265         return (error);
5266 }
5267
5268 #ifndef _SYS_SYSPROTO_H_
5269 struct vop_getextattr {
5270         IN struct vnode *a_vp;
5271         IN int a_attrnamespace;
5272         IN const char *a_name;
5273         INOUT struct uio *a_uio;
5274         OUT size_t *a_size;
5275         IN struct ucred *a_cred;
5276         IN struct thread *a_td;
5277 };
5278 #endif
5279
5280 static int
5281 zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
5282 {
5283         struct thread *td = ap->a_td;
5284         struct nameidata nd;
5285         struct vattr va;
5286         vnode_t *xvp = NULL, *vp;
5287         int error, flags;
5288
5289         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5290             LOOKUP_XATTR, B_FALSE);
5291         if (error != 0)
5292                 return (error);
5293
5294         flags = FREAD;
5295 #if __FreeBSD_version < 1400043
5296         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5297             xvp, td);
5298 #else
5299         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5300 #endif
5301         error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
5302         if (error != 0)
5303                 return (SET_ERROR(error));
5304         vp = nd.ni_vp;
5305         NDFREE_PNBUF(&nd);
5306
5307         if (ap->a_size != NULL) {
5308                 error = VOP_GETATTR(vp, &va, ap->a_cred);
5309                 if (error == 0)
5310                         *ap->a_size = (size_t)va.va_size;
5311         } else if (ap->a_uio != NULL)
5312                 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5313
5314         VOP_UNLOCK(vp);
5315         vn_close(vp, flags, ap->a_cred, td);
5316         return (error);
5317 }
5318
5319 static int
5320 zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
5321 {
5322         znode_t *zp = VTOZ(ap->a_vp);
5323         uchar_t *nv_value;
5324         uint_t nv_size;
5325         int error;
5326
5327         error = zfs_ensure_xattr_cached(zp);
5328         if (error != 0)
5329                 return (error);
5330
5331         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5332         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5333
5334         error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
5335             &nv_value, &nv_size);
5336         if (error != 0)
5337                 return (SET_ERROR(error));
5338
5339         if (ap->a_size != NULL)
5340                 *ap->a_size = nv_size;
5341         else if (ap->a_uio != NULL)
5342                 error = uiomove(nv_value, nv_size, ap->a_uio);
5343         if (error != 0)
5344                 return (SET_ERROR(error));
5345
5346         return (0);
5347 }
5348
5349 static int
5350 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
5351 {
5352         znode_t *zp = VTOZ(ap->a_vp);
5353         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5354         char attrname[EXTATTR_MAXNAMELEN+1];
5355         int error;
5356
5357         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5358             sizeof (attrname), compat);
5359         if (error != 0)
5360                 return (error);
5361
5362         error = ENOENT;
5363         if (zfsvfs->z_use_sa && zp->z_is_sa)
5364                 error = zfs_getextattr_sa(ap, attrname);
5365         if (error == ENOENT)
5366                 error = zfs_getextattr_dir(ap, attrname);
5367         return (error);
5368 }
5369
5370 /*
5371  * Vnode operation to retrieve a named extended attribute.
5372  */
5373 static int
5374 zfs_getextattr(struct vop_getextattr_args *ap)
5375 {
5376         znode_t *zp = VTOZ(ap->a_vp);
5377         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5378         int error;
5379
5380         /*
5381          * If the xattr property is off, refuse the request.
5382          */
5383         if (!(zfsvfs->z_flags & ZSB_XATTR))
5384                 return (SET_ERROR(EOPNOTSUPP));
5385
5386         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5387             ap->a_cred, ap->a_td, VREAD);
5388         if (error != 0)
5389                 return (SET_ERROR(error));
5390
5391         error = zfs_check_attrname(ap->a_name);
5392         if (error != 0)
5393                 return (error);
5394
5395         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5396                 return (error);
5397         error = ENOENT;
5398         rw_enter(&zp->z_xattr_lock, RW_READER);
5399
5400         error = zfs_getextattr_impl(ap, zfs_xattr_compat);
5401         if ((error == ENOENT || error == ENOATTR) &&
5402             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5403                 /*
5404                  * Fall back to the alternate namespace format if we failed to
5405                  * find a user xattr.
5406                  */
5407                 error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
5408         }
5409
5410         rw_exit(&zp->z_xattr_lock);
5411         zfs_exit(zfsvfs, FTAG);
5412         if (error == ENOENT)
5413                 error = SET_ERROR(ENOATTR);
5414         return (error);
5415 }
5416
5417 #ifndef _SYS_SYSPROTO_H_
5418 struct vop_deleteextattr {
5419         IN struct vnode *a_vp;
5420         IN int a_attrnamespace;
5421         IN const char *a_name;
5422         IN struct ucred *a_cred;
5423         IN struct thread *a_td;
5424 };
5425 #endif
5426
5427 static int
5428 zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
5429 {
5430         struct nameidata nd;
5431         vnode_t *xvp = NULL, *vp;
5432         int error;
5433
5434         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5435             LOOKUP_XATTR, B_FALSE);
5436         if (error != 0)
5437                 return (error);
5438
5439 #if __FreeBSD_version < 1400043
5440         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5441             UIO_SYSSPACE, attrname, xvp, ap->a_td);
5442 #else
5443         NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5444             UIO_SYSSPACE, attrname, xvp);
5445 #endif
5446         error = namei(&nd);
5447         if (error != 0)
5448                 return (SET_ERROR(error));
5449
5450         vp = nd.ni_vp;
5451         error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5452         NDFREE_PNBUF(&nd);
5453
5454         vput(nd.ni_dvp);
5455         if (vp == nd.ni_dvp)
5456                 vrele(vp);
5457         else
5458                 vput(vp);
5459
5460         return (error);
5461 }
5462
5463 static int
5464 zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
5465 {
5466         znode_t *zp = VTOZ(ap->a_vp);
5467         nvlist_t *nvl;
5468         int error;
5469
5470         error = zfs_ensure_xattr_cached(zp);
5471         if (error != 0)
5472                 return (error);
5473
5474         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5475         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5476
5477         nvl = zp->z_xattr_cached;
5478         error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
5479         if (error != 0)
5480                 error = SET_ERROR(error);
5481         else
5482                 error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
5483         if (error != 0) {
5484                 zp->z_xattr_cached = NULL;
5485                 nvlist_free(nvl);
5486         }
5487         return (error);
5488 }
5489
5490 static int
5491 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
5492 {
5493         znode_t *zp = VTOZ(ap->a_vp);
5494         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5495         char attrname[EXTATTR_MAXNAMELEN+1];
5496         int error;
5497
5498         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5499             sizeof (attrname), compat);
5500         if (error != 0)
5501                 return (error);
5502
5503         error = ENOENT;
5504         if (zfsvfs->z_use_sa && zp->z_is_sa)
5505                 error = zfs_deleteextattr_sa(ap, attrname);
5506         if (error == ENOENT)
5507                 error = zfs_deleteextattr_dir(ap, attrname);
5508         return (error);
5509 }
5510
5511 /*
5512  * Vnode operation to remove a named attribute.
5513  */
5514 static int
5515 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5516 {
5517         znode_t *zp = VTOZ(ap->a_vp);
5518         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5519         int error;
5520
5521         /*
5522          * If the xattr property is off, refuse the request.
5523          */
5524         if (!(zfsvfs->z_flags & ZSB_XATTR))
5525                 return (SET_ERROR(EOPNOTSUPP));
5526
5527         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5528             ap->a_cred, ap->a_td, VWRITE);
5529         if (error != 0)
5530                 return (SET_ERROR(error));
5531
5532         error = zfs_check_attrname(ap->a_name);
5533         if (error != 0)
5534                 return (error);
5535
5536         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5537                 return (error);
5538         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5539
5540         error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
5541         if ((error == ENOENT || error == ENOATTR) &&
5542             ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5543                 /*
5544                  * Fall back to the alternate namespace format if we failed to
5545                  * find a user xattr.
5546                  */
5547                 error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
5548         }
5549
5550         rw_exit(&zp->z_xattr_lock);
5551         zfs_exit(zfsvfs, FTAG);
5552         if (error == ENOENT)
5553                 error = SET_ERROR(ENOATTR);
5554         return (error);
5555 }
5556
5557 #ifndef _SYS_SYSPROTO_H_
5558 struct vop_setextattr {
5559         IN struct vnode *a_vp;
5560         IN int a_attrnamespace;
5561         IN const char *a_name;
5562         INOUT struct uio *a_uio;
5563         IN struct ucred *a_cred;
5564         IN struct thread *a_td;
5565 };
5566 #endif
5567
5568 static int
5569 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
5570 {
5571         struct thread *td = ap->a_td;
5572         struct nameidata nd;
5573         struct vattr va;
5574         vnode_t *xvp = NULL, *vp;
5575         int error, flags;
5576
5577         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5578             LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
5579         if (error != 0)
5580                 return (error);
5581
5582         flags = FFLAGS(O_WRONLY | O_CREAT);
5583 #if __FreeBSD_version < 1400043
5584         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
5585 #else
5586         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5587 #endif
5588         error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
5589             NULL);
5590         if (error != 0)
5591                 return (SET_ERROR(error));
5592         vp = nd.ni_vp;
5593         NDFREE_PNBUF(&nd);
5594
5595         VATTR_NULL(&va);
5596         va.va_size = 0;
5597         error = VOP_SETATTR(vp, &va, ap->a_cred);
5598         if (error == 0)
5599                 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5600
5601         VOP_UNLOCK(vp);
5602         vn_close(vp, flags, ap->a_cred, td);
5603         return (error);
5604 }
5605
5606 static int
5607 zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
5608 {
5609         znode_t *zp = VTOZ(ap->a_vp);
5610         nvlist_t *nvl;
5611         size_t sa_size;
5612         int error;
5613
5614         error = zfs_ensure_xattr_cached(zp);
5615         if (error != 0)
5616                 return (error);
5617
5618         ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
5619         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5620
5621         nvl = zp->z_xattr_cached;
5622         size_t entry_size = ap->a_uio->uio_resid;
5623         if (entry_size > DXATTR_MAX_ENTRY_SIZE)
5624                 return (SET_ERROR(EFBIG));
5625         error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
5626         if (error != 0)
5627                 return (SET_ERROR(error));
5628         if (sa_size > DXATTR_MAX_SA_SIZE)
5629                 return (SET_ERROR(EFBIG));
5630         uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
5631         error = uiomove(buf, entry_size, ap->a_uio);
5632         if (error != 0) {
5633                 error = SET_ERROR(error);
5634         } else {
5635                 error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
5636                 if (error != 0)
5637                         error = SET_ERROR(error);
5638         }
5639         if (error == 0)
5640                 error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
5641         kmem_free(buf, entry_size);
5642         if (error != 0) {
5643                 zp->z_xattr_cached = NULL;
5644                 nvlist_free(nvl);
5645         }
5646         return (error);
5647 }
5648
5649 static int
5650 zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
5651 {
5652         znode_t *zp = VTOZ(ap->a_vp);
5653         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5654         char attrname[EXTATTR_MAXNAMELEN+1];
5655         int error;
5656
5657         error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5658             sizeof (attrname), compat);
5659         if (error != 0)
5660                 return (error);
5661
5662         struct vop_deleteextattr_args vda = {
5663                 .a_vp = ap->a_vp,
5664                 .a_attrnamespace = ap->a_attrnamespace,
5665                 .a_name = ap->a_name,
5666                 .a_cred = ap->a_cred,
5667                 .a_td = ap->a_td,
5668         };
5669         error = ENOENT;
5670         if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
5671                 error = zfs_setextattr_sa(ap, attrname);
5672                 if (error == 0) {
5673                         /*
5674                          * Successfully put into SA, we need to clear the one
5675                          * in dir if present.
5676                          */
5677                         zfs_deleteextattr_dir(&vda, attrname);
5678                 }
5679         }
5680         if (error != 0) {
5681                 error = zfs_setextattr_dir(ap, attrname);
5682                 if (error == 0 && zp->z_is_sa) {
5683                         /*
5684                          * Successfully put into dir, we need to clear the one
5685                          * in SA if present.
5686                          */
5687                         zfs_deleteextattr_sa(&vda, attrname);
5688                 }
5689         }
5690         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5691                 /*
5692                  * Also clear all versions of the alternate compat name.
5693                  */
5694                 zfs_deleteextattr_impl(&vda, !compat);
5695         }
5696         return (error);
5697 }
5698
5699 /*
5700  * Vnode operation to set a named attribute.
5701  */
5702 static int
5703 zfs_setextattr(struct vop_setextattr_args *ap)
5704 {
5705         znode_t *zp = VTOZ(ap->a_vp);
5706         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5707         int error;
5708
5709         /*
5710          * If the xattr property is off, refuse the request.
5711          */
5712         if (!(zfsvfs->z_flags & ZSB_XATTR))
5713                 return (SET_ERROR(EOPNOTSUPP));
5714
5715         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5716             ap->a_cred, ap->a_td, VWRITE);
5717         if (error != 0)
5718                 return (SET_ERROR(error));
5719
5720         error = zfs_check_attrname(ap->a_name);
5721         if (error != 0)
5722                 return (error);
5723
5724         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5725                 return (error);
5726         rw_enter(&zp->z_xattr_lock, RW_WRITER);
5727
5728         error = zfs_setextattr_impl(ap, zfs_xattr_compat);
5729
5730         rw_exit(&zp->z_xattr_lock);
5731         zfs_exit(zfsvfs, FTAG);
5732         return (error);
5733 }
5734
5735 #ifndef _SYS_SYSPROTO_H_
5736 struct vop_listextattr {
5737         IN struct vnode *a_vp;
5738         IN int a_attrnamespace;
5739         INOUT struct uio *a_uio;
5740         OUT size_t *a_size;
5741         IN struct ucred *a_cred;
5742         IN struct thread *a_td;
5743 };
5744 #endif
5745
5746 static int
5747 zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
5748 {
5749         struct thread *td = ap->a_td;
5750         struct nameidata nd;
5751         uint8_t dirbuf[sizeof (struct dirent)];
5752         struct iovec aiov;
5753         struct uio auio;
5754         vnode_t *xvp = NULL, *vp;
5755         int error, eof;
5756
5757         error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5758             LOOKUP_XATTR, B_FALSE);
5759         if (error != 0) {
5760                 /*
5761                  * ENOATTR means that the EA directory does not yet exist,
5762                  * i.e. there are no extended attributes there.
5763                  */
5764                 if (error == ENOATTR)
5765                         error = 0;
5766                 return (error);
5767         }
5768
5769 #if __FreeBSD_version < 1400043
5770         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5771             UIO_SYSSPACE, ".", xvp, td);
5772 #else
5773         NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5774             UIO_SYSSPACE, ".", xvp);
5775 #endif
5776         error = namei(&nd);
5777         if (error != 0)
5778                 return (SET_ERROR(error));
5779         vp = nd.ni_vp;
5780         NDFREE_PNBUF(&nd);
5781
5782         auio.uio_iov = &aiov;
5783         auio.uio_iovcnt = 1;
5784         auio.uio_segflg = UIO_SYSSPACE;
5785         auio.uio_td = td;
5786         auio.uio_rw = UIO_READ;
5787         auio.uio_offset = 0;
5788
5789         size_t plen = strlen(attrprefix);
5790
5791         do {
5792                 aiov.iov_base = (void *)dirbuf;
5793                 aiov.iov_len = sizeof (dirbuf);
5794                 auio.uio_resid = sizeof (dirbuf);
5795                 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5796                 if (error != 0)
5797                         break;
5798                 int done = sizeof (dirbuf) - auio.uio_resid;
5799                 for (int pos = 0; pos < done; ) {
5800                         struct dirent *dp = (struct dirent *)(dirbuf + pos);
5801                         pos += dp->d_reclen;
5802                         /*
5803                          * XXX: Temporarily we also accept DT_UNKNOWN, as this
5804                          * is what we get when attribute was created on Solaris.
5805                          */
5806                         if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5807                                 continue;
5808                         else if (plen == 0 &&
5809                             ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
5810                                 continue;
5811                         else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5812                                 continue;
5813                         uint8_t nlen = dp->d_namlen - plen;
5814                         if (ap->a_size != NULL) {
5815                                 *ap->a_size += 1 + nlen;
5816                         } else if (ap->a_uio != NULL) {
5817                                 /*
5818                                  * Format of extattr name entry is one byte for
5819                                  * length and the rest for name.
5820                                  */
5821                                 error = uiomove(&nlen, 1, ap->a_uio);
5822                                 if (error == 0) {
5823                                         char *namep = dp->d_name + plen;
5824                                         error = uiomove(namep, nlen, ap->a_uio);
5825                                 }
5826                                 if (error != 0) {
5827                                         error = SET_ERROR(error);
5828                                         break;
5829                                 }
5830                         }
5831                 }
5832         } while (!eof && error == 0);
5833
5834         vput(vp);
5835         return (error);
5836 }
5837
5838 static int
5839 zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
5840 {
5841         znode_t *zp = VTOZ(ap->a_vp);
5842         int error;
5843
5844         error = zfs_ensure_xattr_cached(zp);
5845         if (error != 0)
5846                 return (error);
5847
5848         ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
5849         ASSERT3P(zp->z_xattr_cached, !=, NULL);
5850
5851         size_t plen = strlen(attrprefix);
5852         nvpair_t *nvp = NULL;
5853         while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
5854                 ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
5855
5856                 const char *name = nvpair_name(nvp);
5857                 if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
5858                         continue;
5859                 else if (strncmp(name, attrprefix, plen) != 0)
5860                         continue;
5861                 uint8_t nlen = strlen(name) - plen;
5862                 if (ap->a_size != NULL) {
5863                         *ap->a_size += 1 + nlen;
5864                 } else if (ap->a_uio != NULL) {
5865                         /*
5866                          * Format of extattr name entry is one byte for
5867                          * length and the rest for name.
5868                          */
5869                         error = uiomove(&nlen, 1, ap->a_uio);
5870                         if (error == 0) {
5871                                 char *namep = __DECONST(char *, name) + plen;
5872                                 error = uiomove(namep, nlen, ap->a_uio);
5873                         }
5874                         if (error != 0) {
5875                                 error = SET_ERROR(error);
5876                                 break;
5877                         }
5878                 }
5879         }
5880
5881         return (error);
5882 }
5883
5884 static int
5885 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
5886 {
5887         znode_t *zp = VTOZ(ap->a_vp);
5888         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5889         char attrprefix[16];
5890         int error;
5891
5892         error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5893             sizeof (attrprefix), compat);
5894         if (error != 0)
5895                 return (error);
5896
5897         if (zfsvfs->z_use_sa && zp->z_is_sa)
5898                 error = zfs_listextattr_sa(ap, attrprefix);
5899         if (error == 0)
5900                 error = zfs_listextattr_dir(ap, attrprefix);
5901         return (error);
5902 }
5903
5904 /*
5905  * Vnode operation to retrieve extended attributes on a vnode.
5906  */
5907 static int
5908 zfs_listextattr(struct vop_listextattr_args *ap)
5909 {
5910         znode_t *zp = VTOZ(ap->a_vp);
5911         zfsvfs_t *zfsvfs = ZTOZSB(zp);
5912         int error;
5913
5914         if (ap->a_size != NULL)
5915                 *ap->a_size = 0;
5916
5917         /*
5918          * If the xattr property is off, refuse the request.
5919          */
5920         if (!(zfsvfs->z_flags & ZSB_XATTR))
5921                 return (SET_ERROR(EOPNOTSUPP));
5922
5923         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5924             ap->a_cred, ap->a_td, VREAD);
5925         if (error != 0)
5926                 return (SET_ERROR(error));
5927
5928         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
5929                 return (error);
5930         rw_enter(&zp->z_xattr_lock, RW_READER);
5931
5932         error = zfs_listextattr_impl(ap, zfs_xattr_compat);
5933         if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
5934                 /* Also list user xattrs with the alternate format. */
5935                 error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
5936         }
5937
5938         rw_exit(&zp->z_xattr_lock);
5939         zfs_exit(zfsvfs, FTAG);
5940         return (error);
5941 }
5942
/*
 * Reference sketch of the VOP_GETACL() argument structure.  It is only
 * compiled when _SYS_SYSPROTO_H_ is not defined.
 * NOTE(review): zfs_freebsd_getacl() reads these members with an "a_"
 * prefix (a_vp, a_type, a_aclp, a_cred), so the member names below do not
 * match what the code uses -- confirm against the system definition.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_getacl_args {
        struct vnode *vp;
        acl_type_t type;
        struct acl *aclp;
        struct ucred *cred;
        struct thread *td;
};
#endif
5952
5953 static int
5954 zfs_freebsd_getacl(struct vop_getacl_args *ap)
5955 {
5956         int             error;
5957         vsecattr_t      vsecattr;
5958
5959         if (ap->a_type != ACL_TYPE_NFS4)
5960                 return (EINVAL);
5961
5962         vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5963         if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
5964             &vsecattr, 0, ap->a_cred)))
5965                 return (error);
5966
5967         error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
5968             vsecattr.vsa_aclcnt);
5969         if (vsecattr.vsa_aclentp != NULL)
5970                 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5971
5972         return (error);
5973 }
5974
/*
 * Reference sketch of the VOP_SETACL() argument structure.  It is only
 * compiled when _SYS_SYSPROTO_H_ is not defined.
 * NOTE(review): zfs_freebsd_setacl() reads these members with an "a_"
 * prefix (a_vp, a_type, a_aclp, a_cred), so the member names below do not
 * match what the code uses -- confirm against the system definition.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_setacl_args {
        struct vnode *vp;
        acl_type_t type;
        struct acl *aclp;
        struct ucred *cred;
        struct thread *td;
};
#endif
5984
5985 static int
5986 zfs_freebsd_setacl(struct vop_setacl_args *ap)
5987 {
5988         int             error;
5989         vsecattr_t vsecattr;
5990         int             aclbsize;       /* size of acl list in bytes */
5991         aclent_t        *aaclp;
5992
5993         if (ap->a_type != ACL_TYPE_NFS4)
5994                 return (EINVAL);
5995
5996         if (ap->a_aclp == NULL)
5997                 return (EINVAL);
5998
5999         if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6000                 return (EINVAL);
6001
6002         /*
6003          * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6004          * splitting every entry into two and appending "canonical six"
6005          * entries at the end.  Don't allow for setting an ACL that would
6006          * cause chmod(2) to run out of ACL entries.
6007          */
6008         if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6009                 return (ENOSPC);
6010
6011         error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6012         if (error != 0)
6013                 return (error);
6014
6015         vsecattr.vsa_mask = VSA_ACE;
6016         aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
6017         vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6018         aaclp = vsecattr.vsa_aclentp;
6019         vsecattr.vsa_aclentsz = aclbsize;
6020
6021         aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6022         error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
6023         kmem_free(aaclp, aclbsize);
6024
6025         return (error);
6026 }
6027
/*
 * Reference sketch of the VOP_ACLCHECK() argument structure.  It is only
 * compiled when _SYS_SYSPROTO_H_ is not defined.
 * NOTE(review): the sibling ACL handlers in this file use "a_"-prefixed
 * member names, so the names below may not match the system definition --
 * confirm.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_aclcheck_args {
        struct vnode *vp;
        acl_type_t type;
        struct acl *aclp;
        struct ucred *cred;
        struct thread *td;
};
#endif
6037
/*
 * VOP_ACLCHECK() entry point.  The arguments are not examined; every
 * request is answered with EOPNOTSUPP.
 */
static int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{
        (void) ap;
        return (EOPNOTSUPP);
}
6044
6045 static int
6046 zfs_vptocnp(struct vop_vptocnp_args *ap)
6047 {
6048         vnode_t *covered_vp;
6049         vnode_t *vp = ap->a_vp;
6050         zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
6051         znode_t *zp = VTOZ(vp);
6052         int ltype;
6053         int error;
6054
6055         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6056                 return (error);
6057
6058         /*
6059          * If we are a snapshot mounted under .zfs, run the operation
6060          * on the covered vnode.
6061          */
6062         if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
6063                 char name[MAXNAMLEN + 1];
6064                 znode_t *dzp;
6065                 size_t len;
6066
6067                 error = zfs_znode_parent_and_name(zp, &dzp, name,
6068                     sizeof (name));
6069                 if (error == 0) {
6070                         len = strlen(name);
6071                         if (*ap->a_buflen < len)
6072                                 error = SET_ERROR(ENOMEM);
6073                 }
6074                 if (error == 0) {
6075                         *ap->a_buflen -= len;
6076                         memcpy(ap->a_buf + *ap->a_buflen, name, len);
6077                         *ap->a_vpp = ZTOV(dzp);
6078                 }
6079                 zfs_exit(zfsvfs, FTAG);
6080                 return (error);
6081         }
6082         zfs_exit(zfsvfs, FTAG);
6083
6084         covered_vp = vp->v_mount->mnt_vnodecovered;
6085         enum vgetstate vs = vget_prep(covered_vp);
6086         ltype = VOP_ISLOCKED(vp);
6087         VOP_UNLOCK(vp);
6088         error = vget_finish(covered_vp, LK_SHARED, vs);
6089         if (error == 0) {
6090                 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
6091                     ap->a_buflen);
6092                 vput(covered_vp);
6093         }
6094         vn_lock(vp, ltype | LK_RETRY);
6095         if (VN_IS_DOOMED(vp))
6096                 error = SET_ERROR(ENOENT);
6097         return (error);
6098 }
6099
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE() entry point: free the byte range described by
 * *ap->a_offset and *ap->a_len.
 *
 * The range is clamped to the current file size.  If nothing is left after
 * clamping, *ap->a_len is set to 0 and the call succeeds without touching
 * the file.  On success *ap->a_offset is advanced past the freed range and
 * *ap->a_len is set to 0.  The ZIL is committed when the dataset uses
 * sync=always or the caller passed IO_SYNC.
 *
 * Returns 0 on success, EROFS on a read-only filesystem, or the error from
 * zfs_freesp().
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
        znode_t *zp = VTOZ(ap->a_vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        zilog_t *zilog;
        off_t start, nbytes, fsize;
        int error;

        error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
        if (error != 0)
                return (error);

        /*
         * Callers might not be able to detect properly that we are read-only,
         * so check it explicitly here.
         */
        if (zfs_is_readonly(zfsvfs)) {
                error = SET_ERROR(EROFS);
                goto out;
        }

        zilog = zfsvfs->z_log;
        start = *ap->a_offset;
        nbytes = *ap->a_len;
        fsize = zp->z_size;

        /* Never free past the end of the file. */
        if (start + nbytes > fsize)
                nbytes = fsize - start;

        /* Fast path for out-of-range request. */
        if (nbytes <= 0) {
                *ap->a_len = 0;
                goto out;
        }

        error = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
        if (error != 0)
                goto out;

        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
            (ap->a_ioflag & IO_SYNC) != 0)
                zil_commit(zilog, zp->z_id);
        *ap->a_offset = start + nbytes;
        *ap->a_len = 0;

out:
        zfs_exit(zfsvfs, FTAG);
        return (error);
}
#endif
6148
/*
 * Reference sketch of the VOP_COPY_FILE_RANGE() argument structure.  It is
 * only compiled when _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_copy_file_range_args {
        struct vnode *a_invp;           /* source vnode */
        off_t *a_inoffp;                /* in/out: source offset */
        struct vnode *a_outvp;          /* destination vnode */
        off_t *a_outoffp;               /* in/out: destination offset */
        size_t *a_lenp;                 /* in/out: byte count */
        unsigned int a_flags;
        struct ucred *a_incred;
        struct ucred *a_outcred;
        struct thread *a_fsizetd;
};
#endif
/*
 * VOP_COPY_FILE_RANGE() entry point.
 *
 * Tries to satisfy the copy with zfs_clone_range().  The request is handed
 * to vn_generic_copy_file_range() instead when zfs_bclone_enabled is off,
 * when the block cloning pool feature is not enabled, when the vnodes
 * cannot be locked or are doomed, or when zfs_clone_range() fails with
 * EXDEV, EAGAIN, EINVAL or EOPNOTSUPP.
 *
 * TODO: FreeBSD will only call file system-specific copy_file_range() if both
 * files reside under the same mountpoint. In case of ZFS we want to be called
 * even if the files are in different datasets (but on the same pool, which we
 * need to check ourselves).
 */
static int
zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
{
        zfsvfs_t *outzfsvfs;
        struct vnode *invp = ap->a_invp;
        struct vnode *outvp = ap->a_outvp;
        struct mount *mp;
        int error;
        uint64_t len = *ap->a_lenp;

        if (!zfs_bclone_enabled) {
                /* No write was started, so the fallback must not finish one. */
                mp = NULL;
                goto bad_write_fallback;
        }

        /*
         * TODO: If offset/length is not aligned to recordsize, use
         * vn_generic_copy_file_range() on this fragment.
         * It would be better to do this after we lock the vnodes, but then we
         * need something else than vn_generic_copy_file_range().
         */

        vn_start_write(outvp, &mp, V_WAIT);
        if (__predict_true(mp == outvp->v_mount)) {
                outzfsvfs = (zfsvfs_t *)mp->mnt_data;
                /* Cloning needs the pool feature; otherwise copy generically. */
                if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
                    SPA_FEATURE_BLOCK_CLONING)) {
                        goto bad_write_fallback;
                }
        }
        if (invp == outvp) {
                /* Same vnode on both sides: a single exclusive lock suffices. */
                if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
                        goto bad_write_fallback;
                }
        } else {
                /* vn_lock_pair() takes a different argument list by version. */
#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
        __FreeBSD_version >= 1400086
                vn_lock_pair(invp, false, LK_SHARED, outvp, false,
                    LK_EXCLUSIVE);
#else
                vn_lock_pair(invp, false, outvp, false);
#endif
                if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
                        goto bad_locked_fallback;
                }
        }

#ifdef MAC
        /* A MAC denial is returned to the caller; it is not a fallback case. */
        error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
            outvp);
        if (error != 0)
                goto out_locked;
#endif

        error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
            ap->a_outoffp, &len, ap->a_outcred);
        /* These errors select the generic copy instead of failing the call. */
        if (error == EXDEV || error == EAGAIN || error == EINVAL ||
            error == EOPNOTSUPP)
                goto bad_locked_fallback;
        /* Report the length left in len by zfs_clone_range() to the caller. */
        *ap->a_lenp = (size_t)len;
#ifdef MAC
out_locked:
#endif
        /* Normal exit: drop the vnode locks and finish the write. */
        if (invp != outvp)
                VOP_UNLOCK(invp);
        VOP_UNLOCK(outvp);
        if (mp != NULL)
                vn_finished_write(mp);
        return (error);

bad_locked_fallback:
        /* Fallback with vnodes locked: unlock them before the generic copy. */
        if (invp != outvp)
                VOP_UNLOCK(invp);
        VOP_UNLOCK(outvp);
bad_write_fallback:
        /* Fallback with at most a started write: finish it, then copy. */
        if (mp != NULL)
                vn_finished_write(mp);
        error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
            ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
            ap->a_incred, ap->a_outcred, ap->a_fsizetd);
        return (error);
}
6250
/*
 * Declarations of the vnode operation vectors that are defined and
 * registered below.
 */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
6254
6255 struct vop_vector zfs_vnodeops = {
6256         .vop_default =          &default_vnodeops,
6257         .vop_inactive =         zfs_freebsd_inactive,
6258         .vop_need_inactive =    zfs_freebsd_need_inactive,
6259         .vop_reclaim =          zfs_freebsd_reclaim,
6260         .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
6261         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6262         .vop_access =           zfs_freebsd_access,
6263         .vop_allocate =         VOP_EINVAL,
6264 #if __FreeBSD_version >= 1400032
6265         .vop_deallocate =       zfs_deallocate,
6266 #endif
6267         .vop_lookup =           zfs_cache_lookup,
6268         .vop_cachedlookup =     zfs_freebsd_cachedlookup,
6269         .vop_getattr =          zfs_freebsd_getattr,
6270         .vop_setattr =          zfs_freebsd_setattr,
6271         .vop_create =           zfs_freebsd_create,
6272         .vop_mknod =            (vop_mknod_t *)zfs_freebsd_create,
6273         .vop_mkdir =            zfs_freebsd_mkdir,
6274         .vop_readdir =          zfs_freebsd_readdir,
6275         .vop_fsync =            zfs_freebsd_fsync,
6276         .vop_open =             zfs_freebsd_open,
6277         .vop_close =            zfs_freebsd_close,
6278         .vop_rmdir =            zfs_freebsd_rmdir,
6279         .vop_ioctl =            zfs_freebsd_ioctl,
6280         .vop_link =             zfs_freebsd_link,
6281         .vop_symlink =          zfs_freebsd_symlink,
6282         .vop_readlink =         zfs_freebsd_readlink,
6283         .vop_read =             zfs_freebsd_read,
6284         .vop_write =            zfs_freebsd_write,
6285         .vop_remove =           zfs_freebsd_remove,
6286         .vop_rename =           zfs_freebsd_rename,
6287         .vop_pathconf =         zfs_freebsd_pathconf,
6288         .vop_bmap =             zfs_freebsd_bmap,
6289         .vop_fid =              zfs_freebsd_fid,
6290         .vop_getextattr =       zfs_getextattr,
6291         .vop_deleteextattr =    zfs_deleteextattr,
6292         .vop_setextattr =       zfs_setextattr,
6293         .vop_listextattr =      zfs_listextattr,
6294         .vop_getacl =           zfs_freebsd_getacl,
6295         .vop_setacl =           zfs_freebsd_setacl,
6296         .vop_aclcheck =         zfs_freebsd_aclcheck,
6297         .vop_getpages =         zfs_freebsd_getpages,
6298         .vop_putpages =         zfs_freebsd_putpages,
6299         .vop_vptocnp =          zfs_vptocnp,
6300         .vop_lock1 =            vop_lock,
6301         .vop_unlock =           vop_unlock,
6302         .vop_islocked =         vop_islocked,
6303 #if __FreeBSD_version >= 1400043
6304         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6305 #endif
6306         .vop_copy_file_range =  zfs_freebsd_copy_file_range,
6307 };
6308 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
6309
6310 struct vop_vector zfs_fifoops = {
6311         .vop_default =          &fifo_specops,
6312         .vop_fsync =            zfs_freebsd_fsync,
6313         .vop_fplookup_vexec =   zfs_freebsd_fplookup_vexec,
6314         .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
6315         .vop_access =           zfs_freebsd_access,
6316         .vop_getattr =          zfs_freebsd_getattr,
6317         .vop_inactive =         zfs_freebsd_inactive,
6318         .vop_read =             VOP_PANIC,
6319         .vop_reclaim =          zfs_freebsd_reclaim,
6320         .vop_setattr =          zfs_freebsd_setattr,
6321         .vop_write =            VOP_PANIC,
6322         .vop_pathconf =         zfs_freebsd_pathconf,
6323         .vop_fid =              zfs_freebsd_fid,
6324         .vop_getacl =           zfs_freebsd_getacl,
6325         .vop_setacl =           zfs_freebsd_setacl,
6326         .vop_aclcheck =         zfs_freebsd_aclcheck,
6327 #if __FreeBSD_version >= 1400043
6328         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6329 #endif
6330 };
6331 VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
6332
6333 /*
6334  * special share hidden files vnode operations template
6335  */
6336 struct vop_vector zfs_shareops = {
6337         .vop_default =          &default_vnodeops,
6338         .vop_fplookup_vexec =   VOP_EAGAIN,
6339         .vop_fplookup_symlink = VOP_EAGAIN,
6340         .vop_access =           zfs_freebsd_access,
6341         .vop_inactive =         zfs_freebsd_inactive,
6342         .vop_reclaim =          zfs_freebsd_reclaim,
6343         .vop_fid =              zfs_freebsd_fid,
6344         .vop_pathconf =         zfs_freebsd_pathconf,
6345 #if __FreeBSD_version >= 1400043
6346         .vop_add_writecount =   vop_stdadd_writecount_nomsync,
6347 #endif
6348 };
6349 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
6350
/*
 * Read-write integer module parameter for zfs_xattr_compat.  In this file
 * zfs_listextattr() uses it to pick which xattr naming format is listed
 * first for a namespace.
 */
ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
        "Use legacy ZFS xattr naming for writing new user namespace xattrs");