module/zfs/zfs_vnops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.
  27  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  28  */
  29
  30 /* Portions Copyright 2007 Jeremy Teo */
  31 /* Portions Copyright 2010 Robert Milkowski */
  32
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/file.h>
  39 #include <sys/stat.h>
  40 #include <sys/kmem.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/errno.h>
  43 #include <sys/zfs_dir.h>
  44 #include <sys/zfs_acl.h>
  45 #include <sys/zfs_ioctl.h>
  46 #include <sys/fs/zfs.h>
  47 #include <sys/dmu.h>
  48 #include <sys/dmu_objset.h>
  49 #include <sys/dsl_crypt.h>
  50 #include <sys/spa.h>
  51 #include <sys/txg.h>
  52 #include <sys/dbuf.h>
  53 #include <sys/policy.h>
  54 #include <sys/zfeature.h>
  55 #include <sys/zfs_vnops.h>
  56 #include <sys/zfs_quota.h>
  57 #include <sys/zfs_vfsops.h>
  58 #include <sys/zfs_znode.h>
  59
  60 /*
  61  * Enable the experimental block cloning feature.  If this setting is 0, then
  62  * even if feature@block_cloning is enabled, attempts to clone blocks will act
  63  * as though the feature is disabled.
  64  */
  65 int zfs_bclone_enabled = 1;
  66
  67 /*
  68  * When set zfs_clone_range() waits for dirty data to be written to disk.
  69  * This allows the clone operation to reliably succeed when a file is modified
  70  * and then immediately cloned. For small files this may be slower than making
  71  * a copy of the file and is therefore not the default.  However, in certain
  72  * scenarios this behavior may be desirable so a tunable is provided.
  73  */
  74 static int zfs_bclone_wait_dirty = 0;
  75
  76 /*
  77  * Enable Direct I/O. If this setting is 0, then all I/O requests will be
  78  * directed through the ARC acting as though the dataset property direct was
  79  * set to disabled.
  80  */
  81 static int zfs_dio_enabled = 0;
  82
  83
  84 /*
  85  * Maximum bytes to read per chunk in zfs_read().
  86  */
  87 static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
  88
  89 int
  90 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
  91 {
  92         int error = 0;
  93         zfsvfs_t *zfsvfs = ZTOZSB(zp);
  94
  95         if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
  96                 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
  97                         return (error);
  98                 atomic_inc_32(&zp->z_sync_writes_cnt);
  99                 zil_commit(zfsvfs->z_log, zp->z_id);
 100                 atomic_dec_32(&zp->z_sync_writes_cnt);
 101                 zfs_exit(zfsvfs, FTAG);
 102         }
 103         return (error);
 104 }
 105
 106
 107 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 108 /*
 109  * Lseek support for finding holes (cmd == SEEK_HOLE) and
 110  * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 111  */
 112 static int
 113 zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
 114 {
 115         zfs_locked_range_t *lr;
 116         uint64_t noff = (uint64_t)*off; /* new offset */
 117         uint64_t file_sz;
 118         int error;
 119         boolean_t hole;
 120
 121         file_sz = zp->z_size;
 122         if (noff >= file_sz)  {
 123                 return (SET_ERROR(ENXIO));
 124         }
 125
 126         if (cmd == F_SEEK_HOLE)
 127                 hole = B_TRUE;
 128         else
 129                 hole = B_FALSE;
 130
 131         /* Flush any mmap()'d data to disk */
 132         if (zn_has_cached_data(zp, 0, file_sz - 1))
 133                 zn_flush_cached_data(zp, B_TRUE);
 134
 135         lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
 136         error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
 137         zfs_rangelock_exit(lr);
 138
 139         if (error == ESRCH)
 140                 return (SET_ERROR(ENXIO));
 141
 142         /* File was dirty, so fall back to using generic logic */
 143         if (error == EBUSY) {
 144                 if (hole)
 145                         *off = file_sz;
 146
 147                 return (0);
 148         }
 149
 150         /*
 151          * We could find a hole that begins after the logical end-of-file,
 152          * because dmu_offset_next() only works on whole blocks.  If the
 153          * EOF falls mid-block, then indicate that the "virtual hole"
 154          * at the end of the file begins at the logical EOF, rather than
 155          * at the end of the last block.
 156          */
 157         if (noff > file_sz) {
 158                 ASSERT(hole);
 159                 noff = file_sz;
 160         }
 161
 162         if (noff < *off)
 163                 return (error);
 164         *off = noff;
 165         return (error);
 166 }
 167
 168 int
 169 zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
 170 {
 171         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 172         int error;
 173
 174         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 175                 return (error);
 176
 177         error = zfs_holey_common(zp, cmd, off);
 178
 179         zfs_exit(zfsvfs, FTAG);
 180         return (error);
 181 }
 182 #endif /* SEEK_HOLE && SEEK_DATA */
 183
 184 int
 185 zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
 186 {
 187         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 188         int error;
 189
 190         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 191                 return (error);
 192
 193         if (flag & V_ACE_MASK)
 194 #if defined(__linux__)
 195                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
 196                     zfs_init_idmap);
 197 #else
 198                 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
 199                     NULL);
 200 #endif
 201         else
 202 #if defined(__linux__)
 203                 error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
 204 #else
 205                 error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
 206 #endif
 207
 208         zfs_exit(zfsvfs, FTAG);
 209         return (error);
 210 }
 211
 212 /*
 213  * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
 214  * the "direct" dataset property). When inherited by the property only apply
 215  * the O_DIRECT flag to correctly aligned IO requests. The rational for this
 216  * is it allows the property to be safely set on a dataset without forcing
 217  * all of the applications to be aware of the alignment restrictions. When
 218  * O_DIRECT is explicitly requested by an application return EINVAL if the
 219  * request is unaligned.  In all cases, if the range for this request has
 220  * been mmap'ed then we will perform buffered I/O to keep the mapped region
 221  * synhronized with the ARC.
 222  *
 223  * It is possible that a file's pages could be mmap'ed after it is checked
 224  * here. If so, that is handled coorarding in zfs_write(). See comments in the
 225  * following area for how this is handled:
 226  * zfs_write() -> update_pages()
 227  */
 228 static int
 229 zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
 230     int *ioflagp)
 231 {
 232         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 233         objset_t *os = zfsvfs->z_os;
 234         int ioflag = *ioflagp;
 235         int error = 0;
 236
 237         if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
 238             zn_has_cached_data(zp, zfs_uio_offset(uio),
 239             zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
 240                 /*
 241                  * Direct I/O is disabled or the region is mmap'ed. In either
 242                  * case the I/O request will just directed through the ARC.
 243                  */
 244                 ioflag &= ~O_DIRECT;
 245                 goto out;
 246         } else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
 247             zfs_uio_page_aligned(uio) &&
 248             zfs_uio_aligned(uio, PAGE_SIZE)) {
 249                 if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
 250                     (rw == UIO_READ)) {
 251                         ioflag |= O_DIRECT;
 252                 }
 253         } else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
 254                 /*
 255                  * Direct I/O was requested through the direct=always, but it
 256                  * is not properly PAGE_SIZE aligned. The request will be
 257                  * directed through the ARC.
 258                  */
 259                 ioflag &= ~O_DIRECT;
 260         }
 261
 262         if (ioflag & O_DIRECT) {
 263                 if (!zfs_uio_page_aligned(uio) ||
 264                     !zfs_uio_aligned(uio, PAGE_SIZE)) {
 265                         error = SET_ERROR(EINVAL);
 266                         goto out;
 267                 }
 268
 269                 error = zfs_uio_get_dio_pages_alloc(uio, rw);
 270                 if (error) {
 271                         goto out;
 272                 }
 273         }
 274
 275         IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
 276         ASSERT0(error);
 277
 278 out:
 279         *ioflagp = ioflag;
 280         return (error);
 281 }
 282
 283 /*
 284  * Read bytes from specified file into supplied buffer.
 285  *
 286  *      IN:     zp      - inode of file to be read from.
 287  *              uio     - structure supplying read location, range info,
 288  *                        and return buffer.
 289  *              ioflag  - O_SYNC flags; used to provide FRSYNC semantics.
 290  *                        O_DIRECT flag; used to bypass page cache.
 291  *              cr      - credentials of caller.
 292  *
 293  *      OUT:    uio     - updated offset and range, buffer filled.
 294  *
 295  *      RETURN: 0 on success, error code on failure.
 296  *
 297  * Side Effects:
 298  *      inode - atime updated if byte count > 0
 299  */
 300 int
 301 zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 302 {
 303         (void) cr;
 304         int error = 0;
 305         boolean_t frsync = B_FALSE;
 306         boolean_t dio_checksum_failure = B_FALSE;
 307
 308         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 309         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 310                 return (error);
 311
 312         if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 313                 zfs_exit(zfsvfs, FTAG);
 314                 return (SET_ERROR(EACCES));
 315         }
 316
 317         /* We don't copy out anything useful for directories. */
 318         if (Z_ISDIR(ZTOTYPE(zp))) {
 319                 zfs_exit(zfsvfs, FTAG);
 320                 return (SET_ERROR(EISDIR));
 321         }
 322
 323         /*
 324          * Validate file offset
 325          */
 326         if (zfs_uio_offset(uio) < (offset_t)0) {
 327                 zfs_exit(zfsvfs, FTAG);
 328                 return (SET_ERROR(EINVAL));
 329         }
 330
 331         /*
 332          * Fasttrack empty reads
 333          */
 334         if (zfs_uio_resid(uio) == 0) {
 335                 zfs_exit(zfsvfs, FTAG);
 336                 return (0);
 337         }
 338
 339 #ifdef FRSYNC
 340         /*
 341          * If we're in FRSYNC mode, sync out this znode before reading it.
 342          * Only do this for non-snapshots.
 343          *
 344          * Some platforms do not support FRSYNC and instead map it
 345          * to O_SYNC, which results in unnecessary calls to zil_commit. We
 346          * only honor FRSYNC requests on platforms which support it.
 347          */
 348         frsync = !!(ioflag & FRSYNC);
 349 #endif
 350         if (zfsvfs->z_log &&
 351             (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 352                 zil_commit(zfsvfs->z_log, zp->z_id);
 353
 354         /*
 355          * Lock the range against changes.
 356          */
 357         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 358             zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);
 359
 360         /*
 361          * If we are reading past end-of-file we can skip
 362          * to the end; but we might still need to set atime.
 363          */
 364         if (zfs_uio_offset(uio) >= zp->z_size) {
 365                 error = 0;
 366                 goto out;
 367         }
 368         ASSERT(zfs_uio_offset(uio) < zp->z_size);
 369
 370         /*
 371          * Setting up Direct I/O if requested.
 372          */
 373         error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
 374         if (error) {
 375                 goto out;
 376         }
 377
 378 #if defined(__linux__)
 379         ssize_t start_offset = zfs_uio_offset(uio);
 380 #endif
 381         ssize_t chunk_size = zfs_vnops_read_chunk_size;
 382         ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
 383         ssize_t start_resid = n;
 384         ssize_t dio_remaining_resid = 0;
 385
 386         if (uio->uio_extflg & UIO_DIRECT) {
 387                 /*
 388                  * All pages for an O_DIRECT request ahve already been mapped
 389                  * so there's no compelling reason to handle this uio in
 390                  * smaller chunks.
 391                  */
 392                 chunk_size = DMU_MAX_ACCESS;
 393
 394                 /*
 395                  * In the event that the O_DIRECT request is reading the entire
 396                  * file, it is possible file's length is not page sized
 397                  * aligned. However, lower layers expect that the Direct I/O
 398                  * request is page-aligned. In this case, as much of the file
 399                  * that can be read using Direct I/O happens and the remaining
 400                  * amount will be read through the ARC.
 401                  *
 402                  * This is still consistent with the semantics of Direct I/O in
 403                  * ZFS as at a minimum the I/O request must be page-aligned.
 404                  */
 405                 dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
 406                 if (dio_remaining_resid != 0)
 407                         n -= dio_remaining_resid;
 408         }
 409
 410         while (n > 0) {
 411                 ssize_t nbytes = MIN(n, chunk_size -
 412                     P2PHASE(zfs_uio_offset(uio), chunk_size));
 413 #ifdef UIO_NOCOPY
 414                 if (zfs_uio_segflg(uio) == UIO_NOCOPY)
 415                         error = mappedread_sf(zp, nbytes, uio);
 416                 else
 417 #endif
 418                 if (zn_has_cached_data(zp, zfs_uio_offset(uio),
 419                     zfs_uio_offset(uio) + nbytes - 1)) {
 420                         error = mappedread(zp, nbytes, uio);
 421                 } else {
 422                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 423                             uio, nbytes);
 424                 }
 425
 426                 if (error) {
 427                         /* convert checksum errors into IO errors */
 428                         if (error == ECKSUM) {
 429                                 /*
 430                                  * If a Direct I/O read returned a checksum
 431                                  * verify error, then it must be treated as
 432                                  * suspicious. The contents of the buffer could
 433                                  * have beeen manipulated while the I/O was in
 434                                  * flight. In this case, the remainder of I/O
 435                                  * request will just be reissued through the
 436                                  * ARC.
 437                                  */
 438                                 if (uio->uio_extflg & UIO_DIRECT) {
 439                                         dio_checksum_failure = B_TRUE;
 440                                         uio->uio_extflg &= ~UIO_DIRECT;
 441                                         n += dio_remaining_resid;
 442                                         dio_remaining_resid = 0;
 443                                         continue;
 444                                 } else {
 445                                         error = SET_ERROR(EIO);
 446                                 }
 447                         }
 448
 449 #if defined(__linux__)
 450                         /*
 451                          * if we actually read some bytes, bubbling EFAULT
 452                          * up to become EAGAIN isn't what we want here...
 453                          *
 454                          * ...on Linux, at least. On FBSD, doing this breaks.
 455                          */
 456                         if (error == EFAULT &&
 457                             (zfs_uio_offset(uio) - start_offset) != 0)
 458                                 error = 0;
 459 #endif
 460                         break;
 461                 }
 462
 463                 n -= nbytes;
 464         }
 465
 466         if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
 467             dio_remaining_resid != 0) {
 468                 /*
 469                  * Temporarily remove the UIO_DIRECT flag from the UIO so the
 470                  * remainder of the file can be read using the ARC.
 471                  */
 472                 uio->uio_extflg &= ~UIO_DIRECT;
 473
 474                 if (zn_has_cached_data(zp, zfs_uio_offset(uio),
 475                     zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
 476                         error = mappedread(zp, dio_remaining_resid, uio);
 477                 } else {
 478                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
 479                             dio_remaining_resid);
 480                 }
 481                 uio->uio_extflg |= UIO_DIRECT;
 482
 483                 if (error != 0)
 484                         n += dio_remaining_resid;
 485         } else if (error && (uio->uio_extflg & UIO_DIRECT)) {
 486                 n += dio_remaining_resid;
 487         }
 488         int64_t nread = start_resid - n;
 489
 490         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
 491 out:
 492         zfs_rangelock_exit(lr);
 493
 494         if (dio_checksum_failure == B_TRUE)
 495                 uio->uio_extflg |= UIO_DIRECT;
 496
 497         /*
 498          * Cleanup for Direct I/O if requested.
 499          */
 500         if (uio->uio_extflg & UIO_DIRECT)
 501                 zfs_uio_free_dio_pages(uio, UIO_READ);
 502
 503         ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 504         zfs_exit(zfsvfs, FTAG);
 505         return (error);
 506 }
 507
 508 static void
 509 zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
 510     uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
 511 {
 512         zilog_t *zilog = zfsvfs->z_log;
 513         const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
 514
 515         ASSERT(clear_setid_bits_txgp != NULL);
 516         ASSERT(tx != NULL);
 517
 518         /*
 519          * Clear Set-UID/Set-GID bits on successful write if not
 520          * privileged and at least one of the execute bits is set.
 521          *
 522          * It would be nice to do this after all writes have
 523          * been done, but that would still expose the ISUID/ISGID
 524          * to another app after the partial write is committed.
 525          *
 526          * Note: we don't call zfs_fuid_map_id() here because
 527          * user 0 is not an ephemeral uid.
 528          */
 529         mutex_enter(&zp->z_acl_lock);
 530         if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
 531             (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 532             secpolicy_vnode_setid_retain(zp, cr,
 533             ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
 534                 uint64_t newmode;
 535
 536                 zp->z_mode &= ~(S_ISUID | S_ISGID);
 537                 newmode = zp->z_mode;
 538                 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 539                     (void *)&newmode, sizeof (uint64_t), tx);
 540
 541                 mutex_exit(&zp->z_acl_lock);
 542
 543                 /*
 544                  * Make sure SUID/SGID bits will be removed when we replay the
 545                  * log. If the setid bits are keep coming back, don't log more
 546                  * than one TX_SETATTR per transaction group.
 547                  */
 548                 if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
 549                         vattr_t va = {0};
 550
 551                         va.va_mask = ATTR_MODE;
 552                         va.va_nodeid = zp->z_id;
 553                         va.va_mode = newmode;
 554                         zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
 555                             ATTR_MODE, NULL);
 556                         *clear_setid_bits_txgp = dmu_tx_get_txg(tx);
 557                 }
 558         } else {
 559                 mutex_exit(&zp->z_acl_lock);
 560         }
 561 }
 562
 563 /*
 564  * Write the bytes to a file.
 565  *
 566  *      IN:     zp      - znode of file to be written to.
 567  *              uio     - structure supplying write location, range info,
 568  *                        and data buffer.
 569  *              ioflag  - O_APPEND flag set if in append mode.
 570  *                        O_DIRECT flag; used to bypass page cache.
 571  *              cr      - credentials of caller.
 572  *
 573  *      OUT:    uio     - updated offset and range.
 574  *
 575  *      RETURN: 0 if success
 576  *              error code if failure
 577  *
 578  * Timestamps:
 579  *      ip - ctime|mtime updated if byte count > 0
 580  */
 581 int
 582 zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 583 {
 584         int error = 0, error1;
 585         ssize_t start_resid = zfs_uio_resid(uio);
 586         uint64_t clear_setid_bits_txg = 0;
 587         boolean_t o_direct_defer = B_FALSE;
 588
 589         /*
 590          * Fasttrack empty write
 591          */
 592         ssize_t n = start_resid;
 593         if (n == 0)
 594                 return (0);
 595
 596         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 597         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 598                 return (error);
 599
 600         sa_bulk_attr_t bulk[4];
 601         int count = 0;
 602         uint64_t mtime[2], ctime[2];
 603         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 604         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 605         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 606             &zp->z_size, 8);
 607         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 608             &zp->z_pflags, 8);
 609
 610         /*
 611          * Callers might not be able to detect properly that we are read-only,
 612          * so check it explicitly here.
 613          */
 614         if (zfs_is_readonly(zfsvfs)) {
 615                 zfs_exit(zfsvfs, FTAG);
 616                 return (SET_ERROR(EROFS));
 617         }
 618
 619         /*
 620          * If immutable or not appending then return EPERM.
 621          * Intentionally allow ZFS_READONLY through here.
 622          * See zfs_zaccess_common()
 623          */
 624         if ((zp->z_pflags & ZFS_IMMUTABLE) ||
 625             ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
 626             (zfs_uio_offset(uio) < zp->z_size))) {
 627                 zfs_exit(zfsvfs, FTAG);
 628                 return (SET_ERROR(EPERM));
 629         }
 630
 631         /*
 632          * Validate file offset
 633          */
 634         offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
 635         if (woff < 0) {
 636                 zfs_exit(zfsvfs, FTAG);
 637                 return (SET_ERROR(EINVAL));
 638         }
 639
 640         /*
 641          * Setting up Direct I/O if requested.
 642          */
 643         error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
 644         if (error) {
 645                 zfs_exit(zfsvfs, FTAG);
 646                 return (SET_ERROR(error));
 647         }
 648
 649         /*
 650          * Pre-fault the pages to ensure slow (eg NFS) pages
 651          * don't hold up txg.
 652          */
 653         ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
 654         if (zfs_uio_prefaultpages(pfbytes, uio)) {
 655                 zfs_exit(zfsvfs, FTAG);
 656                 return (SET_ERROR(EFAULT));
 657         }
 658
 659         /*
 660          * If in append mode, set the io offset pointer to eof.
 661          */
 662         zfs_locked_range_t *lr;
 663         if (ioflag & O_APPEND) {
 664                 /*
 665                  * Obtain an appending range lock to guarantee file append
 666                  * semantics.  We reset the write offset once we have the lock.
 667                  */
 668                 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
 669                 woff = lr->lr_offset;
 670                 if (lr->lr_length == UINT64_MAX) {
 671                         /*
 672                          * We overlocked the file because this write will cause
 673                          * the file block size to increase.
 674                          * Note that zp_size cannot change with this lock held.
 675                          */
 676                         woff = zp->z_size;
 677                 }
 678                 zfs_uio_setoffset(uio, woff);
 679                 /*
 680                  * We need to update the starting offset as well because it is
 681                  * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
 682                  * layers.
 683                  */
 684                 zfs_uio_setsoffset(uio, woff);
 685         } else {
 686                 /*
 687                  * Note that if the file block size will change as a result of
 688                  * this write, then this range lock will lock the entire file
 689                  * so that we can re-write the block safely.
 690                  */
 691                 lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 692         }
 693
 694         if (zn_rlimit_fsize_uio(zp, uio)) {
 695                 zfs_rangelock_exit(lr);
 696                 zfs_exit(zfsvfs, FTAG);
 697                 return (SET_ERROR(EFBIG));
 698         }
 699
 700         const rlim64_t limit = MAXOFFSET_T;
 701
 702         if (woff >= limit) {
 703                 zfs_rangelock_exit(lr);
 704                 zfs_exit(zfsvfs, FTAG);
 705                 return (SET_ERROR(EFBIG));
 706         }
 707
 708         if (n > limit - woff)
 709                 n = limit - woff;
 710
 711         uint64_t end_size = MAX(zp->z_size, woff + n);
 712         zilog_t *zilog = zfsvfs->z_log;
 713         boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
 714             (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);
 715
 716         const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
 717         const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
 718         const uint64_t projid = zp->z_projid;
 719
 720         /*
 721          * In the event we are increasing the file block size
 722          * (lr_length == UINT64_MAX), we will direct the write to the ARC.
 723          * Because zfs_grow_blocksize() will read from the ARC in order to
 724          * grow the dbuf, we avoid doing Direct I/O here as that would cause
 725          * data written to disk to be overwritten by data in the ARC during
 726          * the sync phase. Besides writing data twice to disk, we also
 727          * want to avoid consistency concerns between data in the the ARC and
 728          * on disk while growing the file's blocksize.
 729          *
 730          * We will only temporarily remove Direct I/O and put it back after
 731          * we have grown the blocksize. We do this in the event a request
 732          * is larger than max_blksz, so further requests to
 733          * dmu_write_uio_dbuf() will still issue the requests using Direct
 734          * IO.
 735          *
 736          * As an example:
 737          * The first block to file is being written as a 4k request with
 738          * a recorsize of 1K. The first 1K issued in the loop below will go
 739          * through the ARC; however, the following 3 1K requests will
 740          * use Direct I/O.
 741          */
 742         if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
 743                 uio->uio_extflg &= ~UIO_DIRECT;
 744                 o_direct_defer = B_TRUE;
 745         }
 746
 747         /*
 748          * Write the file in reasonable size chunks.  Each chunk is written
 749          * in a separate transaction; this keeps the intent log records small
 750          * and allows us to do more fine-grained space accounting.
 751          */
 752         while (n > 0) {
 753                 woff = zfs_uio_offset(uio);
 754
 755                 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
 756                     zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
 757                     (projid != ZFS_DEFAULT_PROJID &&
 758                     zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 759                     projid))) {
 760                         error = SET_ERROR(EDQUOT);
 761                         break;
 762                 }
 763
 764                 uint64_t blksz;
 765                 if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
 766                         if (zp->z_blksz > zfsvfs->z_max_blksz &&
 767                             !ISP2(zp->z_blksz)) {
 768                                 /*
 769                                  * File's blocksize is already larger than the
 770                                  * "recordsize" property.  Only let it grow to
 771                                  * the next power of 2.
 772                                  */
 773                                 blksz = 1 << highbit64(zp->z_blksz);
 774                         } else {
 775                                 blksz = zfsvfs->z_max_blksz;
 776                         }
 777                         blksz = MIN(blksz, P2ROUNDUP(end_size,
 778                             SPA_MINBLOCKSIZE));
 779                         blksz = MAX(blksz, zp->z_blksz);
 780                 } else {
 781                         blksz = zp->z_blksz;
 782                 }
 783
 784                 arc_buf_t *abuf = NULL;
 785                 ssize_t nbytes = n;
 786                 if (n >= blksz && woff >= zp->z_size &&
 787                     P2PHASE(woff, blksz) == 0 &&
 788                     !(uio->uio_extflg & UIO_DIRECT) &&
 789                     (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
 790                         /*
 791                          * This write covers a full block.  "Borrow" a buffer
 792                          * from the dmu so that we can fill it before we enter
 793                          * a transaction.  This avoids the possibility of
 794                          * holding up the transaction if the data copy hangs
 795                          * up on a pagefault (e.g., from an NFS server mapping).
 796                          */
 797                         abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 798                             blksz);
 799                         ASSERT(abuf != NULL);
 800                         ASSERT(arc_buf_size(abuf) == blksz);
 801                         if ((error = zfs_uiocopy(abuf->b_data, blksz,
 802                             UIO_WRITE, uio, &nbytes))) {
 803                                 dmu_return_arcbuf(abuf);
 804                                 break;
 805                         }
 806                         ASSERT3S(nbytes, ==, blksz);
 807                 } else {
 808                         nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
 809                             P2PHASE(woff, blksz));
 810                         if (pfbytes < nbytes) {
 811                                 if (zfs_uio_prefaultpages(nbytes, uio)) {
 812                                         error = SET_ERROR(EFAULT);
 813                                         break;
 814                                 }
 815                                 pfbytes = nbytes;
 816                         }
 817                 }
 818
 819                 /*
 820                  * Start a transaction.
 821                  */
 822                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 823                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 824                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
 825                 DB_DNODE_ENTER(db);
 826                 dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
 827                 DB_DNODE_EXIT(db);
 828                 zfs_sa_upgrade_txholds(tx, zp);
 829                 error = dmu_tx_assign(tx, TXG_WAIT);
 830                 if (error) {
 831                         dmu_tx_abort(tx);
 832                         if (abuf != NULL)
 833                                 dmu_return_arcbuf(abuf);
 834                         break;
 835                 }
 836
 837                 /*
 838                  * NB: We must call zfs_clear_setid_bits_if_necessary before
 839                  * committing the transaction!
 840                  */
 841
 842                 /*
 843                  * If rangelock_enter() over-locked we grow the blocksize
 844                  * and then reduce the lock range.  This will only happen
 845                  * on the first iteration since rangelock_reduce() will
 846                  * shrink down lr_length to the appropriate size.
 847                  */
 848                 if (lr->lr_length == UINT64_MAX) {
 849                         zfs_grow_blocksize(zp, blksz, tx);
 850                         zfs_rangelock_reduce(lr, woff, n);
 851                 }
 852
 853                 ssize_t tx_bytes;
 854                 if (abuf == NULL) {
 855                         tx_bytes = zfs_uio_resid(uio);
 856                         zfs_uio_fault_disable(uio, B_TRUE);
 857                         error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 858                             uio, nbytes, tx);
 859                         zfs_uio_fault_disable(uio, B_FALSE);
 860 #ifdef __linux__
 861                         if (error == EFAULT) {
 862                                 zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
 863                                     cr, &clear_setid_bits_txg, tx);
 864                                 dmu_tx_commit(tx);
 865                                 /*
 866                                  * Account for partial writes before
 867                                  * continuing the loop.
 868                                  * Update needs to occur before the next
 869                                  * zfs_uio_prefaultpages, or prefaultpages may
 870                                  * error, and we may break the loop early.
 871                                  */
 872                                 n -= tx_bytes - zfs_uio_resid(uio);
 873                                 pfbytes -= tx_bytes - zfs_uio_resid(uio);
 874                                 continue;
 875                         }
 876 #endif
 877                         /*
 878                          * On FreeBSD, EFAULT should be propagated back to the
 879                          * VFS, which will handle faulting and will retry.
 880                          */
 881                         if (error != 0 && error != EFAULT) {
 882                                 zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
 883                                     cr, &clear_setid_bits_txg, tx);
 884                                 dmu_tx_commit(tx);
 885                                 break;
 886                         }
 887                         tx_bytes -= zfs_uio_resid(uio);
 888                 } else {
 889                         /*
 890                          * Thus, we're writing a full block at a block-aligned
 891                          * offset and extending the file past EOF.
 892                          *
 893                          * dmu_assign_arcbuf_by_dbuf() will directly assign the
 894                          * arc buffer to a dbuf.
 895                          */
 896                         error = dmu_assign_arcbuf_by_dbuf(
 897                             sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
 898                         if (error != 0) {
 899                                 /*
 900                                  * XXX This might not be necessary if
 901                                  * dmu_assign_arcbuf_by_dbuf is guaranteed
 902                                  * to be atomic.
 903                                  */
 904                                 zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
 905                                     cr, &clear_setid_bits_txg, tx);
 906                                 dmu_return_arcbuf(abuf);
 907                                 dmu_tx_commit(tx);
 908                                 break;
 909                         }
 910                         ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
 911                         zfs_uioskip(uio, nbytes);
 912                         tx_bytes = nbytes;
 913                 }
 914                 /*
 915                  * There is a window where a file's pages can be mmap'ed after
 916                  * zfs_setup_direct() is called. This is due to the fact that
 917                  * the rangelock in this function is acquired after calling
 918                  * zfs_setup_direct(). This is done so that
 919                  * zfs_uio_prefaultpages() does not attempt to fault in pages
 920                  * on Linux for Direct I/O requests. This is not necessary as
 921                  * the pages are pinned in memory and can not be faulted out.
 922                  * Ideally, the rangelock would be held before calling
 923                  * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
 924                  * this can lead to a deadlock as zfs_getpage() also acquires
 925                  * the rangelock as a RL_WRITER and prefaulting the pages can
 926                  * lead to zfs_getpage() being called.
 927                  *
 928                  * In the case of the pages being mapped after
 929                  * zfs_setup_direct() is called, the call to update_pages()
 930                  * will still be made to make sure there is consistency between
 931                  * the ARC and the Linux page cache. This is an ufortunate
 932                  * situation as the data will be read back into the ARC after
 933                  * the Direct I/O write has completed, but this is the penality
 934                  * for writing to a mmap'ed region of a file using Direct I/O.
 935                  */
 936                 if (tx_bytes &&
 937                     zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
 938                         update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
 939                 }
 940
 941                 /*
 942                  * If we made no progress, we're done.  If we made even
 943                  * partial progress, update the znode and ZIL accordingly.
 944                  */
 945                 if (tx_bytes == 0) {
 946                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 947                             (void *)&zp->z_size, sizeof (uint64_t), tx);
 948                         dmu_tx_commit(tx);
 949                         ASSERT(error != 0);
 950                         break;
 951                 }
 952
 953                 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
 954                     &clear_setid_bits_txg, tx);
 955
 956                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 957
 958                 /*
 959                  * Update the file size (zp_size) if it has changed;
 960                  * account for possible concurrent updates.
 961                  */
 962                 while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
 963                         (void) atomic_cas_64(&zp->z_size, end_size,
 964                             zfs_uio_offset(uio));
 965                         ASSERT(error == 0 || error == EFAULT);
 966                 }
 967                 /*
 968                  * If we are replaying and eof is non zero then force
 969                  * the file size to the specified eof. Note, there's no
 970                  * concurrency during replay.
 971                  */
 972                 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 973                         zp->z_size = zfsvfs->z_replay_eof;
 974
 975                 error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 976                 if (error1 != 0)
 977                         /* Avoid clobbering EFAULT. */
 978                         error = error1;
 979
 980                 /*
 981                  * NB: During replay, the TX_SETATTR record logged by
 982                  * zfs_clear_setid_bits_if_necessary must precede any of
 983                  * the TX_WRITE records logged here.
 984                  */
 985                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
 986                     uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
 987                     NULL);
 988
 989                 dmu_tx_commit(tx);
 990
 991                 /*
 992                  * Direct I/O was deferred in order to grow the first block.
 993                  * At this point it can be re-enabled for subsequent writes.
 994                  */
 995                 if (o_direct_defer) {
 996                         ASSERT(ioflag & O_DIRECT);
 997                         uio->uio_extflg |= UIO_DIRECT;
 998                         o_direct_defer = B_FALSE;
 999                 }
1000
1001                 if (error != 0)
1002                         break;
1003                 ASSERT3S(tx_bytes, ==, nbytes);
1004                 n -= nbytes;
1005                 pfbytes -= nbytes;
1006         }
1007
1008         if (o_direct_defer) {
1009                 ASSERT(ioflag & O_DIRECT);
1010                 uio->uio_extflg |= UIO_DIRECT;
1011                 o_direct_defer = B_FALSE;
1012         }
1013
1014         zfs_znode_update_vfs(zp);
1015         zfs_rangelock_exit(lr);
1016
1017         /*
1018          * Cleanup for Direct I/O if requested.
1019          */
1020         if (uio->uio_extflg & UIO_DIRECT)
1021                 zfs_uio_free_dio_pages(uio, UIO_WRITE);
1022
1023         /*
1024          * If we're in replay mode, or we made no progress, or the
1025          * uio data is inaccessible return an error.  Otherwise, it's
1026          * at least a partial write, so it's successful.
1027          */
1028         if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
1029             error == EFAULT) {
1030                 zfs_exit(zfsvfs, FTAG);
1031                 return (error);
1032         }
1033
1034         if (commit)
1035                 zil_commit(zilog, zp->z_id);
1036
1037         int64_t nwritten = start_resid - zfs_uio_resid(uio);
1038         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
1039
1040         zfs_exit(zfsvfs, FTAG);
1041         return (0);
1042 }
1043
1044 int
1045 zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
1046 {
1047         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1048         int error;
1049         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1050
1051         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1052                 return (error);
1053         error = zfs_getacl(zp, vsecp, skipaclchk, cr);
1054         zfs_exit(zfsvfs, FTAG);
1055
1056         return (error);
1057 }
1058
1059 int
1060 zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
1061 {
1062         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1063         int error;
1064         boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1065         zilog_t *zilog;
1066
1067         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1068                 return (error);
1069         zilog = zfsvfs->z_log;
1070         error = zfs_setacl(zp, vsecp, skipaclchk, cr);
1071
1072         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1073                 zil_commit(zilog, 0);
1074
1075         zfs_exit(zfsvfs, FTAG);
1076         return (error);
1077 }
1078
1079 #ifdef ZFS_DEBUG
1080 static int zil_fault_io = 0;
1081 #endif
1082
1083 static void zfs_get_done(zgd_t *zgd, int error);
1084
1085 /*
1086  * Get data to generate a TX_WRITE intent log record.
1087  */
1088 int
1089 zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
1090     struct lwb *lwb, zio_t *zio)
1091 {
1092         zfsvfs_t *zfsvfs = arg;
1093         objset_t *os = zfsvfs->z_os;
1094         znode_t *zp;
1095         uint64_t object = lr->lr_foid;
1096         uint64_t offset = lr->lr_offset;
1097         uint64_t size = lr->lr_length;
1098         zgd_t *zgd;
1099         int error = 0;
1100         uint64_t zp_gen;
1101
1102         ASSERT3P(lwb, !=, NULL);
1103         ASSERT3U(size, !=, 0);
1104
1105         /*
1106          * Nothing to do if the file has been removed
1107          */
1108         if (zfs_zget(zfsvfs, object, &zp) != 0)
1109                 return (SET_ERROR(ENOENT));
1110         if (zp->z_unlinked) {
1111                 /*
1112                  * Release the vnode asynchronously as we currently have the
1113                  * txg stopped from syncing.
1114                  */
1115                 zfs_zrele_async(zp);
1116                 return (SET_ERROR(ENOENT));
1117         }
1118         /* check if generation number matches */
1119         if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1120             sizeof (zp_gen)) != 0) {
1121                 zfs_zrele_async(zp);
1122                 return (SET_ERROR(EIO));
1123         }
1124         if (zp_gen != gen) {
1125                 zfs_zrele_async(zp);
1126                 return (SET_ERROR(ENOENT));
1127         }
1128
1129         zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1130         zgd->zgd_lwb = lwb;
1131         zgd->zgd_private = zp;
1132
1133         /*
1134          * Write records come in two flavors: immediate and indirect.
1135          * For small writes it's cheaper to store the data with the
1136          * log record (immediate); for large writes it's cheaper to
1137          * sync the data and get a pointer to it (indirect) so that
1138          * we don't have to write the data twice.
1139          */
1140         if (buf != NULL) { /* immediate write */
1141                 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
1142                     size, RL_READER);
1143                 /* test for truncation needs to be done while range locked */
1144                 if (offset >= zp->z_size) {
1145                         error = SET_ERROR(ENOENT);
1146                 } else {
1147                         error = dmu_read(os, object, offset, size, buf,
1148                             DMU_READ_NO_PREFETCH);
1149                 }
1150                 ASSERT(error == 0 || error == ENOENT);
1151         } else { /* indirect write */
1152                 ASSERT3P(zio, !=, NULL);
1153                 /*
1154                  * Have to lock the whole block to ensure when it's
1155                  * written out and its checksum is being calculated
1156                  * that no one can change the data. We need to re-check
1157                  * blocksize after we get the lock in case it's changed!
1158                  */
1159                 for (;;) {
1160                         uint64_t blkoff;
1161                         size = zp->z_blksz;
1162                         blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1163                         offset -= blkoff;
1164                         zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
1165                             offset, size, RL_READER);
1166                         if (zp->z_blksz == size)
1167                                 break;
1168                         offset += blkoff;
1169                         zfs_rangelock_exit(zgd->zgd_lr);
1170                 }
1171                 /* test for truncation needs to be done while range locked */
1172                 if (lr->lr_offset >= zp->z_size)
1173                         error = SET_ERROR(ENOENT);
1174 #ifdef ZFS_DEBUG
1175                 if (zil_fault_io) {
1176                         error = SET_ERROR(EIO);
1177                         zil_fault_io = 0;
1178                 }
1179 #endif
1180
1181                 dmu_buf_t *dbp;
1182                 if (error == 0)
1183                         error = dmu_buf_hold_noread(os, object, offset, zgd,
1184                             &dbp);
1185
1186                 if (error == 0) {
1187                         zgd->zgd_db = dbp;
1188                         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
1189                         boolean_t direct_write = B_FALSE;
1190                         mutex_enter(&db->db_mtx);
1191                         dbuf_dirty_record_t *dr =
1192                             dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
1193                         if (dr != NULL && dr->dt.dl.dr_diowrite)
1194                                 direct_write = B_TRUE;
1195                         mutex_exit(&db->db_mtx);
1196
1197                         /*
1198                          * All Direct I/O writes will have already completed and
1199                          * the block pointer can be immediately stored in the
1200                          * log record.
1201                          */
1202                         if (direct_write) {
1203                                 /*
1204                                  * A Direct I/O write always covers an entire
1205                                  * block.
1206                                  */
1207                                 ASSERT3U(dbp->db_size, ==, zp->z_blksz);
1208                                 lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
1209                                 zfs_get_done(zgd, 0);
1210                                 return (0);
1211                         }
1212
1213                         blkptr_t *bp = &lr->lr_blkptr;
1214                         zgd->zgd_bp = bp;
1215
1216                         ASSERT3U(dbp->db_offset, ==, offset);
1217                         ASSERT3U(dbp->db_size, ==, size);
1218
1219                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
1220                             zfs_get_done, zgd);
1221                         ASSERT(error || lr->lr_length <= size);
1222
1223                         /*
1224                          * On success, we need to wait for the write I/O
1225                          * initiated by dmu_sync() to complete before we can
1226                          * release this dbuf.  We will finish everything up
1227                          * in the zfs_get_done() callback.
1228                          */
1229                         if (error == 0)
1230                                 return (0);
1231
1232                         if (error == EALREADY) {
1233                                 lr->lr_common.lrc_txtype = TX_WRITE2;
1234                                 /*
1235                                  * TX_WRITE2 relies on the data previously
1236                                  * written by the TX_WRITE that caused
1237                                  * EALREADY.  We zero out the BP because
1238                                  * it is the old, currently-on-disk BP.
1239                                  */
1240                                 zgd->zgd_bp = NULL;
1241                                 BP_ZERO(bp);
1242                                 error = 0;
1243                         }
1244                 }
1245         }
1246
1247         zfs_get_done(zgd, error);
1248
1249         return (error);
1250 }
1251
1252 static void
1253 zfs_get_done(zgd_t *zgd, int error)
1254 {
1255         (void) error;
1256         znode_t *zp = zgd->zgd_private;
1257
1258         if (zgd->zgd_db)
1259                 dmu_buf_rele(zgd->zgd_db, zgd);
1260
1261         zfs_rangelock_exit(zgd->zgd_lr);
1262
1263         /*
1264          * Release the vnode asynchronously as we currently have the
1265          * txg stopped from syncing.
1266          */
1267         zfs_zrele_async(zp);
1268
1269         kmem_free(zgd, sizeof (zgd_t));
1270 }
1271
1272 static int
1273 zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
1274 {
1275         int error;
1276
1277         /* Swap. Not sure if the order of zfs_enter()s is important. */
1278         if (zfsvfs1 > zfsvfs2) {
1279                 zfsvfs_t *tmpzfsvfs;
1280
1281                 tmpzfsvfs = zfsvfs2;
1282                 zfsvfs2 = zfsvfs1;
1283                 zfsvfs1 = tmpzfsvfs;
1284         }
1285
1286         error = zfs_enter(zfsvfs1, tag);
1287         if (error != 0)
1288                 return (error);
1289         if (zfsvfs1 != zfsvfs2) {
1290                 error = zfs_enter(zfsvfs2, tag);
1291                 if (error != 0) {
1292                         zfs_exit(zfsvfs1, tag);
1293                         return (error);
1294                 }
1295         }
1296
1297         return (0);
1298 }
1299
1300 static void
1301 zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
1302 {
1303
1304         zfs_exit(zfsvfs1, tag);
1305         if (zfsvfs1 != zfsvfs2)
1306                 zfs_exit(zfsvfs2, tag);
1307 }
1308
1309 /*
1310  * We split each clone request in chunks that can fit into a single ZIL
1311  * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
1312  * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
1313  * us room for storing 1022 block pointers.
1314  *
1315  * On success, the function return the number of bytes copied in *lenp.
1316  * Note, it doesn't return how much bytes are left to be copied.
1317  * On errors which are caused by any file system limitations or
1318  * brt limitations `EINVAL` is returned. In the most cases a user
1319  * requested bad parameters, it could be possible to clone the file but
1320  * some parameters don't match the requirements.
1321  */
1322 int
1323 zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
1324     uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
1325 {
1326         zfsvfs_t        *inzfsvfs, *outzfsvfs;
1327         objset_t        *inos, *outos;
1328         zfs_locked_range_t *inlr, *outlr;
1329         dmu_buf_impl_t  *db;
1330         dmu_tx_t        *tx;
1331         zilog_t         *zilog;
1332         uint64_t        inoff, outoff, len, done;
1333         uint64_t        outsize, size;
1334         int             error;
1335         int             count = 0;
1336         sa_bulk_attr_t  bulk[3];
1337         uint64_t        mtime[2], ctime[2];
1338         uint64_t        uid, gid, projid;
1339         blkptr_t        *bps;
1340         size_t          maxblocks, nbps;
1341         uint_t          inblksz;
1342         uint64_t        clear_setid_bits_txg = 0;
1343         uint64_t        last_synced_txg = 0;
1344
1345         inoff = *inoffp;
1346         outoff = *outoffp;
1347         len = *lenp;
1348         done = 0;
1349
1350         inzfsvfs = ZTOZSB(inzp);
1351         outzfsvfs = ZTOZSB(outzp);
1352
1353         /*
1354          * We need to call zfs_enter() potentially on two different datasets,
1355          * so we need a dedicated function for that.
1356          */
1357         error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
1358         if (error != 0)
1359                 return (error);
1360
1361         inos = inzfsvfs->z_os;
1362         outos = outzfsvfs->z_os;
1363
1364         /*
1365          * Both source and destination have to belong to the same storage pool.
1366          */
1367         if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
1368                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1369                 return (SET_ERROR(EXDEV));
1370         }
1371
1372         /*
1373          * outos and inos belongs to the same storage pool.
1374          * see a few lines above, only one check.
1375          */
1376         if (!spa_feature_is_enabled(dmu_objset_spa(outos),
1377             SPA_FEATURE_BLOCK_CLONING)) {
1378                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1379                 return (SET_ERROR(EOPNOTSUPP));
1380         }
1381
1382         ASSERT(!outzfsvfs->z_replay);
1383
1384         /*
1385          * Block cloning from an unencrypted dataset into an encrypted
1386          * dataset and vice versa is not supported.
1387          */
1388         if (inos->os_encrypted != outos->os_encrypted) {
1389                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1390                 return (SET_ERROR(EXDEV));
1391         }
1392
1393         /*
1394          * Cloning across encrypted datasets is possible only if they
1395          * share the same master key.
1396          */
1397         if (inos != outos && inos->os_encrypted &&
1398             !dmu_objset_crypto_key_equal(inos, outos)) {
1399                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1400                 return (SET_ERROR(EXDEV));
1401         }
1402
1403         error = zfs_verify_zp(inzp);
1404         if (error == 0)
1405                 error = zfs_verify_zp(outzp);
1406         if (error != 0) {
1407                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1408                 return (error);
1409         }
1410
1411         /*
1412          * We don't copy source file's flags that's why we don't allow to clone
1413          * files that are in quarantine.
1414          */
1415         if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
1416                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1417                 return (SET_ERROR(EACCES));
1418         }
1419
1420         if (inoff >= inzp->z_size) {
1421                 *lenp = 0;
1422                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1423                 return (0);
1424         }
1425         if (len > inzp->z_size - inoff) {
1426                 len = inzp->z_size - inoff;
1427         }
1428         if (len == 0) {
1429                 *lenp = 0;
1430                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1431                 return (0);
1432         }
1433
1434         /*
1435          * Callers might not be able to detect properly that we are read-only,
1436          * so check it explicitly here.
1437          */
1438         if (zfs_is_readonly(outzfsvfs)) {
1439                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1440                 return (SET_ERROR(EROFS));
1441         }
1442
1443         /*
1444          * If immutable or not appending then return EPERM.
1445          * Intentionally allow ZFS_READONLY through here.
1446          * See zfs_zaccess_common()
1447          */
1448         if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
1449                 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1450                 return (SET_ERROR(EPERM));
1451         }
1452
1453         /*
1454          * No overlapping if we are cloning within the same file.
1455          */
1456         if (inzp == outzp) {
1457                 if (inoff < outoff + len && outoff < inoff + len) {
1458                         zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1459                         return (SET_ERROR(EINVAL));
1460                 }
1461         }
1462
1463         /* Flush any mmap()'d data to disk */
1464         if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
1465                 zn_flush_cached_data(inzp, B_TRUE);
1466
1467         /*
1468          * Maintain predictable lock order.
1469          */
1470         if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
1471                 inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
1472                     RL_READER);
1473                 outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
1474                     RL_WRITER);
1475         } else {
1476                 outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
1477                     RL_WRITER);
1478                 inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
1479                     RL_READER);
1480         }
1481
1482         inblksz = inzp->z_blksz;
1483
1484         /*
1485          * We cannot clone into a file with different block size if we can't
1486          * grow it (block size is already bigger, has more than one block, or
1487          * not locked for growth).  There are other possible reasons for the
1488          * grow to fail, but we cover what we can before opening transaction
1489          * and the rest detect after we try to do it.
1490          */
1491         if (inblksz < outzp->z_blksz) {
1492                 error = SET_ERROR(EINVAL);
1493                 goto unlock;
1494         }
1495         if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
1496             outlr->lr_length != UINT64_MAX)) {
1497                 error = SET_ERROR(EINVAL);
1498                 goto unlock;
1499         }
1500
1501         /*
1502          * Block size must be power-of-2 if destination offset != 0.
1503          * There can be no multiple blocks of non-power-of-2 size.
1504          */
1505         if (outoff != 0 && !ISP2(inblksz)) {
1506                 error = SET_ERROR(EINVAL);
1507                 goto unlock;
1508         }
1509
1510         /*
1511          * Offsets and len must be at block boundries.
1512          */
1513         if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
1514                 error = SET_ERROR(EINVAL);
1515                 goto unlock;
1516         }
1517         /*
1518          * Length must be multipe of blksz, except for the end of the file.
1519          */
1520         if ((len % inblksz) != 0 &&
1521             (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
1522                 error = SET_ERROR(EINVAL);
1523                 goto unlock;
1524         }
1525
1526         /*
1527          * If we are copying only one block and it is smaller than recordsize
1528          * property, do not allow destination to grow beyond one block if it
1529          * is not there yet.  Otherwise the destination will get stuck with
1530          * that block size forever, that can be as small as 512 bytes, no
1531          * matter how big the destination grow later.
1532          */
1533         if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
1534             outzp->z_size <= inblksz && outoff + len > inblksz) {
1535                 error = SET_ERROR(EINVAL);
1536                 goto unlock;
1537         }
1538
1539         error = zn_rlimit_fsize(outoff + len);
1540         if (error != 0) {
1541                 goto unlock;
1542         }
1543
1544         if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
1545                 error = SET_ERROR(EFBIG);
1546                 goto unlock;
1547         }
1548
1549         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
1550             &mtime, 16);
1551         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
1552             &ctime, 16);
1553         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
1554             &outzp->z_size, 8);
1555
1556         zilog = outzfsvfs->z_log;
1557         maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
1558             sizeof (bps[0]);
1559
1560         uid = KUID_TO_SUID(ZTOUID(outzp));
1561         gid = KGID_TO_SGID(ZTOGID(outzp));
1562         projid = outzp->z_projid;
1563
1564         bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
1565
1566         /*
1567          * Clone the file in reasonable size chunks.  Each chunk is cloned
1568          * in a separate transaction; this keeps the intent log records small
1569          * and allows us to do more fine-grained space accounting.
1570          */
1571         while (len > 0) {
1572                 size = MIN(inblksz * maxblocks, len);
1573
1574                 if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
1575                     uid) ||
1576                     zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
1577                     gid) ||
1578                     (projid != ZFS_DEFAULT_PROJID &&
1579                     zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
1580                     projid))) {
1581                         error = SET_ERROR(EDQUOT);
1582                         break;
1583                 }
1584
1585                 nbps = maxblocks;
1586                 last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
1587                 error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
1588                     &nbps);
1589                 if (error != 0) {
1590                         /*
1591                          * If we are trying to clone a block that was created
1592                          * in the current transaction group, the error will be
1593                          * EAGAIN here.  Based on zfs_bclone_wait_dirty either
1594                          * return a shortened range to the caller so it can
1595                          * fallback, or wait for the next TXG and check again.
1596                          */
1597                         if (error == EAGAIN && zfs_bclone_wait_dirty) {
1598                                 txg_wait_synced(dmu_objset_pool(inos),
1599                                     last_synced_txg + 1);
1600                                 continue;
1601                         }
1602
1603                         break;
1604                 }
1605
1606                 /*
1607                  * Start a transaction.
1608                  */
1609                 tx = dmu_tx_create(outos);
1610                 dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
1611                 db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
1612                 DB_DNODE_ENTER(db);
1613                 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
1614                 DB_DNODE_EXIT(db);
1615                 zfs_sa_upgrade_txholds(tx, outzp);
1616                 error = dmu_tx_assign(tx, TXG_WAIT);
1617                 if (error != 0) {
1618                         dmu_tx_abort(tx);
1619                         break;
1620                 }
1621
1622                 /*
1623                  * Copy source znode's block size. This is done only if the
1624                  * whole znode is locked (see zfs_rangelock_cb()) and only
1625                  * on the first iteration since zfs_rangelock_reduce() will
1626                  * shrink down lr_length to the appropriate size.
1627                  */
1628                 if (outlr->lr_length == UINT64_MAX) {
1629                         zfs_grow_blocksize(outzp, inblksz, tx);
1630
1631                         /*
1632                          * Block growth may fail for many reasons we can not
1633                          * predict here.  If it happen the cloning is doomed.
1634                          */
1635                         if (inblksz != outzp->z_blksz) {
1636                                 error = SET_ERROR(EINVAL);
1637                                 dmu_tx_abort(tx);
1638                                 break;
1639                         }
1640
1641                         /*
1642                          * Round range lock up to the block boundary, so we
1643                          * prevent appends until we are done.
1644                          */
1645                         zfs_rangelock_reduce(outlr, outoff,
1646                             ((len - 1) / inblksz + 1) * inblksz);
1647                 }
1648
1649                 error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
1650                     bps, nbps);
1651                 if (error != 0) {
1652                         dmu_tx_commit(tx);
1653                         break;
1654                 }
1655
1656                 if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
1657                         update_pages(outzp, outoff, size, outos);
1658                 }
1659
1660                 zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
1661                     &clear_setid_bits_txg, tx);
1662
1663                 zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
1664
1665                 /*
1666                  * Update the file size (zp_size) if it has changed;
1667                  * account for possible concurrent updates.
1668                  */
1669                 while ((outsize = outzp->z_size) < outoff + size) {
1670                         (void) atomic_cas_64(&outzp->z_size, outsize,
1671                             outoff + size);
1672                 }
1673
1674                 error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
1675
1676                 zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
1677                     size, inblksz, bps, nbps);
1678
1679                 dmu_tx_commit(tx);
1680
1681                 if (error != 0)
1682                         break;
1683
1684                 inoff += size;
1685                 outoff += size;
1686                 len -= size;
1687                 done += size;
1688
1689                 if (issig()) {
1690                         error = SET_ERROR(EINTR);
1691                         break;
1692                 }
1693         }
1694
1695         vmem_free(bps, sizeof (bps[0]) * maxblocks);
1696         zfs_znode_update_vfs(outzp);
1697
1698 unlock:
1699         zfs_rangelock_exit(outlr);
1700         zfs_rangelock_exit(inlr);
1701
1702         if (done > 0) {
1703                 /*
1704                  * If we have made at least partial progress, reset the error.
1705                  */
1706                 error = 0;
1707
1708                 ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
1709
1710                 if (outos->os_sync == ZFS_SYNC_ALWAYS) {
1711                         zil_commit(zilog, outzp->z_id);
1712                 }
1713
1714                 *inoffp += done;
1715                 *outoffp += done;
1716                 *lenp = done;
1717         } else {
1718                 /*
1719                  * If we made no progress, there must be a good reason.
1720                  * EOF is handled explicitly above, before the loop.
1721                  */
1722                 ASSERT3S(error, !=, 0);
1723         }
1724
1725         zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1726
1727         return (error);
1728 }
1729
1730 /*
1731  * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(),
1732  * but we cannot do that, because when replaying we don't have source znode
1733  * available. This is why we need a dedicated replay function.
1734  */
1735 int
1736 zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
1737     const blkptr_t *bps, size_t nbps)
1738 {
1739         zfsvfs_t        *zfsvfs;
1740         dmu_buf_impl_t  *db;
1741         dmu_tx_t        *tx;
1742         int             error;
1743         int             count = 0;
1744         sa_bulk_attr_t  bulk[3];
1745         uint64_t        mtime[2], ctime[2];
1746
1747         ASSERT3U(off, <, MAXOFFSET_T);
1748         ASSERT3U(len, >, 0);
1749         ASSERT3U(nbps, >, 0);
1750
1751         zfsvfs = ZTOZSB(zp);
1752
1753         ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
1754             SPA_FEATURE_BLOCK_CLONING));
1755
1756         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1757                 return (error);
1758
1759         ASSERT(zfsvfs->z_replay);
1760         ASSERT(!zfs_is_readonly(zfsvfs));
1761
1762         if ((off % blksz) != 0) {
1763                 zfs_exit(zfsvfs, FTAG);
1764                 return (SET_ERROR(EINVAL));
1765         }
1766
1767         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1768         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1769         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1770             &zp->z_size, 8);
1771
1772         /*
1773          * Start a transaction.
1774          */
1775         tx = dmu_tx_create(zfsvfs->z_os);
1776
1777         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1778         db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
1779         DB_DNODE_ENTER(db);
1780         dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
1781         DB_DNODE_EXIT(db);
1782         zfs_sa_upgrade_txholds(tx, zp);
1783         error = dmu_tx_assign(tx, TXG_WAIT);
1784         if (error != 0) {
1785                 dmu_tx_abort(tx);
1786                 zfs_exit(zfsvfs, FTAG);
1787                 return (error);
1788         }
1789
1790         if (zp->z_blksz < blksz)
1791                 zfs_grow_blocksize(zp, blksz, tx);
1792
1793         dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
1794
1795         zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1796
1797         if (zp->z_size < off + len)
1798                 zp->z_size = off + len;
1799
1800         error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1801
1802         /*
1803          * zil_replaying() not only check if we are replaying ZIL, but also
1804          * updates the ZIL header to record replay progress.
1805          */
1806         VERIFY(zil_replaying(zfsvfs->z_log, tx));
1807
1808         dmu_tx_commit(tx);
1809
1810         zfs_znode_update_vfs(zp);
1811
1812         zfs_exit(zfsvfs, FTAG);
1813
1814         return (error);
1815 }
1816
1817 EXPORT_SYMBOL(zfs_access);
1818 EXPORT_SYMBOL(zfs_fsync);
1819 EXPORT_SYMBOL(zfs_holey);
1820 EXPORT_SYMBOL(zfs_read);
1821 EXPORT_SYMBOL(zfs_write);
1822 EXPORT_SYMBOL(zfs_getsecattr);
1823 EXPORT_SYMBOL(zfs_setsecattr);
1824 EXPORT_SYMBOL(zfs_clone_range);
1825 EXPORT_SYMBOL(zfs_clone_range_replay);
1826
1827 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
1828         "Bytes to read per chunk");
1829
1830 ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
1831         "Enable block cloning");
1832
1833 ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
1834         "Wait for dirty blocks when cloning");
1835
1836 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
1837         "Enable Direct I/O");