/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/uio_impl.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu_objset.h>
#include <sys/policy.h>
#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
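
/*
 * zfs_fsync() publishes zfs_fsync_sync_cnt through the zfs_fsyncer_key
 * thread-specific data for the duration of the call (a hint that this
 * thread is actively fsyncing), and commits the znode's outstanding ZIL
 * records unless the dataset runs with sync=disabled.
 */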
static ulong_t zfs_fsync_sync_cnt = 4;

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
    int error = 0;
    zfsvfs_t *zfsvfs = ZTOZSB(zp);

    (void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt);

    if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
        if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
            goto out;
        atomic_inc_32(&zp->z_sync_writes_cnt);
        zil_commit(zfsvfs->z_log, zp->z_id);
        atomic_dec_32(&zp->z_sync_writes_cnt);
        zfs_exit(zfsvfs, FTAG);
    }
out:
    tsd_set(zfs_fsyncer_key, NULL);

    return (error);
}

#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Lseek support for finding holes (cmd == SEEK_HOLE) and
 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
    zfs_locked_range_t *lr;
    uint64_t noff = (uint64_t)*off; /* new offset */
    uint64_t file_sz;
    int error;
    boolean_t hole;

    file_sz = zp->z_size;
    if (noff >= file_sz) {
        return (SET_ERROR(ENXIO));
    }

    if (cmd == F_SEEK_HOLE)
        hole = B_TRUE;
    else
        hole = B_FALSE;

    /* Flush any mmap()'d data to disk */
    if (zn_has_cached_data(zp, 0, file_sz - 1))
        zn_flush_cached_data(zp, B_FALSE);

    lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
    error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
    zfs_rangelock_exit(lr);

    if (error == ESRCH)
        return (SET_ERROR(ENXIO));

    /* File was dirty, so fall back to using generic logic */
    if (error == EBUSY) {
        if (hole)
            *off = file_sz;
        return (0);
    }

    /*
     * We could find a hole that begins after the logical end-of-file,
     * because dmu_offset_next() only works on whole blocks. If the
     * EOF falls mid-block, then indicate that the "virtual hole"
     * at the end of the file begins at the logical EOF, rather than
     * at the end of the last block.
     */
    if (noff > file_sz) {
        ASSERT(hole);
        noff = file_sz;
    }

    if (noff < *off)
        return (error);
    *off = noff;
    return (error);
}
int
zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
{
    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    int error;

    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    error = zfs_holey_common(zp, cmd, off);

    zfs_exit(zfsvfs, FTAG);
    return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */
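
/*
 * Access-check entry point: ACE-mask requests (flag & V_ACE_MASK) go
 * through zfs_zaccess(), plain rwx-style requests through
 * zfs_zaccess_rwx(); on Linux the initial idmap (zfs_init_idmap) is
 * passed along as well.
 */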
int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    int error;

    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    if (flag & V_ACE_MASK)
#if defined(__linux__)
        error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
            zfs_init_idmap);
#else
        error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
            NULL);
#endif
    else
#if defined(__linux__)
        error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
#else
        error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
#endif

    zfs_exit(zfsvfs, FTAG);
    return (error);
}
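
/*
 * Upper bound on how much data zfs_read() moves per dmu_read_uio_dbuf()
 * call below; larger requests are broken into chunks of at most this
 * many bytes (default 1 MiB).
 */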
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *    IN:  zp     - inode of file to be read from.
 *         uio    - structure supplying read location, range info,
 *                  and return buffer.
 *         ioflag - O_SYNC flags; used to provide FRSYNC semantics.
 *                  O_DIRECT flag; used to bypass page cache.
 *         cr     - credentials of caller.
 *
 *    OUT: uio    - updated offset and range, buffer filled.
 *
 *    RETURN: 0 on success, error code on failure.
 *
 * Side Effects:
 *    inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
    int error = 0;
    boolean_t frsync = B_FALSE;

    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    if (zp->z_pflags & ZFS_AV_QUARANTINED) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EACCES));
    }

    /* We don't copy out anything useful for directories. */
    if (Z_ISDIR(ZTOTYPE(zp))) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EISDIR));
    }

    /*
     * Validate file offset
     */
    if (zfs_uio_offset(uio) < (offset_t)0) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EINVAL));
    }

    /*
     * Fasttrack empty reads
     */
    if (zfs_uio_resid(uio) == 0) {
        zfs_exit(zfsvfs, FTAG);
        return (0);
    }

#ifdef FRSYNC
    /*
     * If we're in FRSYNC mode, sync out this znode before reading it.
     * Only do this for non-snapshots.
     *
     * Some platforms do not support FRSYNC and instead map it
     * to O_SYNC, which results in unnecessary calls to zil_commit. We
     * only honor FRSYNC requests on platforms which support it.
     */
    frsync = !!(ioflag & FRSYNC);
#endif
    if (zfsvfs->z_log &&
        (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
        zil_commit(zfsvfs->z_log, zp->z_id);

    /*
     * Lock the range against changes.
     */
    zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
        zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

    /*
     * If we are reading past end-of-file we can skip
     * to the end; but we might still need to set atime.
     */
    if (zfs_uio_offset(uio) >= zp->z_size) {
        error = 0;
        goto out;
    }

    ASSERT(zfs_uio_offset(uio) < zp->z_size);
#if defined(__linux__)
    ssize_t start_offset = zfs_uio_offset(uio);
#endif
    ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
    ssize_t start_resid = n;

    while (n > 0) {
        ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
            P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
#ifdef UIO_NOCOPY
        if (zfs_uio_segflg(uio) == UIO_NOCOPY)
            error = mappedread_sf(zp, nbytes, uio);
        else
#endif
        if (zn_has_cached_data(zp, zfs_uio_offset(uio),
            zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
            error = mappedread(zp, nbytes, uio);
        } else {
            error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
                uio, nbytes);
        }

        if (error) {
            /* convert checksum errors into IO errors */
            if (error == ECKSUM)
                error = SET_ERROR(EIO);

#if defined(__linux__)
            /*
             * if we actually read some bytes, bubbling EFAULT
             * up to become EAGAIN isn't what we want here...
             *
             * ...on Linux, at least. On FBSD, doing this breaks.
             */
            if (error == EFAULT &&
                (zfs_uio_offset(uio) - start_offset) != 0)
                error = 0;
#endif
            break;
        }

        n -= nbytes;
    }

    int64_t nread = start_resid - n;
    dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
    task_io_account_read(nread);

out:
    zfs_rangelock_exit(lr);

    ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    zfs_exit(zfsvfs, FTAG);
    return (error);
}
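
/*
 * Helper shared by zfs_write() and zfs_clone_range(): after a successful
 * write by an unprivileged caller, strip the SUID/SGID bits from the file
 * mode and log at most one TX_SETATTR per transaction group so the change
 * survives ZIL replay.
 */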
static void
zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
    uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
{
    zilog_t *zilog = zfsvfs->z_log;
    const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));

    ASSERT(clear_setid_bits_txgp != NULL);
    ASSERT(tx != NULL);

    /*
     * Clear Set-UID/Set-GID bits on successful write if not
     * privileged and at least one of the execute bits is set.
     *
     * It would be nice to do this after all writes have
     * been done, but that would still expose the ISUID/ISGID
     * to another app after the partial write is committed.
     *
     * Note: we don't call zfs_fuid_map_id() here because
     * user 0 is not an ephemeral uid.
     */
    mutex_enter(&zp->z_acl_lock);
    if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
        (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
        secpolicy_vnode_setid_retain(zp, cr,
        ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
        uint64_t newmode;

        zp->z_mode &= ~(S_ISUID | S_ISGID);
        newmode = zp->z_mode;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
            (void *)&newmode, sizeof (uint64_t), tx);

        mutex_exit(&zp->z_acl_lock);

        /*
         * Make sure SUID/SGID bits will be removed when we replay the
         * log. If the setid bits keep coming back, don't log more
         * than one TX_SETATTR per transaction group.
         */
        if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
            vattr_t va = {0};

            va.va_mask = ATTR_MODE;
            va.va_nodeid = zp->z_id;
            va.va_mode = newmode;
            zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
                ATTR_MODE, NULL);
            *clear_setid_bits_txgp = dmu_tx_get_txg(tx);
        }
    } else {
        mutex_exit(&zp->z_acl_lock);
    }
}

/*
 * Write the bytes to a file.
 *
 *    IN:  zp     - znode of file to be written to.
 *         uio    - structure supplying write location, range info,
 *                  and data buffer.
 *         ioflag - O_APPEND flag set if in append mode.
 *                  O_DIRECT flag; used to bypass page cache.
 *         cr     - credentials of caller.
 *
 *    OUT: uio    - updated offset and range.
 *
 *    RETURN: 0 if success
 *            error code if failure
 *
 * Timestamps:
 *    ip - ctime|mtime updated if byte count > 0
 */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
    int error = 0, error1;
    ssize_t start_resid = zfs_uio_resid(uio);
    uint64_t clear_setid_bits_txg = 0;

    /*
     * Fasttrack empty write
     */
    ssize_t n = start_resid;
    if (n == 0)
        return (0);

    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    sa_bulk_attr_t bulk[4];
    int count = 0;
    uint64_t mtime[2], ctime[2];
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
        &zp->z_size, 8);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
        &zp->z_pflags, 8);

    /*
     * Callers might not be able to detect properly that we are read-only,
     * so check it explicitly here.
     */
    if (zfs_is_readonly(zfsvfs)) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EROFS));
    }

    /*
     * If immutable or not appending then return EPERM.
     * Intentionally allow ZFS_READONLY through here.
     * See zfs_zaccess_common()
     */
    if ((zp->z_pflags & ZFS_IMMUTABLE) ||
        ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
        (zfs_uio_offset(uio) < zp->z_size))) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EPERM));
    }

    /*
     * Validate file offset
     */
    offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
    if (woff < 0) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EINVAL));
    }

    /*
     * Pre-fault the pages to ensure slow (eg NFS) pages
     * don't hold up txg.
     */
    ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
    if (zfs_uio_prefaultpages(pfbytes, uio)) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EFAULT));
    }

    /*
     * If in append mode, set the io offset pointer to eof.
     */
    zfs_locked_range_t *lr;
    if (ioflag & O_APPEND) {
        /*
         * Obtain an appending range lock to guarantee file append
         * semantics. We reset the write offset once we have the lock.
         */
        lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
        woff = lr->lr_offset;
        if (lr->lr_length == UINT64_MAX) {
            /*
             * We overlocked the file because this write will cause
             * the file block size to increase.
             * Note that zp_size cannot change with this lock held.
             */
            woff = zp->z_size;
        }
        zfs_uio_setoffset(uio, woff);
    } else {
        /*
         * Note that if the file block size will change as a result of
         * this write, then this range lock will lock the entire file
         * so that we can re-write the block safely.
         */
        lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
    }
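
    /*
     * Honor the caller's file-size resource limit (RLIMIT_FSIZE), where
     * the platform enforces one; a write past the limit fails with EFBIG.
     */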
    if (zn_rlimit_fsize_uio(zp, uio)) {
        zfs_rangelock_exit(lr);
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EFBIG));
    }

    const rlim64_t limit = MAXOFFSET_T;

    if (woff >= limit) {
        zfs_rangelock_exit(lr);
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EFBIG));
    }

    if (n > limit - woff)
        n = limit - woff;

    uint64_t end_size = MAX(zp->z_size, woff + n);
    zilog_t *zilog = zfsvfs->z_log;

    const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
    const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
    const uint64_t projid = zp->z_projid;

    /*
     * Write the file in reasonable size chunks. Each chunk is written
     * in a separate transaction; this keeps the intent log records small
     * and allows us to do more fine-grained space accounting.
     */
    while (n > 0) {
        woff = zfs_uio_offset(uio);

        if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
            zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
            (projid != ZFS_DEFAULT_PROJID &&
            zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
            projid))) {
            error = SET_ERROR(EDQUOT);
            break;
        }

        uint64_t blksz;
        if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
            if (zp->z_blksz > zfsvfs->z_max_blksz &&
                !ISP2(zp->z_blksz)) {
                /*
                 * File's blocksize is already larger than the
                 * "recordsize" property. Only let it grow to
                 * the next power of 2.
                 */
                blksz = 1 << highbit64(zp->z_blksz);
            } else {
                blksz = zfsvfs->z_max_blksz;
            }
            blksz = MIN(blksz, P2ROUNDUP(end_size,
                SPA_MAXBLOCKSIZE));
            blksz = MAX(blksz, zp->z_blksz);
        } else {
            blksz = zp->z_blksz;
        }

        arc_buf_t *abuf = NULL;
        ssize_t nbytes = n;
        if (n >= blksz && woff >= zp->z_size &&
            P2PHASE(woff, blksz) == 0 &&
            (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
            /*
             * This write covers a full block. "Borrow" a buffer
             * from the dmu so that we can fill it before we enter
             * a transaction. This avoids the possibility of
             * holding up the transaction if the data copy hangs
             * up on a pagefault (e.g., from an NFS server mapping).
             */
            abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
                blksz);
            ASSERT(abuf != NULL);
            ASSERT(arc_buf_size(abuf) == blksz);
            if ((error = zfs_uiocopy(abuf->b_data, blksz,
                UIO_WRITE, uio, &nbytes))) {
                dmu_return_arcbuf(abuf);
                break;
            }
            ASSERT3S(nbytes, ==, blksz);
        } else {
            nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
                P2PHASE(woff, blksz));
            if (pfbytes < nbytes) {
                if (zfs_uio_prefaultpages(nbytes, uio)) {
                    error = SET_ERROR(EFAULT);
                    break;
                }
                pfbytes = nbytes;
            }
        }

        /*
         * Start a transaction.
         */
        dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
        DB_DNODE_ENTER(db);
        dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
        DB_DNODE_EXIT(db);
        zfs_sa_upgrade_txholds(tx, zp);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
            dmu_tx_abort(tx);
            if (abuf != NULL)
                dmu_return_arcbuf(abuf);
            break;
        }

        /*
         * NB: We must call zfs_clear_setid_bits_if_necessary before
         * committing the transaction!
         */

        /*
         * If rangelock_enter() over-locked we grow the blocksize
         * and then reduce the lock range. This will only happen
         * on the first iteration since rangelock_reduce() will
         * shrink down lr_length to the appropriate size.
         */
        if (lr->lr_length == UINT64_MAX) {
            zfs_grow_blocksize(zp, blksz, tx);
            zfs_rangelock_reduce(lr, woff, n);
        }

        ssize_t tx_bytes;
        if (abuf == NULL) {
            tx_bytes = zfs_uio_resid(uio);
            zfs_uio_fault_disable(uio, B_TRUE);
            error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
                uio, nbytes, tx);
            zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
            if (error == EFAULT) {
                zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
                    cr, &clear_setid_bits_txg, tx);
                dmu_tx_commit(tx);
                /*
                 * Account for partial writes before
                 * continuing the loop.
                 * Update needs to occur before the next
                 * zfs_uio_prefaultpages, or prefaultpages may
                 * error, and we may break the loop early.
                 */
                n -= tx_bytes - zfs_uio_resid(uio);
                pfbytes -= tx_bytes - zfs_uio_resid(uio);
                continue;
            }
#endif
            /*
             * On FreeBSD, EFAULT should be propagated back to the
             * VFS, which will handle faulting and will retry.
             */
            if (error != 0 && error != EFAULT) {
                zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
                    cr, &clear_setid_bits_txg, tx);
                dmu_tx_commit(tx);
                break;
            }
            tx_bytes -= zfs_uio_resid(uio);
        } else {
            /*
             * Thus, we're writing a full block at a block-aligned
             * offset and extending the file past EOF.
             *
             * dmu_assign_arcbuf_by_dbuf() will directly assign the
             * arc buffer to a dbuf.
             */
            error = dmu_assign_arcbuf_by_dbuf(
                sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
            if (error != 0) {
                /*
                 * XXX This might not be necessary if
                 * dmu_assign_arcbuf_by_dbuf is guaranteed
                 * to be atomic.
                 */
                zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
                    cr, &clear_setid_bits_txg, tx);
                dmu_return_arcbuf(abuf);
                dmu_tx_commit(tx);
                break;
            }
            ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
            zfs_uioskip(uio, nbytes);
            tx_bytes = nbytes;
        }
        if (tx_bytes &&
            zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
            !(ioflag & O_DIRECT)) {
            update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
        }

        /*
         * If we made no progress, we're done. If we made even
         * partial progress, update the znode and ZIL accordingly.
         */
        if (tx_bytes == 0) {
            (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
                (void *)&zp->z_size, sizeof (uint64_t), tx);
            dmu_tx_commit(tx);
            ASSERT(error != 0);
            break;
        }

        zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
            &clear_setid_bits_txg, tx);

        zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

        /*
         * Update the file size (zp_size) if it has changed;
         * account for possible concurrent updates.
         */
        while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
            (void) atomic_cas_64(&zp->z_size, end_size,
                zfs_uio_offset(uio));
            ASSERT(error == 0 || error == EFAULT);
        }
        /*
         * If we are replaying and eof is non zero then force
         * the file size to the specified eof. Note, there's no
         * concurrency during replay.
         */
        if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
            zp->z_size = zfsvfs->z_replay_eof;

        error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
        if (error1 != 0)
            /* Avoid clobbering EFAULT. */
            error = error1;

        /*
         * NB: During replay, the TX_SETATTR record logged by
         * zfs_clear_setid_bits_if_necessary must precede any of
         * the TX_WRITE records logged here.
         */
        zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
            NULL, NULL);

        dmu_tx_commit(tx);

        if (error != 0)
            break;
        ASSERT3S(tx_bytes, ==, nbytes);
        n -= nbytes;
        pfbytes -= nbytes;
    }

    zfs_znode_update_vfs(zp);
    zfs_rangelock_exit(lr);

    /*
     * If we're in replay mode, or we made no progress, or the
     * uio data is inaccessible return an error. Otherwise, it's
     * at least a partial write, so it's successful.
     */
    if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
        error == EFAULT) {
        zfs_exit(zfsvfs, FTAG);
        return (error);
    }

    if (ioflag & (O_SYNC | O_DSYNC) ||
        zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        zil_commit(zilog, zp->z_id);

    const int64_t nwritten = start_resid - zfs_uio_resid(uio);
    dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
    task_io_account_write(nwritten);

    zfs_exit(zfsvfs, FTAG);
    return (0);
}
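
/*
 * Fetch the ACL for the given znode into *vsecp; ATTR_NOACLCHECK in
 * "flag" skips the ACL access check.
 */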
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    int error;
    boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);
    error = zfs_getacl(zp, vsecp, skipaclchk, cr);
    zfs_exit(zfsvfs, FTAG);

    return (error);
}
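
/*
 * Replace the ACL of the given znode with the one described by *vsecp,
 * committing the ZIL immediately when the dataset uses sync=always.
 */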
int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
    zfsvfs_t *zfsvfs = ZTOZSB(zp);
    int error;
    boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
    zilog_t *zilog = zfsvfs->z_log;

    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    error = zfs_setacl(zp, vsecp, skipaclchk, cr);

    if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
        zil_commit(zilog, 0);

    zfs_exit(zfsvfs, FTAG);
    return (error);
}
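
/*
 * Debug aid: when set (ZFS_DEBUG builds), zil_fault_io forces an EIO in
 * the indirect-write path of zfs_get_data() below.
 */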
#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

static void zfs_get_done(zgd_t *zgd, int error);

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
    zfsvfs_t *zfsvfs = arg;
    objset_t *os = zfsvfs->z_os;
    znode_t *zp;
    uint64_t object = lr->lr_foid;
    uint64_t offset = lr->lr_offset;
    uint64_t size = lr->lr_length;
    dmu_buf_t *db;
    zgd_t *zgd;
    int error = 0;
    uint64_t zp_gen;

    ASSERT3P(lwb, !=, NULL);
    ASSERT3P(zio, !=, NULL);
    ASSERT3U(size, !=, 0);

    /*
     * Nothing to do if the file has been removed
     */
    if (zfs_zget(zfsvfs, object, &zp) != 0)
        return (SET_ERROR(ENOENT));
    if (zp->z_unlinked) {
        /*
         * Release the vnode asynchronously as we currently have the
         * txg stopped from syncing.
         */
        zfs_zrele_async(zp);
        return (SET_ERROR(ENOENT));
    }
    /* check if generation number matches */
    if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
        sizeof (zp_gen)) != 0) {
        zfs_zrele_async(zp);
        return (SET_ERROR(EIO));
    }
    if (zp_gen != gen) {
        zfs_zrele_async(zp);
        return (SET_ERROR(ENOENT));
    }

    zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
    zgd->zgd_lwb = lwb;
    zgd->zgd_private = zp;

    /*
     * Write records come in two flavors: immediate and indirect.
     * For small writes it's cheaper to store the data with the
     * log record (immediate); for large writes it's cheaper to
     * sync the data and get a pointer to it (indirect) so that
     * we don't have to write the data twice.
     */
    if (buf != NULL) { /* immediate write */
        zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
            offset, size, RL_READER);
        /* test for truncation needs to be done while range locked */
        if (offset >= zp->z_size) {
            error = SET_ERROR(ENOENT);
        } else {
            error = dmu_read(os, object, offset, size, buf,
                DMU_READ_NO_PREFETCH);
        }
        ASSERT(error == 0 || error == ENOENT);
    } else { /* indirect write */
        /*
         * Have to lock the whole block to ensure when it's
         * written out and its checksum is being calculated
         * that no one can change the data. We need to re-check
         * blocksize after we get the lock in case it's changed!
         */
        for (;;) {
            uint64_t blkoff;
            size = zp->z_blksz;
            blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
            offset -= blkoff;
            zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
                offset, size, RL_READER);
            if (zp->z_blksz == size)
                break;
            offset += blkoff;
            zfs_rangelock_exit(zgd->zgd_lr);
        }
        /* test for truncation needs to be done while range locked */
        if (lr->lr_offset >= zp->z_size)
            error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
        if (zil_fault_io) {
            error = SET_ERROR(EIO);
            zil_fault_io = 0;
        }
#endif
        if (error == 0)
            error = dmu_buf_hold(os, object, offset, zgd, &db,
                DMU_READ_NO_PREFETCH);

        if (error == 0) {
            blkptr_t *bp = &lr->lr_blkptr;

            zgd->zgd_db = db;
            zgd->zgd_bp = bp;

            ASSERT(db->db_offset == offset);
            ASSERT(db->db_size == size);

            error = dmu_sync(zio, lr->lr_common.lrc_txg,
                zfs_get_done, zgd);
            ASSERT(error || lr->lr_length <= size);

            /*
             * On success, we need to wait for the write I/O
             * initiated by dmu_sync() to complete before we can
             * release this dbuf. We will finish everything up
             * in the zfs_get_done() callback.
             */
            if (error == 0)
                return (0);

            if (error == EALREADY) {
                lr->lr_common.lrc_txtype = TX_WRITE2;
                /*
                 * TX_WRITE2 relies on the data previously
                 * written by the TX_WRITE that caused
                 * EALREADY. We zero out the BP because
                 * it is the old, currently-on-disk BP.
                 */
                zgd->zgd_bp = NULL;
                BP_ZERO(bp);
                error = 0;
            }
        }
    }

    zfs_get_done(zgd, error);

    return (error);
}
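
/*
 * Done-callback for dmu_sync()/zfs_get_data(): drops the dbuf hold and
 * range lock taken above, releases the znode asynchronously, and frees
 * the zgd.
 */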
static void
zfs_get_done(zgd_t *zgd, int error)
{
    (void) error;
    znode_t *zp = zgd->zgd_private;

    if (zgd->zgd_db)
        dmu_buf_rele(zgd->zgd_db, zgd);

    zfs_rangelock_exit(zgd->zgd_lr);

    /*
     * Release the vnode asynchronously as we currently have the
     * txg stopped from syncing.
     */
    zfs_zrele_async(zp);

    kmem_free(zgd, sizeof (zgd_t));
}
static int
zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{
    int error;

    /* Swap. Not sure if the order of zfs_enter()s is important. */
    if (zfsvfs1 > zfsvfs2) {
        zfsvfs_t *tmpzfsvfs;

        tmpzfsvfs = zfsvfs2;
        zfsvfs2 = zfsvfs1;
        zfsvfs1 = tmpzfsvfs;
    }

    error = zfs_enter(zfsvfs1, tag);
    if (error != 0)
        return (error);
    if (zfsvfs1 != zfsvfs2) {
        error = zfs_enter(zfsvfs2, tag);
        if (error != 0) {
            zfs_exit(zfsvfs1, tag);
            return (error);
        }
    }

    return (0);
}

static void
zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{
    zfs_exit(zfsvfs1, tag);
    if (zfsvfs1 != zfsvfs2)
        zfs_exit(zfsvfs2, tag);
}

/*
 * We split each clone request in chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note, it doesn't return how many bytes are left to be copied.
 */
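/*
 * (Sanity check of the numbers above, assuming the on-disk block pointer
 * size of 128 bytes: 1022 * sizeof (blkptr_t) = 1022 * 128 = 130816.)
 */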
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
    zfsvfs_t *inzfsvfs, *outzfsvfs;
    objset_t *inos, *outos;
    zfs_locked_range_t *inlr, *outlr;
    dmu_buf_impl_t *db;
    dmu_tx_t *tx;
    zilog_t *zilog;
    uint64_t inoff, outoff, len, done;
    uint64_t outsize, size;
    int error;
    int count = 0;
    sa_bulk_attr_t bulk[3];
    uint64_t mtime[2], ctime[2];
    uint64_t uid, gid, projid;
    blkptr_t *bps;
    size_t maxblocks, nbps;
    uint_t inblksz;
    uint64_t clear_setid_bits_txg = 0;

    inoff = *inoffp;
    outoff = *outoffp;
    len = *lenp;
    done = 0;

    inzfsvfs = ZTOZSB(inzp);
    outzfsvfs = ZTOZSB(outzp);

    /*
     * We need to call zfs_enter() potentially on two different datasets,
     * so we need a dedicated function for that.
     */
    error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
    if (error != 0)
        return (error);

    inos = inzfsvfs->z_os;
    outos = outzfsvfs->z_os;

    /*
     * Both source and destination have to belong to the same storage pool.
     */
    if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (SET_ERROR(EXDEV));
    }

    /*
     * outos and inos belong to the same storage pool (see the check a few
     * lines above), so checking the feature on one of them is enough.
     */
    if (!spa_feature_is_enabled(dmu_objset_spa(outos),
        SPA_FEATURE_BLOCK_CLONING)) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (SET_ERROR(EOPNOTSUPP));
    }

    ASSERT(!outzfsvfs->z_replay);

    error = zfs_verify_zp(inzp);
    if (error == 0)
        error = zfs_verify_zp(outzp);
    if (error != 0) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (error);
    }

    /*
     * We don't copy the source file's flags; that's why we don't allow
     * cloning files that are in quarantine.
     */
    if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (SET_ERROR(EACCES));
    }

    if (inoff >= inzp->z_size) {
        *lenp = 0;
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (0);
    }
    if (len > inzp->z_size - inoff) {
        len = inzp->z_size - inoff;
    }
    if (len == 0) {
        *lenp = 0;
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (0);
    }

    /*
     * Callers might not be able to detect properly that we are read-only,
     * so check it explicitly here.
     */
    if (zfs_is_readonly(outzfsvfs)) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (SET_ERROR(EROFS));
    }

    /*
     * If immutable or not appending then return EPERM.
     * Intentionally allow ZFS_READONLY through here.
     * See zfs_zaccess_common()
     */
    if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
        zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
        return (SET_ERROR(EPERM));
    }

    /*
     * No overlapping if we are cloning within the same file.
     */
    if (inzp == outzp) {
        if (inoff < outoff + len && outoff < inoff + len) {
            zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
            return (SET_ERROR(EINVAL));
        }
    }

    /*
     * Maintain predictable lock order.
     */
    if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
        inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
            RL_READER);
        outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
            RL_WRITER);
    } else {
        outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
            RL_WRITER);
        inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
            RL_READER);
    }

    inblksz = inzp->z_blksz;

    /*
     * We cannot clone into files with different block size.
     */
    if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
        error = SET_ERROR(EXDEV);
        goto unlock;
    }

    /*
     * Offsets and len must be at block boundaries.
     */
    if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
        error = SET_ERROR(EXDEV);
        goto unlock;
    }
    /*
     * Length must be a multiple of blksz, except for the end of the file.
     */
    if ((len % inblksz) != 0 &&
        (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
        error = SET_ERROR(EXDEV);
        goto unlock;
    }

    error = zn_rlimit_fsize(outoff + len);
    if (error != 0) {
        goto unlock;
    }

    if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
        error = SET_ERROR(EFBIG);
        goto unlock;
    }

    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
        &mtime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
        &ctime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
        &outzp->z_size, 8);

    zilog = outzfsvfs->z_log;
    maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
        sizeof (bps[0]);

    uid = KUID_TO_SUID(ZTOUID(outzp));
    gid = KGID_TO_SGID(ZTOGID(outzp));
    projid = outzp->z_projid;

    bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

    /*
     * Clone the file in reasonable size chunks. Each chunk is cloned
     * in a separate transaction; this keeps the intent log records small
     * and allows us to do more fine-grained space accounting.
     */
    while (len > 0) {
        size = MIN(inblksz * maxblocks, len);

        if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
            uid) ||
            zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
            gid) ||
            (projid != ZFS_DEFAULT_PROJID &&
            zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
            projid))) {
            error = SET_ERROR(EDQUOT);
            break;
        }

        nbps = maxblocks;
        error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
            &nbps);
        if (error != 0) {
            /*
             * If we are trying to clone a block that was created
             * in the current transaction group, return an error
             * so the caller can fall back to just copying the
             * data.
             */
            if (error == EAGAIN) {
                error = SET_ERROR(EXDEV);
            }
            break;
        }
        /*
         * Encrypted data is fine as long as it comes from the same
         * dataset.
         * TODO: We want to extend it in the future to allow cloning to
         * datasets with the same keys, like clones or to be able to
         * clone a file from a snapshot of an encrypted dataset into the
         * dataset itself.
         */
        if (BP_IS_PROTECTED(&bps[0])) {
            if (inzfsvfs != outzfsvfs) {
                error = SET_ERROR(EXDEV);
                break;
            }
        }

        /*
         * Start a transaction.
         */
        tx = dmu_tx_create(outos);
        dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
        db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
        DB_DNODE_ENTER(db);
        dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
        DB_DNODE_EXIT(db);
        zfs_sa_upgrade_txholds(tx, outzp);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error != 0) {
            dmu_tx_abort(tx);
            break;
        }

        /*
         * Copy source znode's block size. This only happens on the
         * first iteration since zfs_rangelock_reduce() will shrink down
         * lr_len to the appropriate size.
         */
        if (outlr->lr_length == UINT64_MAX) {
            zfs_grow_blocksize(outzp, inblksz, tx);
            /*
             * Round range lock up to the block boundary, so we
             * prevent appends until we are done.
             */
            zfs_rangelock_reduce(outlr, outoff,
                ((len - 1) / inblksz + 1) * inblksz);
        }

        error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
            bps, nbps, B_FALSE);
        if (error != 0) {
            dmu_tx_commit(tx);
            break;
        }

        zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
            &clear_setid_bits_txg, tx);

        zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);

        /*
         * Update the file size (zp_size) if it has changed;
         * account for possible concurrent updates.
         */
        while ((outsize = outzp->z_size) < outoff + size) {
            (void) atomic_cas_64(&outzp->z_size, outsize,
                outoff + size);
        }

        error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);

        zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
            size, inblksz, bps, nbps);

        dmu_tx_commit(tx);

        if (error != 0)
            break;

        inoff += size;
        outoff += size;
        len -= size;
        done += size;
    }

    vmem_free(bps, sizeof (bps[0]) * maxblocks);
    zfs_znode_update_vfs(outzp);

unlock:
    zfs_rangelock_exit(outlr);
    zfs_rangelock_exit(inlr);

    if (done > 0) {
        /*
         * If we have made at least partial progress, reset the error.
         */
        error = 0;

        ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);

        if (outos->os_sync == ZFS_SYNC_ALWAYS) {
            zil_commit(zilog, outzp->z_id);
        }

        *inoffp += done;
        *outoffp += done;
        *lenp = done;
    }

    zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);

    return (error);
}

/*
 * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(),
 * but we cannot do that, because when replaying we don't have source znode
 * available. This is why we need a dedicated replay function.
 */
int
zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
    const blkptr_t *bps, size_t nbps)
{
    zfsvfs_t *zfsvfs;
    dmu_buf_impl_t *db;
    dmu_tx_t *tx;
    int error;
    int count = 0;
    sa_bulk_attr_t bulk[3];
    uint64_t mtime[2], ctime[2];

    ASSERT3U(off, <, MAXOFFSET_T);
    ASSERT3U(len, >, 0);
    ASSERT3U(nbps, >, 0);

    zfsvfs = ZTOZSB(zp);

    ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
        SPA_FEATURE_BLOCK_CLONING));

    if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
        return (error);

    ASSERT(zfsvfs->z_replay);
    ASSERT(!zfs_is_readonly(zfsvfs));

    if ((off % blksz) != 0) {
        zfs_exit(zfsvfs, FTAG);
        return (SET_ERROR(EINVAL));
    }

    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
    SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
        &zp->z_size, 8);

    /*
     * Start a transaction.
     */
    tx = dmu_tx_create(zfsvfs->z_os);

    dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
    db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
    DB_DNODE_ENTER(db);
    dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
    DB_DNODE_EXIT(db);
    zfs_sa_upgrade_txholds(tx, zp);
    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error != 0) {
        dmu_tx_abort(tx);
        zfs_exit(zfsvfs, FTAG);
        return (error);
    }

    if (zp->z_blksz < blksz)
        zfs_grow_blocksize(zp, blksz, tx);

    dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);

    zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

    if (zp->z_size < off + len)
        zp->z_size = off + len;

    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

    /*
     * zil_replaying() not only checks if we are replaying ZIL, but also
     * updates the ZIL header to record replay progress.
     */
    VERIFY(zil_replaying(zfsvfs->z_log, tx));

    dmu_tx_commit(tx);

    zfs_znode_update_vfs(zp);

    zfs_exit(zfsvfs, FTAG);

    return (error);
}
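
/*
 * These entry points are consumed by the platform-specific (Linux and
 * FreeBSD) vnode/inode operation implementations built on top of this
 * common code.
 */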
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_clone_range);
EXPORT_SYMBOL(zfs_clone_range_replay);

ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
    "Bytes to read per chunk");