/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
#include <sys/policy.h>
#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
/*
 * Enables access to the block cloning feature. If this setting is 0, then
 * even if feature@block_cloning is enabled, functions and system calls that
 * attempt to clone blocks will act as though the feature is disabled.
 */
int zfs_bclone_enabled = 1;
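/*
 * Illustrative sketch (not part of this file): with zfs_bclone_enabled set
 * to 0, a caller such as copy_file_range(2) sees the clone attempt fail as
 * unsupported and the VFS typically falls back to an ordinary data copy.
 * Hypothetical userspace usage, assuming Linux/glibc 2.27 or later:
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *
 *	// Clone (or copy) "len" bytes from fd_in to fd_out at the current
 *	// file offsets of both descriptors; returns bytes copied or -1.
 *	static ssize_t
 *	clone_or_copy(int fd_in, int fd_out, size_t len)
 *	{
 *		return (copy_file_range(fd_in, NULL, fd_out, NULL, len, 0));
 *	}
 */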
/*
 * When set, zfs_clone_range() waits for dirty data to be written to disk.
 * This allows the clone operation to reliably succeed when a file is modified
 * and then immediately cloned. For small files this may be slower than making
 * a copy of the file and is therefore not the default. However, in certain
 * scenarios this behavior may be desirable so a tunable is provided.
 */
static int zfs_bclone_wait_dirty = 0;
/*
 * Enable Direct I/O. If this setting is 0, then all I/O requests will be
 * directed through the ARC acting as though the dataset property direct was
 * set to disabled.
 *
 * Disabled by default on FreeBSD until a potential range locking issue in
 * zfs_getpages() can be resolved.
 */
#ifdef __FreeBSD__
static int zfs_dio_enabled = 0;
#else
static int zfs_dio_enabled = 1;
#endif
/*
 * Maximum bytes to read per chunk in zfs_read().
 */
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
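/*
 * Worked example (illustrative): with the default 1 MiB chunk size, a
 * 10 MiB buffered read is serviced by the zfs_read() loop as ten
 * consecutive dmu_read_uio_dbuf() calls, each trimmed with P2PHASE() so
 * that no chunk crosses a chunk-size boundary.
 */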
97 zfs_fsync(znode_t
*zp
, int syncflag
, cred_t
*cr
)
100 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
102 if (zfsvfs
->z_os
->os_sync
!= ZFS_SYNC_DISABLED
) {
103 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
105 atomic_inc_32(&zp
->z_sync_writes_cnt
);
106 zil_commit(zfsvfs
->z_log
, zp
->z_id
);
107 atomic_dec_32(&zp
->z_sync_writes_cnt
);
108 zfs_exit(zfsvfs
, FTAG
);
114 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
116 * Lseek support for finding holes (cmd == SEEK_HOLE) and
117 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
120 zfs_holey_common(znode_t
*zp
, ulong_t cmd
, loff_t
*off
)
122 zfs_locked_range_t
*lr
;
123 uint64_t noff
= (uint64_t)*off
; /* new offset */
128 file_sz
= zp
->z_size
;
129 if (noff
>= file_sz
) {
130 return (SET_ERROR(ENXIO
));
133 if (cmd
== F_SEEK_HOLE
)
138 /* Flush any mmap()'d data to disk */
139 if (zn_has_cached_data(zp
, 0, file_sz
- 1))
140 zn_flush_cached_data(zp
, B_TRUE
);
142 lr
= zfs_rangelock_enter(&zp
->z_rangelock
, 0, UINT64_MAX
, RL_READER
);
143 error
= dmu_offset_next(ZTOZSB(zp
)->z_os
, zp
->z_id
, hole
, &noff
);
144 zfs_rangelock_exit(lr
);
147 return (SET_ERROR(ENXIO
));
149 /* File was dirty, so fall back to using generic logic */
150 if (error
== EBUSY
) {
158 * We could find a hole that begins after the logical end-of-file,
159 * because dmu_offset_next() only works on whole blocks. If the
160 * EOF falls mid-block, then indicate that the "virtual hole"
161 * at the end of the file begins at the logical EOF, rather than
162 * at the end of the last block.
164 if (noff
> file_sz
) {
176 zfs_holey(znode_t
*zp
, ulong_t cmd
, loff_t
*off
)
178 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
181 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
184 error
= zfs_holey_common(zp
, cmd
, off
);
186 zfs_exit(zfsvfs
, FTAG
);
189 #endif /* SEEK_HOLE && SEEK_DATA */
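/*
 * Illustrative userspace sketch (not part of this file) of the SEEK_HOLE /
 * SEEK_DATA semantics implemented above: lseek(2) returns the offset of the
 * next hole (or data region) at or after "start", and fails with ENXIO when
 * "start" is at or beyond end-of-file.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/types.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	// Print the offset of the first hole at or after "start".
 *	static void
 *	print_first_hole(int fd, off_t start)
 *	{
 *		off_t hole = lseek(fd, start, SEEK_HOLE);
 *
 *		if (hole == -1)
 *			perror("lseek(SEEK_HOLE)");	// e.g. ENXIO past EOF
 *		else
 *			printf("first hole at %lld\n", (long long)hole);
 *	}
 */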
192 zfs_access(znode_t
*zp
, int mode
, int flag
, cred_t
*cr
)
194 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
197 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
200 if (flag
& V_ACE_MASK
)
201 #if defined(__linux__)
202 error
= zfs_zaccess(zp
, mode
, flag
, B_FALSE
, cr
,
205 error
= zfs_zaccess(zp
, mode
, flag
, B_FALSE
, cr
,
209 #if defined(__linux__)
210 error
= zfs_zaccess_rwx(zp
, mode
, flag
, cr
, zfs_init_idmap
);
212 error
= zfs_zaccess_rwx(zp
, mode
, flag
, cr
, NULL
);
215 zfs_exit(zfsvfs
, FTAG
);
/*
 * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
 * the "direct" dataset property). When Direct I/O is only inherited from the
 * property, apply the O_DIRECT flag solely to correctly aligned I/O requests.
 * The rationale for this is that it allows the property to be safely set on a
 * dataset without forcing all of the applications to be aware of the
 * alignment restrictions. When O_DIRECT is explicitly requested by an
 * application, return EINVAL if the request is unaligned. In all cases, if
 * the range for this request has been mmap'ed then we will perform buffered
 * I/O to keep the mapped region synchronized with the ARC.
 *
 * It is possible that a file's pages could be mmap'ed after it is checked
 * here. If so, that is handled accordingly in zfs_write(). See comments in
 * the following area for how this is handled:
 * zfs_write() -> update_pages()
 */
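/*
 * Illustrative userspace sketch (not part of this file) of the alignment
 * rule above: an explicit O_DIRECT request whose buffer, offset, or length
 * is not PAGE_SIZE aligned is rejected with EINVAL, so callers normally use
 * page-aligned buffers and page-multiple lengths. Hypothetical example,
 * assuming a 4 KiB page size and "len" being a multiple of 4096:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	static ssize_t
 *	aligned_direct_read(const char *path, size_t len)
 *	{
 *		void *buf = NULL;
 *		ssize_t n = -1;
 *		int fd = open(path, O_RDONLY | O_DIRECT);
 *
 *		// 4096-byte alignment satisfies the PAGE_SIZE checks made
 *		// by zfs_setup_direct() on common configurations.
 *		if (fd != -1 && posix_memalign(&buf, 4096, len) == 0) {
 *			n = read(fd, buf, len);
 *			free(buf);
 *		}
 *		if (fd != -1)
 *			(void) close(fd);
 *		return (n);
 *	}
 */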
236 zfs_setup_direct(struct znode
*zp
, zfs_uio_t
*uio
, zfs_uio_rw_t rw
,
239 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
240 objset_t
*os
= zfsvfs
->z_os
;
241 int ioflag
= *ioflagp
;
244 if (!zfs_dio_enabled
|| os
->os_direct
== ZFS_DIRECT_DISABLED
||
245 zn_has_cached_data(zp
, zfs_uio_offset(uio
),
246 zfs_uio_offset(uio
) + zfs_uio_resid(uio
) - 1)) {
 * Direct I/O is disabled or the region is mmap'ed. In either
 * case the I/O request will just be directed through the ARC.
253 } else if (os
->os_direct
== ZFS_DIRECT_ALWAYS
&&
254 zfs_uio_page_aligned(uio
) &&
255 zfs_uio_aligned(uio
, PAGE_SIZE
)) {
256 if ((rw
== UIO_WRITE
&& zfs_uio_resid(uio
) >= zp
->z_blksz
) ||
260 } else if (os
->os_direct
== ZFS_DIRECT_ALWAYS
&& (ioflag
& O_DIRECT
)) {
 * Direct I/O was requested through direct=always, but the request
 * is not properly PAGE_SIZE aligned. It will be
 * directed through the ARC.
269 if (ioflag
& O_DIRECT
) {
270 if (!zfs_uio_page_aligned(uio
) ||
271 !zfs_uio_aligned(uio
, PAGE_SIZE
)) {
272 error
= SET_ERROR(EINVAL
);
276 error
= zfs_uio_get_dio_pages_alloc(uio
, rw
);
282 IMPLY(ioflag
& O_DIRECT
, uio
->uio_extflg
& UIO_DIRECT
);
291 * Read bytes from specified file into supplied buffer.
293 * IN: zp - inode of file to be read from.
294 * uio - structure supplying read location, range info,
296 * ioflag - O_SYNC flags; used to provide FRSYNC semantics.
297 * O_DIRECT flag; used to bypass page cache.
298 * cr - credentials of caller.
300 * OUT: uio - updated offset and range, buffer filled.
302 * RETURN: 0 on success, error code on failure.
305 * inode - atime updated if byte count > 0
308 zfs_read(struct znode
*zp
, zfs_uio_t
*uio
, int ioflag
, cred_t
*cr
)
312 boolean_t frsync
= B_FALSE
;
313 boolean_t dio_checksum_failure
= B_FALSE
;
315 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
316 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
319 if (zp
->z_pflags
& ZFS_AV_QUARANTINED
) {
320 zfs_exit(zfsvfs
, FTAG
);
321 return (SET_ERROR(EACCES
));
324 /* We don't copy out anything useful for directories. */
325 if (Z_ISDIR(ZTOTYPE(zp
))) {
326 zfs_exit(zfsvfs
, FTAG
);
327 return (SET_ERROR(EISDIR
));
331 * Validate file offset
333 if (zfs_uio_offset(uio
) < (offset_t
)0) {
334 zfs_exit(zfsvfs
, FTAG
);
335 return (SET_ERROR(EINVAL
));
339 * Fasttrack empty reads
341 if (zfs_uio_resid(uio
) == 0) {
342 zfs_exit(zfsvfs
, FTAG
);
348 * If we're in FRSYNC mode, sync out this znode before reading it.
349 * Only do this for non-snapshots.
351 * Some platforms do not support FRSYNC and instead map it
352 * to O_SYNC, which results in unnecessary calls to zil_commit. We
353 * only honor FRSYNC requests on platforms which support it.
355 frsync
= !!(ioflag
& FRSYNC
);
358 (frsync
|| zfsvfs
->z_os
->os_sync
== ZFS_SYNC_ALWAYS
))
359 zil_commit(zfsvfs
->z_log
, zp
->z_id
);
362 * Lock the range against changes.
364 zfs_locked_range_t
*lr
= zfs_rangelock_enter(&zp
->z_rangelock
,
365 zfs_uio_offset(uio
), zfs_uio_resid(uio
), RL_READER
);
368 * If we are reading past end-of-file we can skip
369 * to the end; but we might still need to set atime.
371 if (zfs_uio_offset(uio
) >= zp
->z_size
) {
375 ASSERT(zfs_uio_offset(uio
) < zp
->z_size
);
378 * Setting up Direct I/O if requested.
380 error
= zfs_setup_direct(zp
, uio
, UIO_READ
, &ioflag
);
385 #if defined(__linux__)
386 ssize_t start_offset
= zfs_uio_offset(uio
);
388 ssize_t chunk_size
= zfs_vnops_read_chunk_size
;
389 ssize_t n
= MIN(zfs_uio_resid(uio
), zp
->z_size
- zfs_uio_offset(uio
));
390 ssize_t start_resid
= n
;
391 ssize_t dio_remaining_resid
= 0;
393 if (uio
->uio_extflg
& UIO_DIRECT
) {
 * All pages for an O_DIRECT request have already been mapped
 * so there's no compelling reason to handle this uio in
399 chunk_size
= DMU_MAX_ACCESS
;
402 * In the event that the O_DIRECT request is reading the entire
 * file, it is possible the file's length is not page-size
 * aligned. However, lower layers expect that the Direct I/O
405 * request is page-aligned. In this case, as much of the file
406 * that can be read using Direct I/O happens and the remaining
407 * amount will be read through the ARC.
409 * This is still consistent with the semantics of Direct I/O in
410 * ZFS as at a minimum the I/O request must be page-aligned.
412 dio_remaining_resid
= n
- P2ALIGN_TYPED(n
, PAGE_SIZE
, ssize_t
);
413 if (dio_remaining_resid
!= 0)
414 n
-= dio_remaining_resid
;
418 ssize_t nbytes
= MIN(n
, chunk_size
-
419 P2PHASE(zfs_uio_offset(uio
), chunk_size
));
421 if (zfs_uio_segflg(uio
) == UIO_NOCOPY
)
422 error
= mappedread_sf(zp
, nbytes
, uio
);
425 if (zn_has_cached_data(zp
, zfs_uio_offset(uio
),
426 zfs_uio_offset(uio
) + nbytes
- 1)) {
427 error
= mappedread(zp
, nbytes
, uio
);
429 error
= dmu_read_uio_dbuf(sa_get_db(zp
->z_sa_hdl
),
434 /* convert checksum errors into IO errors */
435 if (error
== ECKSUM
) {
437 * If a Direct I/O read returned a checksum
438 * verify error, then it must be treated as
439 * suspicious. The contents of the buffer could
 * have been manipulated while the I/O was in
 * flight. In this case, the remainder of the I/O
 * request will just be reissued through the
445 if (uio
->uio_extflg
& UIO_DIRECT
) {
446 dio_checksum_failure
= B_TRUE
;
447 uio
->uio_extflg
&= ~UIO_DIRECT
;
448 n
+= dio_remaining_resid
;
449 dio_remaining_resid
= 0;
452 error
= SET_ERROR(EIO
);
456 #if defined(__linux__)
458 * if we actually read some bytes, bubbling EFAULT
459 * up to become EAGAIN isn't what we want here...
461 * ...on Linux, at least. On FBSD, doing this breaks.
463 if (error
== EFAULT
&&
464 (zfs_uio_offset(uio
) - start_offset
) != 0)
473 if (error
== 0 && (uio
->uio_extflg
& UIO_DIRECT
) &&
474 dio_remaining_resid
!= 0) {
476 * Temporarily remove the UIO_DIRECT flag from the UIO so the
477 * remainder of the file can be read using the ARC.
479 uio
->uio_extflg
&= ~UIO_DIRECT
;
481 if (zn_has_cached_data(zp
, zfs_uio_offset(uio
),
482 zfs_uio_offset(uio
) + dio_remaining_resid
- 1)) {
483 error
= mappedread(zp
, dio_remaining_resid
, uio
);
485 error
= dmu_read_uio_dbuf(sa_get_db(zp
->z_sa_hdl
), uio
,
486 dio_remaining_resid
);
488 uio
->uio_extflg
|= UIO_DIRECT
;
491 n
+= dio_remaining_resid
;
492 } else if (error
&& (uio
->uio_extflg
& UIO_DIRECT
)) {
493 n
+= dio_remaining_resid
;
495 int64_t nread
= start_resid
- n
;
497 dataset_kstats_update_read_kstats(&zfsvfs
->z_kstat
, nread
);
499 zfs_rangelock_exit(lr
);
501 if (dio_checksum_failure
== B_TRUE
)
502 uio
->uio_extflg
|= UIO_DIRECT
;
505 * Cleanup for Direct I/O if requested.
507 if (uio
->uio_extflg
& UIO_DIRECT
)
508 zfs_uio_free_dio_pages(uio
, UIO_READ
);
510 ZFS_ACCESSTIME_STAMP(zfsvfs
, zp
);
511 zfs_exit(zfsvfs
, FTAG
);
516 zfs_clear_setid_bits_if_necessary(zfsvfs_t
*zfsvfs
, znode_t
*zp
, cred_t
*cr
,
517 uint64_t *clear_setid_bits_txgp
, dmu_tx_t
*tx
)
519 zilog_t
*zilog
= zfsvfs
->z_log
;
520 const uint64_t uid
= KUID_TO_SUID(ZTOUID(zp
));
522 ASSERT(clear_setid_bits_txgp
!= NULL
);
526 * Clear Set-UID/Set-GID bits on successful write if not
527 * privileged and at least one of the execute bits is set.
529 * It would be nice to do this after all writes have
530 * been done, but that would still expose the ISUID/ISGID
531 * to another app after the partial write is committed.
533 * Note: we don't call zfs_fuid_map_id() here because
534 * user 0 is not an ephemeral uid.
536 mutex_enter(&zp
->z_acl_lock
);
537 if ((zp
->z_mode
& (S_IXUSR
| (S_IXUSR
>> 3) | (S_IXUSR
>> 6))) != 0 &&
538 (zp
->z_mode
& (S_ISUID
| S_ISGID
)) != 0 &&
539 secpolicy_vnode_setid_retain(zp
, cr
,
540 ((zp
->z_mode
& S_ISUID
) != 0 && uid
== 0)) != 0) {
543 zp
->z_mode
&= ~(S_ISUID
| S_ISGID
);
544 newmode
= zp
->z_mode
;
545 (void) sa_update(zp
->z_sa_hdl
, SA_ZPL_MODE(zfsvfs
),
546 (void *)&newmode
, sizeof (uint64_t), tx
);
548 mutex_exit(&zp
->z_acl_lock
);
551 * Make sure SUID/SGID bits will be removed when we replay the
 * log. If the setid bits keep coming back, don't log more
553 * than one TX_SETATTR per transaction group.
555 if (*clear_setid_bits_txgp
!= dmu_tx_get_txg(tx
)) {
558 va
.va_mask
= ATTR_MODE
;
559 va
.va_nodeid
= zp
->z_id
;
560 va
.va_mode
= newmode
;
561 zfs_log_setattr(zilog
, tx
, TX_SETATTR
, zp
, &va
,
563 *clear_setid_bits_txgp
= dmu_tx_get_txg(tx
);
566 mutex_exit(&zp
->z_acl_lock
);
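/*
 * Illustrative sketch (not part of this file): the effect of the function
 * above is visible from userspace, where an executable written to by an
 * unprivileged user loses its setuid/setgid bits. Hypothetical check:
 *
 *	#include <sys/stat.h>
 *
 *	// Return 1 if fd still carries S_ISUID or S_ISGID, 0 if the bits
 *	// were cleared (e.g. by an unprivileged write), -1 on error.
 *	static int
 *	setid_bits_present(int fd)
 *	{
 *		struct stat st;
 *
 *		if (fstat(fd, &st) != 0)
 *			return (-1);
 *		return ((st.st_mode & (S_ISUID | S_ISGID)) != 0);
 *	}
 */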
571 * Write the bytes to a file.
573 * IN: zp - znode of file to be written to.
574 * uio - structure supplying write location, range info,
576 * ioflag - O_APPEND flag set if in append mode.
577 * O_DIRECT flag; used to bypass page cache.
578 * cr - credentials of caller.
580 * OUT: uio - updated offset and range.
582 * RETURN: 0 if success
583 * error code if failure
586 * ip - ctime|mtime updated if byte count > 0
589 zfs_write(znode_t
*zp
, zfs_uio_t
*uio
, int ioflag
, cred_t
*cr
)
591 int error
= 0, error1
;
592 ssize_t start_resid
= zfs_uio_resid(uio
);
593 uint64_t clear_setid_bits_txg
= 0;
594 boolean_t o_direct_defer
= B_FALSE
;
597 * Fasttrack empty write
599 ssize_t n
= start_resid
;
603 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
604 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
607 sa_bulk_attr_t bulk
[4];
609 uint64_t mtime
[2], ctime
[2];
610 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_MTIME(zfsvfs
), NULL
, &mtime
, 16);
611 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_CTIME(zfsvfs
), NULL
, &ctime
, 16);
612 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_SIZE(zfsvfs
), NULL
,
614 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_FLAGS(zfsvfs
), NULL
,
618 * Callers might not be able to detect properly that we are read-only,
619 * so check it explicitly here.
621 if (zfs_is_readonly(zfsvfs
)) {
622 zfs_exit(zfsvfs
, FTAG
);
623 return (SET_ERROR(EROFS
));
627 * If immutable or not appending then return EPERM.
628 * Intentionally allow ZFS_READONLY through here.
629 * See zfs_zaccess_common()
631 if ((zp
->z_pflags
& ZFS_IMMUTABLE
) ||
632 ((zp
->z_pflags
& ZFS_APPENDONLY
) && !(ioflag
& O_APPEND
) &&
633 (zfs_uio_offset(uio
) < zp
->z_size
))) {
634 zfs_exit(zfsvfs
, FTAG
);
635 return (SET_ERROR(EPERM
));
639 * Validate file offset
641 offset_t woff
= ioflag
& O_APPEND
? zp
->z_size
: zfs_uio_offset(uio
);
643 zfs_exit(zfsvfs
, FTAG
);
644 return (SET_ERROR(EINVAL
));
648 * Setting up Direct I/O if requested.
650 error
= zfs_setup_direct(zp
, uio
, UIO_WRITE
, &ioflag
);
652 zfs_exit(zfsvfs
, FTAG
);
653 return (SET_ERROR(error
));
 * Pre-fault the pages to ensure slow (e.g. NFS) pages
660 ssize_t pfbytes
= MIN(n
, DMU_MAX_ACCESS
>> 1);
661 if (zfs_uio_prefaultpages(pfbytes
, uio
)) {
662 zfs_exit(zfsvfs
, FTAG
);
663 return (SET_ERROR(EFAULT
));
667 * If in append mode, set the io offset pointer to eof.
669 zfs_locked_range_t
*lr
;
670 if (ioflag
& O_APPEND
) {
672 * Obtain an appending range lock to guarantee file append
673 * semantics. We reset the write offset once we have the lock.
675 lr
= zfs_rangelock_enter(&zp
->z_rangelock
, 0, n
, RL_APPEND
);
676 woff
= lr
->lr_offset
;
677 if (lr
->lr_length
== UINT64_MAX
) {
679 * We overlocked the file because this write will cause
680 * the file block size to increase.
681 * Note that zp_size cannot change with this lock held.
685 zfs_uio_setoffset(uio
, woff
);
687 * We need to update the starting offset as well because it is
688 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
691 zfs_uio_setsoffset(uio
, woff
);
694 * Note that if the file block size will change as a result of
695 * this write, then this range lock will lock the entire file
696 * so that we can re-write the block safely.
698 lr
= zfs_rangelock_enter(&zp
->z_rangelock
, woff
, n
, RL_WRITER
);
701 if (zn_rlimit_fsize_uio(zp
, uio
)) {
702 zfs_rangelock_exit(lr
);
703 zfs_exit(zfsvfs
, FTAG
);
704 return (SET_ERROR(EFBIG
));
707 const rlim64_t limit
= MAXOFFSET_T
;
710 zfs_rangelock_exit(lr
);
711 zfs_exit(zfsvfs
, FTAG
);
712 return (SET_ERROR(EFBIG
));
715 if (n
> limit
- woff
)
718 uint64_t end_size
= MAX(zp
->z_size
, woff
+ n
);
719 zilog_t
*zilog
= zfsvfs
->z_log
;
720 boolean_t commit
= (ioflag
& (O_SYNC
| O_DSYNC
)) ||
721 (zfsvfs
->z_os
->os_sync
== ZFS_SYNC_ALWAYS
);
723 const uint64_t uid
= KUID_TO_SUID(ZTOUID(zp
));
724 const uint64_t gid
= KGID_TO_SGID(ZTOGID(zp
));
725 const uint64_t projid
= zp
->z_projid
;
728 * In the event we are increasing the file block size
729 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
730 * Because zfs_grow_blocksize() will read from the ARC in order to
731 * grow the dbuf, we avoid doing Direct I/O here as that would cause
732 * data written to disk to be overwritten by data in the ARC during
733 * the sync phase. Besides writing data twice to disk, we also
 * want to avoid consistency concerns between data in the ARC and
735 * on disk while growing the file's blocksize.
737 * We will only temporarily remove Direct I/O and put it back after
738 * we have grown the blocksize. We do this in the event a request
739 * is larger than max_blksz, so further requests to
740 * dmu_write_uio_dbuf() will still issue the requests using Direct
 * The first block of the file is being written as a 4k request with
 * a recordsize of 1K. The first 1K issued in the loop below will go
746 * through the ARC; however, the following 3 1K requests will
749 if (uio
->uio_extflg
& UIO_DIRECT
&& lr
->lr_length
== UINT64_MAX
) {
750 uio
->uio_extflg
&= ~UIO_DIRECT
;
751 o_direct_defer
= B_TRUE
;
755 * Write the file in reasonable size chunks. Each chunk is written
756 * in a separate transaction; this keeps the intent log records small
757 * and allows us to do more fine-grained space accounting.
760 woff
= zfs_uio_offset(uio
);
762 if (zfs_id_overblockquota(zfsvfs
, DMU_USERUSED_OBJECT
, uid
) ||
763 zfs_id_overblockquota(zfsvfs
, DMU_GROUPUSED_OBJECT
, gid
) ||
764 (projid
!= ZFS_DEFAULT_PROJID
&&
765 zfs_id_overblockquota(zfsvfs
, DMU_PROJECTUSED_OBJECT
,
767 error
= SET_ERROR(EDQUOT
);
772 if (lr
->lr_length
== UINT64_MAX
&& zp
->z_size
<= zp
->z_blksz
) {
773 if (zp
->z_blksz
> zfsvfs
->z_max_blksz
&&
774 !ISP2(zp
->z_blksz
)) {
776 * File's blocksize is already larger than the
777 * "recordsize" property. Only let it grow to
778 * the next power of 2.
780 blksz
= 1 << highbit64(zp
->z_blksz
);
782 blksz
= zfsvfs
->z_max_blksz
;
784 blksz
= MIN(blksz
, P2ROUNDUP(end_size
,
786 blksz
= MAX(blksz
, zp
->z_blksz
);
791 arc_buf_t
*abuf
= NULL
;
793 if (n
>= blksz
&& woff
>= zp
->z_size
&&
794 P2PHASE(woff
, blksz
) == 0 &&
795 !(uio
->uio_extflg
& UIO_DIRECT
) &&
796 (blksz
>= SPA_OLD_MAXBLOCKSIZE
|| n
< 4 * blksz
)) {
798 * This write covers a full block. "Borrow" a buffer
799 * from the dmu so that we can fill it before we enter
800 * a transaction. This avoids the possibility of
801 * holding up the transaction if the data copy hangs
802 * up on a pagefault (e.g., from an NFS server mapping).
804 abuf
= dmu_request_arcbuf(sa_get_db(zp
->z_sa_hdl
),
806 ASSERT(abuf
!= NULL
);
807 ASSERT(arc_buf_size(abuf
) == blksz
);
808 if ((error
= zfs_uiocopy(abuf
->b_data
, blksz
,
809 UIO_WRITE
, uio
, &nbytes
))) {
810 dmu_return_arcbuf(abuf
);
813 ASSERT3S(nbytes
, ==, blksz
);
815 nbytes
= MIN(n
, (DMU_MAX_ACCESS
>> 1) -
816 P2PHASE(woff
, blksz
));
817 if (pfbytes
< nbytes
) {
818 if (zfs_uio_prefaultpages(nbytes
, uio
)) {
819 error
= SET_ERROR(EFAULT
);
827 * Start a transaction.
829 dmu_tx_t
*tx
= dmu_tx_create(zfsvfs
->z_os
);
830 dmu_tx_hold_sa(tx
, zp
->z_sa_hdl
, B_FALSE
);
831 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)sa_get_db(zp
->z_sa_hdl
);
833 dmu_tx_hold_write_by_dnode(tx
, DB_DNODE(db
), woff
, nbytes
);
835 zfs_sa_upgrade_txholds(tx
, zp
);
836 error
= dmu_tx_assign(tx
, TXG_WAIT
);
840 dmu_return_arcbuf(abuf
);
845 * NB: We must call zfs_clear_setid_bits_if_necessary before
846 * committing the transaction!
850 * If rangelock_enter() over-locked we grow the blocksize
851 * and then reduce the lock range. This will only happen
852 * on the first iteration since rangelock_reduce() will
853 * shrink down lr_length to the appropriate size.
855 if (lr
->lr_length
== UINT64_MAX
) {
856 zfs_grow_blocksize(zp
, blksz
, tx
);
857 zfs_rangelock_reduce(lr
, woff
, n
);
862 tx_bytes
= zfs_uio_resid(uio
);
863 zfs_uio_fault_disable(uio
, B_TRUE
);
864 error
= dmu_write_uio_dbuf(sa_get_db(zp
->z_sa_hdl
),
866 zfs_uio_fault_disable(uio
, B_FALSE
);
868 if (error
== EFAULT
) {
869 zfs_clear_setid_bits_if_necessary(zfsvfs
, zp
,
870 cr
, &clear_setid_bits_txg
, tx
);
873 * Account for partial writes before
874 * continuing the loop.
875 * Update needs to occur before the next
876 * zfs_uio_prefaultpages, or prefaultpages may
877 * error, and we may break the loop early.
879 n
-= tx_bytes
- zfs_uio_resid(uio
);
880 pfbytes
-= tx_bytes
- zfs_uio_resid(uio
);
885 * On FreeBSD, EFAULT should be propagated back to the
886 * VFS, which will handle faulting and will retry.
888 if (error
!= 0 && error
!= EFAULT
) {
889 zfs_clear_setid_bits_if_necessary(zfsvfs
, zp
,
890 cr
, &clear_setid_bits_txg
, tx
);
894 tx_bytes
-= zfs_uio_resid(uio
);
897 * Thus, we're writing a full block at a block-aligned
898 * offset and extending the file past EOF.
900 * dmu_assign_arcbuf_by_dbuf() will directly assign the
901 * arc buffer to a dbuf.
903 error
= dmu_assign_arcbuf_by_dbuf(
904 sa_get_db(zp
->z_sa_hdl
), woff
, abuf
, tx
);
907 * XXX This might not be necessary if
908 * dmu_assign_arcbuf_by_dbuf is guaranteed
911 zfs_clear_setid_bits_if_necessary(zfsvfs
, zp
,
912 cr
, &clear_setid_bits_txg
, tx
);
913 dmu_return_arcbuf(abuf
);
917 ASSERT3S(nbytes
, <=, zfs_uio_resid(uio
));
918 zfs_uioskip(uio
, nbytes
);
922 * There is a window where a file's pages can be mmap'ed after
923 * zfs_setup_direct() is called. This is due to the fact that
924 * the rangelock in this function is acquired after calling
925 * zfs_setup_direct(). This is done so that
926 * zfs_uio_prefaultpages() does not attempt to fault in pages
927 * on Linux for Direct I/O requests. This is not necessary as
928 * the pages are pinned in memory and can not be faulted out.
929 * Ideally, the rangelock would be held before calling
930 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
931 * this can lead to a deadlock as zfs_getpage() also acquires
932 * the rangelock as a RL_WRITER and prefaulting the pages can
933 * lead to zfs_getpage() being called.
935 * In the case of the pages being mapped after
936 * zfs_setup_direct() is called, the call to update_pages()
937 * will still be made to make sure there is consistency between
 * the ARC and the Linux page cache. This is an unfortunate
 * situation as the data will be read back into the ARC after
 * the Direct I/O write has completed, but this is the penalty
941 * for writing to a mmap'ed region of a file using Direct I/O.
944 zn_has_cached_data(zp
, woff
, woff
+ tx_bytes
- 1)) {
945 update_pages(zp
, woff
, tx_bytes
, zfsvfs
->z_os
);
949 * If we made no progress, we're done. If we made even
950 * partial progress, update the znode and ZIL accordingly.
953 (void) sa_update(zp
->z_sa_hdl
, SA_ZPL_SIZE(zfsvfs
),
954 (void *)&zp
->z_size
, sizeof (uint64_t), tx
);
960 zfs_clear_setid_bits_if_necessary(zfsvfs
, zp
, cr
,
961 &clear_setid_bits_txg
, tx
);
963 zfs_tstamp_update_setup(zp
, CONTENT_MODIFIED
, mtime
, ctime
);
966 * Update the file size (zp_size) if it has changed;
967 * account for possible concurrent updates.
969 while ((end_size
= zp
->z_size
) < zfs_uio_offset(uio
)) {
970 (void) atomic_cas_64(&zp
->z_size
, end_size
,
971 zfs_uio_offset(uio
));
972 ASSERT(error
== 0 || error
== EFAULT
);
975 * If we are replaying and eof is non zero then force
976 * the file size to the specified eof. Note, there's no
977 * concurrency during replay.
979 if (zfsvfs
->z_replay
&& zfsvfs
->z_replay_eof
!= 0)
980 zp
->z_size
= zfsvfs
->z_replay_eof
;
982 error1
= sa_bulk_update(zp
->z_sa_hdl
, bulk
, count
, tx
);
984 /* Avoid clobbering EFAULT. */
988 * NB: During replay, the TX_SETATTR record logged by
989 * zfs_clear_setid_bits_if_necessary must precede any of
990 * the TX_WRITE records logged here.
992 zfs_log_write(zilog
, tx
, TX_WRITE
, zp
, woff
, tx_bytes
, commit
,
993 uio
->uio_extflg
& UIO_DIRECT
? B_TRUE
: B_FALSE
, NULL
,
999 * Direct I/O was deferred in order to grow the first block.
1000 * At this point it can be re-enabled for subsequent writes.
1002 if (o_direct_defer
) {
1003 ASSERT(ioflag
& O_DIRECT
);
1004 uio
->uio_extflg
|= UIO_DIRECT
;
1005 o_direct_defer
= B_FALSE
;
1010 ASSERT3S(tx_bytes
, ==, nbytes
);
1015 if (o_direct_defer
) {
1016 ASSERT(ioflag
& O_DIRECT
);
1017 uio
->uio_extflg
|= UIO_DIRECT
;
1018 o_direct_defer
= B_FALSE
;
1021 zfs_znode_update_vfs(zp
);
1022 zfs_rangelock_exit(lr
);
1025 * Cleanup for Direct I/O if requested.
1027 if (uio
->uio_extflg
& UIO_DIRECT
)
1028 zfs_uio_free_dio_pages(uio
, UIO_WRITE
);
1031 * If we're in replay mode, or we made no progress, or the
 * uio data is inaccessible, return an error. Otherwise, it's
1033 * at least a partial write, so it's successful.
1035 if (zfsvfs
->z_replay
|| zfs_uio_resid(uio
) == start_resid
||
1037 zfs_exit(zfsvfs
, FTAG
);
1042 zil_commit(zilog
, zp
->z_id
);
1044 int64_t nwritten
= start_resid
- zfs_uio_resid(uio
);
1045 dataset_kstats_update_write_kstats(&zfsvfs
->z_kstat
, nwritten
);
1047 zfs_exit(zfsvfs
, FTAG
);
1052 zfs_getsecattr(znode_t
*zp
, vsecattr_t
*vsecp
, int flag
, cred_t
*cr
)
1054 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
1056 boolean_t skipaclchk
= (flag
& ATTR_NOACLCHECK
) ? B_TRUE
: B_FALSE
;
1058 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
1060 error
= zfs_getacl(zp
, vsecp
, skipaclchk
, cr
);
1061 zfs_exit(zfsvfs
, FTAG
);
1067 zfs_setsecattr(znode_t
*zp
, vsecattr_t
*vsecp
, int flag
, cred_t
*cr
)
1069 zfsvfs_t
*zfsvfs
= ZTOZSB(zp
);
1071 boolean_t skipaclchk
= (flag
& ATTR_NOACLCHECK
) ? B_TRUE
: B_FALSE
;
1074 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
1076 zilog
= zfsvfs
->z_log
;
1077 error
= zfs_setacl(zp
, vsecp
, skipaclchk
, cr
);
1079 if (zfsvfs
->z_os
->os_sync
== ZFS_SYNC_ALWAYS
)
1080 zil_commit(zilog
, 0);
1082 zfs_exit(zfsvfs
, FTAG
);
1087 static int zil_fault_io
= 0;
1090 static void zfs_get_done(zgd_t
*zgd
, int error
);
1093 * Get data to generate a TX_WRITE intent log record.
1096 zfs_get_data(void *arg
, uint64_t gen
, lr_write_t
*lr
, char *buf
,
1097 struct lwb
*lwb
, zio_t
*zio
)
1099 zfsvfs_t
*zfsvfs
= arg
;
1100 objset_t
*os
= zfsvfs
->z_os
;
1102 uint64_t object
= lr
->lr_foid
;
1103 uint64_t offset
= lr
->lr_offset
;
1104 uint64_t size
= lr
->lr_length
;
1109 ASSERT3P(lwb
, !=, NULL
);
1110 ASSERT3U(size
, !=, 0);
1113 * Nothing to do if the file has been removed
1115 if (zfs_zget(zfsvfs
, object
, &zp
) != 0)
1116 return (SET_ERROR(ENOENT
));
1117 if (zp
->z_unlinked
) {
1119 * Release the vnode asynchronously as we currently have the
1120 * txg stopped from syncing.
1122 zfs_zrele_async(zp
);
1123 return (SET_ERROR(ENOENT
));
1125 /* check if generation number matches */
1126 if (sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(zfsvfs
), &zp_gen
,
1127 sizeof (zp_gen
)) != 0) {
1128 zfs_zrele_async(zp
);
1129 return (SET_ERROR(EIO
));
1131 if (zp_gen
!= gen
) {
1132 zfs_zrele_async(zp
);
1133 return (SET_ERROR(ENOENT
));
1136 zgd
= kmem_zalloc(sizeof (zgd_t
), KM_SLEEP
);
1138 zgd
->zgd_private
= zp
;
1141 * Write records come in two flavors: immediate and indirect.
1142 * For small writes it's cheaper to store the data with the
1143 * log record (immediate); for large writes it's cheaper to
1144 * sync the data and get a pointer to it (indirect) so that
1145 * we don't have to write the data twice.
1147 if (buf
!= NULL
) { /* immediate write */
1148 zgd
->zgd_lr
= zfs_rangelock_enter(&zp
->z_rangelock
, offset
,
1150 /* test for truncation needs to be done while range locked */
1151 if (offset
>= zp
->z_size
) {
1152 error
= SET_ERROR(ENOENT
);
1154 error
= dmu_read(os
, object
, offset
, size
, buf
,
1155 DMU_READ_NO_PREFETCH
);
1157 ASSERT(error
== 0 || error
== ENOENT
);
1158 } else { /* indirect write */
1159 ASSERT3P(zio
, !=, NULL
);
1161 * Have to lock the whole block to ensure when it's
1162 * written out and its checksum is being calculated
1163 * that no one can change the data. We need to re-check
1164 * blocksize after we get the lock in case it's changed!
1169 blkoff
= ISP2(size
) ? P2PHASE(offset
, size
) : offset
;
1171 zgd
->zgd_lr
= zfs_rangelock_enter(&zp
->z_rangelock
,
1172 offset
, size
, RL_READER
);
1173 if (zp
->z_blksz
== size
)
1176 zfs_rangelock_exit(zgd
->zgd_lr
);
1178 /* test for truncation needs to be done while range locked */
1179 if (lr
->lr_offset
>= zp
->z_size
)
1180 error
= SET_ERROR(ENOENT
);
1183 error
= SET_ERROR(EIO
);
1190 error
= dmu_buf_hold_noread(os
, object
, offset
, zgd
,
1195 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)dbp
;
1196 boolean_t direct_write
= B_FALSE
;
1197 mutex_enter(&db
->db_mtx
);
1198 dbuf_dirty_record_t
*dr
=
1199 dbuf_find_dirty_eq(db
, lr
->lr_common
.lrc_txg
);
1200 if (dr
!= NULL
&& dr
->dt
.dl
.dr_diowrite
)
1201 direct_write
= B_TRUE
;
1202 mutex_exit(&db
->db_mtx
);
1205 * All Direct I/O writes will have already completed and
1206 * the block pointer can be immediately stored in the
1211 * A Direct I/O write always covers an entire
1214 ASSERT3U(dbp
->db_size
, ==, zp
->z_blksz
);
1215 lr
->lr_blkptr
= dr
->dt
.dl
.dr_overridden_by
;
1216 zfs_get_done(zgd
, 0);
1220 blkptr_t
*bp
= &lr
->lr_blkptr
;
1223 ASSERT3U(dbp
->db_offset
, ==, offset
);
1224 ASSERT3U(dbp
->db_size
, ==, size
);
1226 error
= dmu_sync(zio
, lr
->lr_common
.lrc_txg
,
1228 ASSERT(error
|| lr
->lr_length
<= size
);
1231 * On success, we need to wait for the write I/O
1232 * initiated by dmu_sync() to complete before we can
1233 * release this dbuf. We will finish everything up
1234 * in the zfs_get_done() callback.
1239 if (error
== EALREADY
) {
1240 lr
->lr_common
.lrc_txtype
= TX_WRITE2
;
1242 * TX_WRITE2 relies on the data previously
1243 * written by the TX_WRITE that caused
1244 * EALREADY. We zero out the BP because
1245 * it is the old, currently-on-disk BP.
1254 zfs_get_done(zgd
, error
);
1260 zfs_get_done(zgd_t
*zgd
, int error
)
1263 znode_t
*zp
= zgd
->zgd_private
;
1266 dmu_buf_rele(zgd
->zgd_db
, zgd
);
1268 zfs_rangelock_exit(zgd
->zgd_lr
);
1271 * Release the vnode asynchronously as we currently have the
1272 * txg stopped from syncing.
1274 zfs_zrele_async(zp
);
1276 kmem_free(zgd
, sizeof (zgd_t
));
1280 zfs_enter_two(zfsvfs_t
*zfsvfs1
, zfsvfs_t
*zfsvfs2
, const char *tag
)
1284 /* Swap. Not sure if the order of zfs_enter()s is important. */
1285 if (zfsvfs1
> zfsvfs2
) {
1286 zfsvfs_t
*tmpzfsvfs
;
1288 tmpzfsvfs
= zfsvfs2
;
1290 zfsvfs1
= tmpzfsvfs
;
1293 error
= zfs_enter(zfsvfs1
, tag
);
1296 if (zfsvfs1
!= zfsvfs2
) {
1297 error
= zfs_enter(zfsvfs2
, tag
);
1299 zfs_exit(zfsvfs1
, tag
);
1308 zfs_exit_two(zfsvfs_t
*zfsvfs1
, zfsvfs_t
*zfsvfs2
, const char *tag
)
1311 zfs_exit(zfsvfs1
, tag
);
1312 if (zfsvfs1
!= zfsvfs2
)
1313 zfs_exit(zfsvfs2
, tag
);
/*
 * We split each clone request into chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note, it doesn't return how many bytes are left to be copied.
 * For errors caused by any file system or BRT limitations, `EINVAL` is
 * returned. In most cases the user requested bad parameters: it may be
 * possible to clone the file, but some of the parameters don't match the
 * requirements.
 */
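/*
 * Worked example of the chunking arithmetic above (illustrative): each
 * embedded block pointer is sizeof (blkptr_t) == 128 bytes, and
 * 1022 * 128 == 130816, which is where the per-entry byte and block-pointer
 * limits quoted above come from. With a 128 KiB block size, one chunk can
 * therefore describe up to 1022 * 128 KiB (roughly 127.75 MiB) of cloned
 * data per transaction.
 */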
1330 zfs_clone_range(znode_t
*inzp
, uint64_t *inoffp
, znode_t
*outzp
,
1331 uint64_t *outoffp
, uint64_t *lenp
, cred_t
*cr
)
1333 zfsvfs_t
*inzfsvfs
, *outzfsvfs
;
1334 objset_t
*inos
, *outos
;
1335 zfs_locked_range_t
*inlr
, *outlr
;
1339 uint64_t inoff
, outoff
, len
, done
;
1340 uint64_t outsize
, size
;
1343 sa_bulk_attr_t bulk
[3];
1344 uint64_t mtime
[2], ctime
[2];
1345 uint64_t uid
, gid
, projid
;
1347 size_t maxblocks
, nbps
;
1349 uint64_t clear_setid_bits_txg
= 0;
1350 uint64_t last_synced_txg
= 0;
1357 inzfsvfs
= ZTOZSB(inzp
);
1358 outzfsvfs
= ZTOZSB(outzp
);
1361 * We need to call zfs_enter() potentially on two different datasets,
1362 * so we need a dedicated function for that.
1364 error
= zfs_enter_two(inzfsvfs
, outzfsvfs
, FTAG
);
1368 inos
= inzfsvfs
->z_os
;
1369 outos
= outzfsvfs
->z_os
;
1372 * Both source and destination have to belong to the same storage pool.
1374 if (dmu_objset_spa(inos
) != dmu_objset_spa(outos
)) {
1375 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1376 return (SET_ERROR(EXDEV
));
 * outos and inos belong to the same storage pool.
 * See a few lines above, only one check is needed.
1383 if (!spa_feature_is_enabled(dmu_objset_spa(outos
),
1384 SPA_FEATURE_BLOCK_CLONING
)) {
1385 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1386 return (SET_ERROR(EOPNOTSUPP
));
1389 ASSERT(!outzfsvfs
->z_replay
);
1392 * Block cloning from an unencrypted dataset into an encrypted
1393 * dataset and vice versa is not supported.
1395 if (inos
->os_encrypted
!= outos
->os_encrypted
) {
1396 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1397 return (SET_ERROR(EXDEV
));
1401 * Cloning across encrypted datasets is possible only if they
1402 * share the same master key.
1404 if (inos
!= outos
&& inos
->os_encrypted
&&
1405 !dmu_objset_crypto_key_equal(inos
, outos
)) {
1406 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1407 return (SET_ERROR(EXDEV
));
1410 error
= zfs_verify_zp(inzp
);
1412 error
= zfs_verify_zp(outzp
);
1414 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
 * We don't copy the source file's flags, which is why we don't allow
 * cloning files that are in quarantine.
1422 if (inzp
->z_pflags
& ZFS_AV_QUARANTINED
) {
1423 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1424 return (SET_ERROR(EACCES
));
1427 if (inoff
>= inzp
->z_size
) {
1429 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1432 if (len
> inzp
->z_size
- inoff
) {
1433 len
= inzp
->z_size
- inoff
;
1437 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1442 * Callers might not be able to detect properly that we are read-only,
1443 * so check it explicitly here.
1445 if (zfs_is_readonly(outzfsvfs
)) {
1446 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1447 return (SET_ERROR(EROFS
));
1451 * If immutable or not appending then return EPERM.
1452 * Intentionally allow ZFS_READONLY through here.
1453 * See zfs_zaccess_common()
1455 if ((outzp
->z_pflags
& ZFS_IMMUTABLE
) != 0) {
1456 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1457 return (SET_ERROR(EPERM
));
 * No overlapping is allowed if we are cloning within the same file.
1463 if (inzp
== outzp
) {
1464 if (inoff
< outoff
+ len
&& outoff
< inoff
+ len
) {
1465 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
1466 return (SET_ERROR(EINVAL
));
1470 /* Flush any mmap()'d data to disk */
1471 if (zn_has_cached_data(inzp
, inoff
, inoff
+ len
- 1))
1472 zn_flush_cached_data(inzp
, B_TRUE
);
1475 * Maintain predictable lock order.
1477 if (inzp
< outzp
|| (inzp
== outzp
&& inoff
< outoff
)) {
1478 inlr
= zfs_rangelock_enter(&inzp
->z_rangelock
, inoff
, len
,
1480 outlr
= zfs_rangelock_enter(&outzp
->z_rangelock
, outoff
, len
,
1483 outlr
= zfs_rangelock_enter(&outzp
->z_rangelock
, outoff
, len
,
1485 inlr
= zfs_rangelock_enter(&inzp
->z_rangelock
, inoff
, len
,
1489 inblksz
= inzp
->z_blksz
;
 * We cannot clone into a file with a different block size if we can't
 * grow it (block size is already bigger, has more than one block, or
 * not locked for growth). There are other possible reasons for the
 * grow to fail, but we cover what we can before opening the transaction
 * and detect the rest after we try to do it.
1498 if (inblksz
< outzp
->z_blksz
) {
1499 error
= SET_ERROR(EINVAL
);
1502 if (inblksz
!= outzp
->z_blksz
&& (outzp
->z_size
> outzp
->z_blksz
||
1503 outlr
->lr_length
!= UINT64_MAX
)) {
1504 error
= SET_ERROR(EINVAL
);
1509 * Block size must be power-of-2 if destination offset != 0.
1510 * There can be no multiple blocks of non-power-of-2 size.
1512 if (outoff
!= 0 && !ISP2(inblksz
)) {
1513 error
= SET_ERROR(EINVAL
);
 * Offsets and len must be at block boundaries.
1520 if ((inoff
% inblksz
) != 0 || (outoff
% inblksz
) != 0) {
1521 error
= SET_ERROR(EINVAL
);
 * Length must be a multiple of blksz, except for the end of the file.
1527 if ((len
% inblksz
) != 0 &&
1528 (len
< inzp
->z_size
- inoff
|| len
< outzp
->z_size
- outoff
)) {
1529 error
= SET_ERROR(EINVAL
);
 * If we are copying only one block and it is smaller than the recordsize
 * property, do not allow the destination to grow beyond one block if it
 * is not there yet. Otherwise the destination will get stuck with that
 * block size forever, which can be as small as 512 bytes, no matter how
 * big the destination grows later.
1540 if (len
<= inblksz
&& inblksz
< outzfsvfs
->z_max_blksz
&&
1541 outzp
->z_size
<= inblksz
&& outoff
+ len
> inblksz
) {
1542 error
= SET_ERROR(EINVAL
);
1546 error
= zn_rlimit_fsize(outoff
+ len
);
1551 if (inoff
>= MAXOFFSET_T
|| outoff
>= MAXOFFSET_T
) {
1552 error
= SET_ERROR(EFBIG
);
1556 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_MTIME(outzfsvfs
), NULL
,
1558 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_CTIME(outzfsvfs
), NULL
,
1560 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_SIZE(outzfsvfs
), NULL
,
1563 zilog
= outzfsvfs
->z_log
;
1564 maxblocks
= zil_max_log_data(zilog
, sizeof (lr_clone_range_t
)) /
1567 uid
= KUID_TO_SUID(ZTOUID(outzp
));
1568 gid
= KGID_TO_SGID(ZTOGID(outzp
));
1569 projid
= outzp
->z_projid
;
1571 bps
= vmem_alloc(sizeof (bps
[0]) * maxblocks
, KM_SLEEP
);
1574 * Clone the file in reasonable size chunks. Each chunk is cloned
1575 * in a separate transaction; this keeps the intent log records small
1576 * and allows us to do more fine-grained space accounting.
1579 size
= MIN(inblksz
* maxblocks
, len
);
1581 if (zfs_id_overblockquota(outzfsvfs
, DMU_USERUSED_OBJECT
,
1583 zfs_id_overblockquota(outzfsvfs
, DMU_GROUPUSED_OBJECT
,
1585 (projid
!= ZFS_DEFAULT_PROJID
&&
1586 zfs_id_overblockquota(outzfsvfs
, DMU_PROJECTUSED_OBJECT
,
1588 error
= SET_ERROR(EDQUOT
);
1593 last_synced_txg
= spa_last_synced_txg(dmu_objset_spa(inos
));
1594 error
= dmu_read_l0_bps(inos
, inzp
->z_id
, inoff
, size
, bps
,
1598 * If we are trying to clone a block that was created
1599 * in the current transaction group, the error will be
 * EAGAIN here. Based on zfs_bclone_wait_dirty, either
 * return a shortened range to the caller so it can
 * fall back, or wait for the next TXG and check again.
1604 if (error
== EAGAIN
&& zfs_bclone_wait_dirty
) {
1605 txg_wait_synced(dmu_objset_pool(inos
),
1606 last_synced_txg
+ 1);
1614 * Start a transaction.
1616 tx
= dmu_tx_create(outos
);
1617 dmu_tx_hold_sa(tx
, outzp
->z_sa_hdl
, B_FALSE
);
1618 db
= (dmu_buf_impl_t
*)sa_get_db(outzp
->z_sa_hdl
);
1620 dmu_tx_hold_clone_by_dnode(tx
, DB_DNODE(db
), outoff
, size
);
1622 zfs_sa_upgrade_txholds(tx
, outzp
);
1623 error
= dmu_tx_assign(tx
, TXG_WAIT
);
1630 * Copy source znode's block size. This is done only if the
1631 * whole znode is locked (see zfs_rangelock_cb()) and only
1632 * on the first iteration since zfs_rangelock_reduce() will
1633 * shrink down lr_length to the appropriate size.
1635 if (outlr
->lr_length
== UINT64_MAX
) {
1636 zfs_grow_blocksize(outzp
, inblksz
, tx
);
1639 * Block growth may fail for many reasons we can not
 * predict here. If it happens, the cloning is doomed.
1642 if (inblksz
!= outzp
->z_blksz
) {
1643 error
= SET_ERROR(EINVAL
);
1649 * Round range lock up to the block boundary, so we
1650 * prevent appends until we are done.
1652 zfs_rangelock_reduce(outlr
, outoff
,
1653 ((len
- 1) / inblksz
+ 1) * inblksz
);
1656 error
= dmu_brt_clone(outos
, outzp
->z_id
, outoff
, size
, tx
,
1663 if (zn_has_cached_data(outzp
, outoff
, outoff
+ size
- 1)) {
1664 update_pages(outzp
, outoff
, size
, outos
);
1667 zfs_clear_setid_bits_if_necessary(outzfsvfs
, outzp
, cr
,
1668 &clear_setid_bits_txg
, tx
);
1670 zfs_tstamp_update_setup(outzp
, CONTENT_MODIFIED
, mtime
, ctime
);
1673 * Update the file size (zp_size) if it has changed;
1674 * account for possible concurrent updates.
1676 while ((outsize
= outzp
->z_size
) < outoff
+ size
) {
1677 (void) atomic_cas_64(&outzp
->z_size
, outsize
,
1681 error
= sa_bulk_update(outzp
->z_sa_hdl
, bulk
, count
, tx
);
1683 zfs_log_clone_range(zilog
, tx
, TX_CLONE_RANGE
, outzp
, outoff
,
1684 size
, inblksz
, bps
, nbps
);
1697 error
= SET_ERROR(EINTR
);
1702 vmem_free(bps
, sizeof (bps
[0]) * maxblocks
);
1703 zfs_znode_update_vfs(outzp
);
1706 zfs_rangelock_exit(outlr
);
1707 zfs_rangelock_exit(inlr
);
1711 * If we have made at least partial progress, reset the error.
1715 ZFS_ACCESSTIME_STAMP(inzfsvfs
, inzp
);
1717 if (outos
->os_sync
== ZFS_SYNC_ALWAYS
) {
1718 zil_commit(zilog
, outzp
->z_id
);
1726 * If we made no progress, there must be a good reason.
1727 * EOF is handled explicitly above, before the loop.
1729 ASSERT3S(error
, !=, 0);
1732 zfs_exit_two(inzfsvfs
, outzfsvfs
, FTAG
);
/*
 * The usual pattern would be to call zfs_clone_range() from
 * zfs_replay_clone(), but we cannot do that, because when replaying we
 * don't have the source znode available. This is why we need a dedicated
 * replay function.
 */
1743 zfs_clone_range_replay(znode_t
*zp
, uint64_t off
, uint64_t len
, uint64_t blksz
,
1744 const blkptr_t
*bps
, size_t nbps
)
1751 sa_bulk_attr_t bulk
[3];
1752 uint64_t mtime
[2], ctime
[2];
1754 ASSERT3U(off
, <, MAXOFFSET_T
);
1755 ASSERT3U(len
, >, 0);
1756 ASSERT3U(nbps
, >, 0);
1758 zfsvfs
= ZTOZSB(zp
);
1760 ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs
->z_os
),
1761 SPA_FEATURE_BLOCK_CLONING
));
1763 if ((error
= zfs_enter_verify_zp(zfsvfs
, zp
, FTAG
)) != 0)
1766 ASSERT(zfsvfs
->z_replay
);
1767 ASSERT(!zfs_is_readonly(zfsvfs
));
1769 if ((off
% blksz
) != 0) {
1770 zfs_exit(zfsvfs
, FTAG
);
1771 return (SET_ERROR(EINVAL
));
1774 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_MTIME(zfsvfs
), NULL
, &mtime
, 16);
1775 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_CTIME(zfsvfs
), NULL
, &ctime
, 16);
1776 SA_ADD_BULK_ATTR(bulk
, count
, SA_ZPL_SIZE(zfsvfs
), NULL
,
1780 * Start a transaction.
1782 tx
= dmu_tx_create(zfsvfs
->z_os
);
1784 dmu_tx_hold_sa(tx
, zp
->z_sa_hdl
, B_FALSE
);
1785 db
= (dmu_buf_impl_t
*)sa_get_db(zp
->z_sa_hdl
);
1787 dmu_tx_hold_clone_by_dnode(tx
, DB_DNODE(db
), off
, len
);
1789 zfs_sa_upgrade_txholds(tx
, zp
);
1790 error
= dmu_tx_assign(tx
, TXG_WAIT
);
1793 zfs_exit(zfsvfs
, FTAG
);
1797 if (zp
->z_blksz
< blksz
)
1798 zfs_grow_blocksize(zp
, blksz
, tx
);
1800 dmu_brt_clone(zfsvfs
->z_os
, zp
->z_id
, off
, len
, tx
, bps
, nbps
);
1802 zfs_tstamp_update_setup(zp
, CONTENT_MODIFIED
, mtime
, ctime
);
1804 if (zp
->z_size
< off
+ len
)
1805 zp
->z_size
= off
+ len
;
1807 error
= sa_bulk_update(zp
->z_sa_hdl
, bulk
, count
, tx
);
 * zil_replaying() not only checks if we are replaying the ZIL, but also
1811 * updates the ZIL header to record replay progress.
1813 VERIFY(zil_replaying(zfsvfs
->z_log
, tx
));
1817 zfs_znode_update_vfs(zp
);
1819 zfs_exit(zfsvfs
, FTAG
);
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_clone_range);
EXPORT_SYMBOL(zfs_clone_range_replay);
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
	"Enable block cloning");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
	"Wait for dirty blocks when cloning");

ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
	"Enable Direct I/O");