/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/uio_impl.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_rlock.h>
#include <sys/sa.h>
static ulong_t zfs_fsync_sync_cnt = 4;

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
	int error = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	(void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			goto out;
		atomic_inc_32(&zp->z_sync_writes_cnt);
		zil_commit(zfsvfs->z_log, zp->z_id);
		atomic_dec_32(&zp->z_sync_writes_cnt);
		zfs_exit(zfsvfs, FTAG);
	}
out:
	tsd_set(zfs_fsyncer_key, NULL);

	return (error);
}
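/*
 * Note on the flow above: durability comes solely from zil_commit(),
 * which does not return until every intent-log record for this object
 * has reached stable storage.  With sync=disabled the commit is skipped
 * and fsync() defers durability to the next txg sync.  The
 * zfs_fsyncer_key thread-specific value set here is presumably consumed
 * by platform-specific write-back code outside this file.
 */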
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Lseek support for finding holes (cmd == SEEK_HOLE) and
 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfs_locked_range_t *lr;
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == F_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp))
		zn_flush_cached_data(zp, B_FALSE);

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
	zfs_rangelock_exit(lr);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/* File was dirty, so fall back to using generic logic */
	if (error == EBUSY) {
		if (hole)
			*off = file_sz;

		return (0);
	}

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

int
zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_holey_common(zp, cmd, off);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */
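/*
 * Illustrative note (assuming standard lseek() SEEK_HOLE/SEEK_DATA
 * semantics): for a fully allocated file, lseek(fd, 0, SEEK_HOLE)
 * reports the implicit hole at EOF, i.e. the file size, while
 * lseek(fd, file_size, SEEK_DATA) fails with ENXIO -- matching the
 * noff >= file_sz check at the top of zfs_holey_common() above.
 */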
int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (flag & V_ACE_MASK)
#if defined(__linux__)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    kcred->user_ns);
#else
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    NULL);
#endif
	else
#if defined(__linux__)
		error = zfs_zaccess_rwx(zp, mode, flag, cr, kcred->user_ns);
#else
		error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
#endif

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
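/*
 * Worked example of the chunking arithmetic used by zfs_read() below:
 * with the default 1 MiB chunk size, a read starting at offset
 * 0x180000 (1.5 MiB) has P2PHASE(offset, chunk) == 0x80000, so the
 * first chunk is capped at 0x80000 bytes to realign the offset to a
 * chunk boundary; each following chunk is then a full 1 MiB.
 */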
/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	zp	- inode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	int error = 0;
	boolean_t frsync = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(zfs_uio_offset(uio) < zp->z_size);
#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;

	while (n > 0) {
		ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
		    P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
#ifdef UIO_NOCOPY
		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
			error = mappedread_sf(zp, nbytes, uio);
		else
#endif
		if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
			error = mappedread(zp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}

		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);

#if defined(__linux__)
			/*
			 * if we actually read some bytes, bubbling EFAULT
			 * up to become EAGAIN isn't what we want here...
			 *
			 * ...on Linux, at least. On FBSD, doing this breaks.
			 */
			if (error == EFAULT &&
			    (zfs_uio_offset(uio) - start_offset) != 0)
				error = 0;
#endif
			break;
		}

		n -= nbytes;
	}

	int64_t nread = start_resid - n;
	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
	task_io_account_read(nread);
out:
	zfs_rangelock_exit(lr);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
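/*
 * Summary of the read path above: when the file also has pages cached
 * by mmap() (and O_DIRECT is not set), mappedread() is used so the copy
 * observes those pages; otherwise the data is copied straight from the
 * DMU via dmu_read_uio_dbuf().  Checksum errors are reported to the
 * caller as EIO rather than ECKSUM.
 */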
static void
zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
    uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
{
	zilog_t *zilog = zfsvfs->z_log;
	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));

	ASSERT(clear_setid_bits_txgp != NULL);
	ASSERT(tx != NULL);

	/*
	 * Clear Set-UID/Set-GID bits on successful write if not
	 * privileged and at least one of the execute bits is set.
	 *
	 * It would be nice to do this after all writes have
	 * been done, but that would still expose the ISUID/ISGID
	 * to another app after the partial write is committed.
	 *
	 * Note: we don't call zfs_fuid_map_id() here because
	 * user 0 is not an ephemeral uid.
	 */
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
	    secpolicy_vnode_setid_retain(zp, cr,
	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
		uint64_t newmode;

		zp->z_mode &= ~(S_ISUID | S_ISGID);
		newmode = zp->z_mode;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
		    (void *)&newmode, sizeof (uint64_t), tx);

		mutex_exit(&zp->z_acl_lock);

		/*
		 * Make sure SUID/SGID bits will be removed when we replay the
		 * log. If the setid bits keep coming back, don't log more
		 * than one TX_SETATTR per transaction group.
		 */
		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
			vattr_t va = {0};

			va.va_mask = ATTR_MODE;
			va.va_nodeid = zp->z_id;
			va.va_mode = newmode;
			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
			    ATTR_MODE, NULL);
			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
		}
	} else {
		mutex_exit(&zp->z_acl_lock);
	}
}
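/*
 * Bit arithmetic used above, spelled out: S_IXUSR is 0100, so
 * (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6)) is 0111, i.e. "any
 * execute bit set".  Combined with (S_ISUID | S_ISGID) != 0 this
 * matches the POSIX rule that an unprivileged write to an executable
 * setuid/setgid file strips the setid bits.
 */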
/*
 * Write the bytes to a file.
 *
 *	IN:	zp	- znode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- O_APPEND flag set if in append mode.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated if byte count > 0
 */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	int error = 0, error1;
	ssize_t start_resid = zfs_uio_resid(uio);
	uint64_t clear_setid_bits_txg = 0;

	/*
	 * Fasttrack empty write
	 */
	ssize_t n = start_resid;
	if (n == 0)
		return (0);

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	sa_bulk_attr_t bulk[4];
	int count = 0;
	uint64_t mtime[2], ctime[2];
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
	    (zfs_uio_offset(uio) < zp->z_size))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Validate file offset
	 */
	offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
	if (woff < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	const uint64_t max_blksz = zfsvfs->z_max_blksz;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFAULT));
	}
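	/*
	 * Pre-faulting above, together with zfs_uio_fault_disable() around
	 * the copy further below, keeps page faults from being taken while
	 * a DMU transaction is open: if the copy still faults, it fails
	 * with EFAULT, the pages are faulted in outside the transaction,
	 * and the chunk is retried on the next loop iteration.
	 */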
	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize(zp, uio)) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;
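	/*
	 * The owner, group and project IDs are captured once here; the
	 * per-chunk loop below re-checks the corresponding block quotas
	 * before each transaction, so a long write cannot continue far
	 * past a quota that fills up while it is in progress.
	 */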
	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		woff = zfs_uio_offset(uio);

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		arc_buf_t *abuf = NULL;
		if (n >= max_blksz && woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes))) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT3S(cbytes, ==, max_blksz);
		}
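		/*
		 * A borrowed ARC buffer must always be handed back to the
		 * DMU: it is either attached to a dbuf later on with
		 * dmu_assign_arcbuf_by_dbuf(), or released again with
		 * dmu_return_arcbuf() on every error path, as done above
		 * and after a failed dmu_tx_assign() below.
		 */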
		/*
		 * Start a transaction.
		 */
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
		    MIN(n, max_blksz));
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * NB: We must call zfs_clear_setid_bits_if_necessary before
		 * committing the transaction!
		 */

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		const ssize_t nbytes =
		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));
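		/*
		 * Example of the expression above, assuming the default
		 * 128 KiB recordsize: a write starting at offset 0x21000
		 * has P2PHASE(woff, 0x20000) == 0x1000, so the first chunk
		 * is limited to 0x1f000 bytes and realigns the offset to a
		 * block boundary; subsequent chunks are full blocks.
		 */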
		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			zfs_uio_fault_disable(uio, B_FALSE);
#if defined(__linux__)
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				if (tx_bytes != zfs_uio_resid(uio))
					n -= tx_bytes - zfs_uio_resid(uio);
				if (zfs_uio_prefaultpages(MIN(n, max_blksz),
				    uio)) {
					break;
				}
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/* Implied by abuf != NULL: */
			ASSERT3S(n, >=, max_blksz);
			ASSERT0(P2PHASE(woff, max_blksz));
			/*
			 * We can simplify nbytes to MIN(n, max_blksz) since
			 * P2PHASE(woff, max_blksz) is 0, and knowing
			 * n >= max_blksz lets us simplify further:
			 */
			ASSERT3S(nbytes, ==, max_blksz);
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
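		/*
		 * At this point tx_bytes holds the number of bytes actually
		 * copied into the transaction by one of the two paths above:
		 * either a uio-based copy into the dbuf, or assignment of a
		 * pre-filled, borrowed ARC buffer covering one full block.
		 */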
		if (tx_bytes && zn_has_cached_data(zp) &&
		    !(ioflag & O_DIRECT)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    zfs_uio_offset(uio));
			ASSERT(error == 0 || error == EFAULT);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		if (error1 != 0)
			/* Avoid clobbering EFAULT. */
			error = error1;

		/*
		 * NB: During replay, the TX_SETATTR record logged by
		 * zfs_clear_setid_bits_if_necessary must precede any of
		 * the TX_WRITE records logged here.
		 */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
		    NULL, NULL);

		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT3S(tx_bytes, ==, nbytes);
		n -= nbytes;

		if (n > 0) {
			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
				error = SET_ERROR(EFAULT);
				break;
			}
		}
	}

	zfs_znode_update_vfs(zp);
	zfs_rangelock_exit(lr);

	/*
	 * If we're in replay mode, or we made no progress, or the
	 * uio data is inaccessible return an error.  Otherwise, it's
	 * at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
	    error == EFAULT) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (ioflag & (O_SYNC | O_DSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	const int64_t nwritten = start_resid - zfs_uio_resid(uio);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
	task_io_account_write(nwritten);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
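/*
 * Note on the tail of zfs_write(): a short write still returns success
 * with uio updated to reflect how much was written, mirroring POSIX
 * write(2) semantics; durability for O_SYNC/O_DSYNC callers (or
 * sync=always datasets) is provided by the final zil_commit(), since
 * zfs_log_write() inside the loop only queues in-memory itx records.
 */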
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}
int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog = zfsvfs->z_log;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

static void zfs_get_done(zgd_t *zgd, int error);

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/* check if generation number matches */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;
	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
		    offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY.  We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
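/*
 * Summary of the callback contract above: for an immediate write the
 * data is copied into the log record here and zfs_get_done() runs
 * synchronously; for an indirect write, dmu_sync() takes ownership and
 * zfs_get_done() is invoked from its completion callback, which is why
 * the dbuf and range lock are only released there.
 */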
static void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	zfs_zrele_async(zp);

	kmem_free(zgd, sizeof (zgd_t));
}
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");