// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"

#define XFS_ALLOC_ALIGN(mp, off) \
	(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
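
/*
 * Example (illustrative only): with mp->m_allocsize_log == 16, i.e. a 64k
 * allocation unit, XFS_ALLOC_ALIGN(mp, 0x12345) evaluates to 0x10000; the
 * byte offset is rounded down to the nearest allocation-size boundary by
 * shifting the low bits out and back in.
 */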

static int
xfs_alert_fsblock_zero(
	xfs_inode_t	*ip,
	xfs_bmbt_irec_t	*imap)
{
	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
			"Access to block zero in inode %llu "
			"start_block: %llx start_off: %llx "
			"blkcnt: %llx extent-state: %x",
			(unsigned long long)ip->i_ino,
			(unsigned long long)imap->br_startblock,
			(unsigned long long)imap->br_startoff,
			(unsigned long long)imap->br_blockcount,
			imap->br_state);
	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
	return -EFSCORRUPTED;
}

u64
xfs_iomap_inode_sequence(
	struct xfs_inode	*ip,
	u16			iomap_flags)
{
	u64			cookie = 0;

	if (iomap_flags & IOMAP_F_XATTR)
		return READ_ONCE(ip->i_af.if_seq);
	if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
		cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
	return cookie | READ_ONCE(ip->i_df.if_seq);
}
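
/*
 * Note (editorial summary of the code above): the returned cookie packs the
 * COW fork sequence number, when one is relevant, into the upper 32 bits and
 * the data fork sequence number into the lower 32 bits, so a change to
 * either fork invalidates a previously sampled cookie.
 */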

/*
 * Check that the iomap passed to us is still valid for the given offset and
 * length.
 */
static bool
xfs_iomap_valid(
	struct inode		*inode,
	const struct iomap	*iomap)
{
	struct xfs_inode	*ip = XFS_I(inode);

	if (iomap->validity_cookie !=
			xfs_iomap_inode_sequence(ip, iomap->flags)) {
		trace_xfs_iomap_invalid(ip, iomap);
		return false;
	}

	XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS);
	return true;
}

static const struct iomap_folio_ops xfs_iomap_folio_ops = {
	.iomap_valid		= xfs_iomap_valid,
};
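
/*
 * Background (editorial note, not from the original source): the iomap core
 * invokes ->iomap_valid() once it holds a locked folio during buffered
 * writes, so a mapping that went stale between ->iomap_begin() and the folio
 * lock is detected and remapped rather than written to the wrong blocks.
 */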

int
xfs_bmbt_to_iomap(
	struct xfs_inode	*ip,
	struct iomap		*iomap,
	struct xfs_bmbt_irec	*imap,
	unsigned int		mapping_flags,
	u16			iomap_flags,
	u64			sequence_cookie)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);

	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
		return xfs_alert_fsblock_zero(ip, imap);
	}

	if (imap->br_startblock == HOLESTARTBLOCK) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else if (imap->br_startblock == DELAYSTARTBLOCK ||
		   isnullstartblock(imap->br_startblock)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_DELALLOC;
	} else {
		iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
		if (mapping_flags & IOMAP_DAX)
			iomap->addr += target->bt_dax_part_off;

		if (imap->br_state == XFS_EXT_UNWRITTEN)
			iomap->type = IOMAP_UNWRITTEN;
		else
			iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
	if (mapping_flags & IOMAP_DAX)
		iomap->dax_dev = target->bt_daxdev;
	else
		iomap->bdev = target->bt_bdev;
	iomap->flags = iomap_flags;

	if (xfs_ipincount(ip) &&
	    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		iomap->flags |= IOMAP_F_DIRTY;

	iomap->validity_cookie = sequence_cookie;
	iomap->folio_ops = &xfs_iomap_folio_ops;
	return 0;
}

static void
xfs_hole_to_iomap(
	struct xfs_inode	*ip,
	struct iomap		*iomap,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		end_fsb)
{
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);

	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
	iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
	iomap->bdev = target->bt_bdev;
	iomap->dax_dev = target->bt_daxdev;
}

static inline xfs_fileoff_t
xfs_iomap_end_fsb(
	struct xfs_mount	*mp,
	loff_t			offset,
	loff_t			count)
{
	ASSERT(offset <= mp->m_super->s_maxbytes);
	return min(XFS_B_TO_FSB(mp, offset + count),
		   XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
}
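
/*
 * Example (illustrative only, assuming a 4k filesystem block size): for
 * offset 10000 and count 3000 the byte range ends at 13000, which
 * XFS_B_TO_FSB() rounds up to filesystem block 4; the result is then capped
 * at the block containing s_maxbytes so a mapping never extends past the
 * maximum supported file size.
 */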

static xfs_extlen_t
xfs_eof_alignment(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_extlen_t		align = 0;

	if (!XFS_IS_REALTIME_INODE(ip)) {
		/*
		 * Round up the allocation request to a stripe unit
		 * (m_dalign) boundary if the file size is >= stripe unit
		 * size, and we are allocating past the allocation eof.
		 *
		 * If mounted with the "-o swalloc" option the alignment is
		 * increased from the stripe unit size to the stripe width.
		 */
		if (mp->m_swidth && xfs_has_swalloc(mp))
			align = mp->m_swidth;
		else if (mp->m_dalign)
			align = mp->m_dalign;

		if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
			align = 0;
	}

	return align;
}

/*
 * Check if last_fsb is outside the last extent, and if so grow it to the next
 * stripe unit boundary.
 */
static xfs_fileoff_t
xfs_iomap_eof_align_last_fsb(
	struct xfs_inode	*ip,
	xfs_fileoff_t		end_fsb)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	xfs_extlen_t		extsz = xfs_get_extsz_hint(ip);
	xfs_extlen_t		align = xfs_eof_alignment(ip);
	struct xfs_bmbt_irec	irec;
	struct xfs_iext_cursor	icur;

	ASSERT(!xfs_need_iread_extents(ifp));

	/*
	 * Always round up the allocation request to the extent hint boundary.
	 */
	if (extsz) {
		if (align)
			align = roundup_64(align, extsz);
		else
			align = extsz;
	}

	if (align) {
		xfs_fileoff_t	aligned_end_fsb = roundup_64(end_fsb, align);

		xfs_iext_last(ifp, &icur);
		if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
		    aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
			return aligned_end_fsb;
	}

	return end_fsb;
}
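
/*
 * Worked example (illustrative only): with a stripe unit of 16 blocks and no
 * extent size hint, an EOF allocation that would end at block 100 is rounded
 * up to block 112; the rounded value is only used when it does not land
 * inside the file's last existing extent.
 */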

int
xfs_iomap_write_direct(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		count_fsb,
	unsigned int		flags,
	struct xfs_bmbt_irec	*imap,
	u64			*seq)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_filblks_t		resaligned;
	int			nimaps;
	unsigned int		dblocks, rblocks;
	bool			force = false;
	int			error;
	int			bmapi_flags = XFS_BMAPI_PREALLOC;
	int			nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;

	ASSERT(count_fsb > 0);

	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
					   xfs_get_extsz_hint(ip));
	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		rblocks = resaligned;
	} else {
		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
		rblocks = 0;
	}

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	/*
	 * For DAX, we do not allocate unwritten extents, but instead we zero
	 * the block before we commit the transaction.  Ideally we'd like to do
	 * this outside the transaction context, but if we commit and then crash
	 * we may not have zeroed the blocks and this will be exposed on
	 * recovery of the allocation. Hence we must zero before commit.
	 *
	 * Further, if we are mapping unwritten extents here, we need to zero
	 * and convert them to written so that we don't need an unwritten extent
	 * callback for DAX. This also means that we need to be able to dip into
	 * the reserve block pool for bmbt block allocation if there is no space
	 * left but we need to do unwritten extent conversion.
	 */
	if (flags & IOMAP_DAX) {
		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
		if (imap->br_state == XFS_EXT_UNWRITTEN) {
			force = true;
			nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
		}
	}

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
			rblocks, force, &tp);
	if (error)
		return error;

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, nr_exts);
	if (error)
		goto out_trans_cancel;

	/*
	 * From this point onwards we overwrite the imap pointer that the
	 * caller gave to us.
	 */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
				imap, &nimaps);
	if (error)
		goto out_trans_cancel;

	/*
	 * Complete the transaction
	 */
	error = xfs_trans_commit(tp);
	if (error)
		goto out_unlock;

	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
		error = xfs_alert_fsblock_zero(ip, imap);
	}

out_unlock:
	*seq = xfs_iomap_inode_sequence(ip, 0);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
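
/*
 * Editorial note on the reservation above (a summary, not new policy): for
 * regular files the data block reservation covers both the blocks being
 * allocated and the worst-case bmbt splits, while realtime files reserve the
 * aligned data blocks from the realtime device and only the metadata
 * overhead from the data device.
 */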

static bool
xfs_quota_need_throttle(
	struct xfs_inode	*ip,
	xfs_dqtype_t		type,
	xfs_fsblock_t		alloc_blocks)
{
	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);

	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
		return false;

	/* no hi watermark, no throttle */
	if (!dq->q_prealloc_hi_wmark)
		return false;

	/* under the lo watermark, no throttle */
	if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
		return false;

	return true;
}
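
/*
 * In short (editorial summary): preallocation is only throttled against a
 * quota when that quota type is active, a high watermark has been set up,
 * and the existing reservation plus the proposed preallocation would cross
 * the low watermark.
 */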

static void
xfs_quota_calc_throttle(
	struct xfs_inode	*ip,
	xfs_dqtype_t		type,
	xfs_fsblock_t		*qblocks,
	int			*qshift,
	int64_t			*qfreesp)
{
	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
	int64_t			freesp;
	int			shift = 0;

	/* no dq, or over hi wmark, squash the prealloc completely */
	if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
		*qblocks = 0;
		*qfreesp = 0;
		return;
	}

	freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
		shift = 2;
		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
			shift += 2;
		if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
			shift += 2;
	}

	if (freesp < *qfreesp)
		*qfreesp = freesp;

	/* only overwrite the throttle values if we are more aggressive */
	if ((freesp >> shift) < (*qblocks >> *qshift)) {
		*qblocks = freesp;
		*qshift = shift;
	}
}

static int64_t
xfs_iomap_freesp(
	struct percpu_counter	*counter,
	uint64_t		low_space[XFS_LOWSP_MAX],
	int			*shift)
{
	int64_t			freesp;

	freesp = percpu_counter_read_positive(counter);
	if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
		*shift = 2;
		if (freesp < low_space[XFS_LOWSP_4_PCNT])
			(*shift)++;
		if (freesp < low_space[XFS_LOWSP_3_PCNT])
			(*shift)++;
		if (freesp < low_space[XFS_LOWSP_2_PCNT])
			(*shift)++;
		if (freesp < low_space[XFS_LOWSP_1_PCNT])
			(*shift)++;
	}

	return freesp;
}
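
/*
 * Worked example (illustrative only): if free space has fallen below the 5%,
 * 4% and 3% thresholds but not the 2% one, the shift comes back as 4 and the
 * caller below ends up dividing its candidate preallocation by 2^4 = 16.
 */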

/*
 * If we don't have a user specified preallocation size, dynamically increase
 * the preallocation size as the size of the file grows.  Cap the maximum size
 * at a single extent or less if the filesystem is near full. The closer the
 * filesystem is to being full, the smaller the maximum preallocation.
 */
static xfs_fsblock_t
xfs_iomap_prealloc_size(
	struct xfs_inode	*ip,
	int			whichfork,
	loff_t			offset,
	loff_t			count,
	struct xfs_iext_cursor	*icur)
{
	struct xfs_iext_cursor	ncur = *icur;
	struct xfs_bmbt_irec	prev, got;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	int64_t			freesp;
	xfs_fsblock_t		qblocks;
	xfs_fsblock_t		alloc_blocks = 0;
	xfs_extlen_t		plen;
	int			shift = 0;
	int			qshift = 0;

	/*
	 * As an exception we don't do any preallocation at all if the file is
	 * smaller than the minimum preallocation and we are using the default
	 * dynamic preallocation scheme, as it is likely this is the only write
	 * to the file that is going to be done.
	 */
	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
		return 0;

	/*
	 * Use the minimum preallocation size for small files or if we are
	 * writing right after a hole.
	 */
	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
	    !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
	    prev.br_startoff + prev.br_blockcount < offset_fsb)
		return mp->m_allocsize_blocks;

	/*
	 * Take the size of the preceding data extents as the basis for the
	 * preallocation size. Note that we don't care if the previous extents
	 * are written or not.
	 */
	plen = prev.br_blockcount;
	while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
		if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
		    isnullstartblock(got.br_startblock) ||
		    got.br_startoff + got.br_blockcount != prev.br_startoff ||
		    got.br_startblock + got.br_blockcount != prev.br_startblock)
			break;
		plen += got.br_blockcount;
		prev = got;
	}

	/*
	 * If the size of the extents is greater than half the maximum extent
	 * length, then use the current offset as the basis. This ensures that
	 * for large files the preallocation size always extends to
	 * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like
	 * stripe unit/width alignment of real extents.
	 */
	alloc_blocks = plen * 2;
	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
		alloc_blocks = XFS_B_TO_FSB(mp, offset);
	qblocks = alloc_blocks;

	/*
	 * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the
	 * prealloc down to the nearest power of two value after throttling.
	 * To prevent the round down from unconditionally reducing the maximum
	 * supported prealloc size, we round up first, apply appropriate
	 * throttling, round down and cap the value to XFS_MAX_BMBT_EXTLEN.
	 */
	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
				       alloc_blocks);

	if (unlikely(XFS_IS_REALTIME_INODE(ip)))
		freesp = xfs_rtx_to_rtb(mp,
			xfs_iomap_freesp(&mp->m_frextents,
					mp->m_low_rtexts, &shift));
	else
		freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
				&shift);

	/*
	 * Check each quota to cap the prealloc size, provide a shift value to
	 * throttle with and adjust amount of available space.
	 */
	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
					&freesp);
	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
					&freesp);
	if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
		xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
					&freesp);

	/*
	 * The final prealloc size is set to the minimum of free space available
	 * in each of the quotas and the overall filesystem.
	 *
	 * The shift throttle value is set to the maximum value as determined by
	 * the global low free space values and per-quota low free space values.
	 */
	alloc_blocks = min(alloc_blocks, qblocks);
	shift = max(shift, qshift);

	if (shift)
		alloc_blocks >>= shift;
	/*
	 * rounddown_pow_of_two() returns an undefined result if we pass in
	 * alloc_blocks = 0.
	 */
	if (alloc_blocks)
		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
	if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
		alloc_blocks = XFS_MAX_BMBT_EXTLEN;

	/*
	 * If we are still trying to allocate more space than is
	 * available, squash the prealloc hard. This can happen if we
	 * have a large file on a small filesystem and the above
	 * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
	 */
	while (alloc_blocks && alloc_blocks >= freesp)
		alloc_blocks >>= 4;
	if (alloc_blocks < mp->m_allocsize_blocks)
		alloc_blocks = mp->m_allocsize_blocks;
	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
				      mp->m_allocsize_blocks);
	return alloc_blocks;
}
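
/*
 * Numeric walk-through (illustrative only): a write extending a file whose
 * last contiguous run of extents totals 64 blocks starts from a 128 block
 * candidate; with no throttling that survives the power-of-two rounddown as
 * 128 blocks, while a throttle shift of 4 would first cut it to 8 blocks and
 * the result would then be raised back up to m_allocsize_blocks if it fell
 * below that floor.
 */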

int
xfs_iomap_write_unwritten(
	xfs_inode_t	*ip,
	xfs_off_t	offset,
	xfs_off_t	count,
	bool		update_isize)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	offset_fsb;
	xfs_filblks_t	count_fsb;
	xfs_filblks_t	numblks_fsb;
	int		nimaps;
	xfs_trans_t	*tp;
	xfs_bmbt_irec_t	imap;
	struct inode	*inode = VFS_I(ip);
	xfs_fsize_t	i_size;
	uint		resblks;
	int		error;

	trace_xfs_unwritten_convert(ip, offset, count);

	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);

	/*
	 * Reserve enough blocks in this transaction for two complete extent
	 * btree splits.  We may be converting the middle part of an unwritten
	 * extent and in this case we will insert two new extents in the btree
	 * each of which could cause a full split.
	 *
	 * This reservation amount will be used in the first call to
	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
	 * rest of the operation.
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

	/* Attach dquots so that bmbt splits are accounted correctly. */
	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	do {
		/*
		 * Set up a transaction to convert the range of extents
		 * from unwritten to real. Do allocations in a loop until
		 * we have covered the range passed in.
		 *
		 * Note that we can't risk recursing back into the filesystem
		 * here as we might be asked to write out the same inode that we
		 * complete here and might deadlock on the iolock.
		 */
		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
				0, true, &tp);
		if (error)
			return error;

		error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
				XFS_IEXT_WRITE_UNWRITTEN_CNT);
		if (error)
			goto error_on_bmapi_transaction;

		/*
		 * Modify the unwritten extent state of the buffer.
		 */
		nimaps = 1;
		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
					XFS_BMAPI_CONVERT, resblks, &imap,
					&nimaps);
		if (error)
			goto error_on_bmapi_transaction;

		/*
		 * Log the updated inode size as we go.  We have to be careful
		 * to only log it up to the actual write offset if it is
		 * halfway into a block.
		 */
		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
		if (i_size > offset + count)
			i_size = offset + count;
		if (update_isize && i_size > i_size_read(inode))
			i_size_write(inode, i_size);
		i_size = xfs_new_eof(ip, i_size);
		if (i_size) {
			ip->i_disk_size = i_size;
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
			xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
			return xfs_alert_fsblock_zero(ip, &imap);
		}

		if ((numblks_fsb = imap.br_blockcount) == 0) {
			/*
			 * The numblks_fsb value should always get
			 * smaller, otherwise the loop is stuck.
			 */
			ASSERT(imap.br_blockcount);
			break;
		}
		offset_fsb += numblks_fsb;
		count_fsb -= numblks_fsb;
	} while (count_fsb > 0);

	return 0;

error_on_bmapi_transaction:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
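
/*
 * Editorial note: each pass through the loop above converts at most one
 * extent worth of blocks starting at offset_fsb, so a range that crosses
 * several unwritten extents simply takes several transactions, advancing
 * offset_fsb and shrinking count_fsb until the whole byte range has been
 * converted.
 */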

static inline bool
imap_needs_alloc(
	struct inode		*inode,
	unsigned		flags,
	struct xfs_bmbt_irec	*imap,
	int			nimaps)
{
	/* don't allocate blocks when just zeroing */
	if (flags & IOMAP_ZERO)
		return false;
	if (!nimaps ||
	    imap->br_startblock == HOLESTARTBLOCK ||
	    imap->br_startblock == DELAYSTARTBLOCK)
		return true;
	/* we convert unwritten extents before copying the data for DAX */
	if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
		return true;
	return false;
}
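
/*
 * In other words (editorial summary): direct writes need a real allocation
 * when the range is a hole or still delalloc, and DAX additionally forces an
 * allocation pass for unwritten extents because DAX has no I/O completion
 * path to convert them afterwards.
 */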

static bool
imap_needs_cow(
	struct xfs_inode	*ip,
	unsigned int		flags,
	struct xfs_bmbt_irec	*imap,
	int			nimaps)
{
	if (!xfs_is_cow_inode(ip))
		return false;

	/* when zeroing we don't have to COW holes or unwritten extents */
	if (flags & IOMAP_ZERO) {
		if (!nimaps ||
		    imap->br_startblock == HOLESTARTBLOCK ||
		    imap->br_state == XFS_EXT_UNWRITTEN)
			return false;
	}

	return true;
}

/*
 * Extents not yet cached require exclusive access, don't block for
 * IOMAP_NOWAIT.
 *
 * This is basically an opencoded xfs_ilock_data_map_shared() call, but with
 * support for IOMAP_NOWAIT.
 */
static int
xfs_ilock_for_iomap(
	struct xfs_inode	*ip,
	unsigned		flags,
	unsigned		*lockmode)
{
	if (flags & IOMAP_NOWAIT) {
		if (xfs_need_iread_extents(&ip->i_df))
			return -EAGAIN;
		if (!xfs_ilock_nowait(ip, *lockmode))
			return -EAGAIN;
	} else {
		if (xfs_need_iread_extents(&ip->i_df))
			*lockmode = XFS_ILOCK_EXCL;
		xfs_ilock(ip, *lockmode);
	}

	return 0;
}

/*
 * Check that the imap we are going to return to the caller spans the entire
 * range that the caller requested for the IO.
 */
static bool
imap_spans_range(
	struct xfs_bmbt_irec	*imap,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		end_fsb)
{
	if (imap->br_startoff > offset_fsb)
		return false;
	if (imap->br_startoff + imap->br_blockcount < end_fsb)
		return false;
	return true;
}

static int
xfs_direct_write_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	imap, cmap;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
	int			nimaps = 1, error = 0;
	bool			shared = false;
	u16			iomap_flags = 0;
	unsigned int		lockmode;
	u64			seq;

	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));

	if (xfs_is_shutdown(mp))
		return -EIO;

	/*
	 * Writes that span EOF might trigger an IO size update on completion,
	 * so consider them to be dirty for the purposes of O_DSYNC even if
	 * there are no other metadata changes pending or made here.
	 */
	if (offset + length > i_size_read(inode))
		iomap_flags |= IOMAP_F_DIRTY;

	/*
	 * COW writes may allocate delalloc space or convert unwritten COW
	 * extents, so we need to make sure to take the lock exclusively here.
	 */
	if (xfs_is_cow_inode(ip))
		lockmode = XFS_ILOCK_EXCL;
	else
		lockmode = XFS_ILOCK_SHARED;

relock:
	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
	if (error)
		return error;

	/*
	 * The reflink iflag could have changed since the earlier unlocked
	 * check, so check it again and relock if needed.
	 */
	if (xfs_is_cow_inode(ip) && lockmode == XFS_ILOCK_SHARED) {
		xfs_iunlock(ip, lockmode);
		lockmode = XFS_ILOCK_EXCL;
		goto relock;
	}

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, 0);
	if (error)
		goto out_unlock;

	if (imap_needs_cow(ip, flags, &imap, nimaps)) {
		error = -EAGAIN;
		if (flags & IOMAP_NOWAIT)
			goto out_unlock;

		/* may drop and re-acquire the ilock */
		error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
				&lockmode,
				(flags & IOMAP_DIRECT) || IS_DAX(inode));
		if (error)
			goto out_unlock;
		if (shared)
			goto out_found_cow;
		end_fsb = imap.br_startoff + imap.br_blockcount;
		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
	}

	if (imap_needs_alloc(inode, flags, &imap, nimaps))
		goto allocate_blocks;

	/*
	 * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
	 * a single map so that we avoid partial IO failures due to the rest of
	 * the I/O range not covered by this map triggering an EAGAIN condition
	 * when it is subsequently mapped and aborting the I/O.
	 */
	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
		error = -EAGAIN;
		if (!imap_spans_range(&imap, offset_fsb, end_fsb))
			goto out_unlock;
	}

	/*
	 * For overwrite only I/O, we cannot convert unwritten extents without
	 * requiring sub-block zeroing.  This can only be done under an
	 * exclusive IOLOCK, hence return -EAGAIN if this is not a written
	 * extent to tell the caller to try again.
	 */
	if (flags & IOMAP_OVERWRITE_ONLY) {
		error = -EAGAIN;
		if (imap.br_state != XFS_EXT_NORM &&
		    ((offset | length) & mp->m_blockmask))
			goto out_unlock;
	}

	seq = xfs_iomap_inode_sequence(ip, iomap_flags);
	xfs_iunlock(ip, lockmode);
	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);

allocate_blocks:
	error = -EAGAIN;
	if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
		goto out_unlock;

	/*
	 * We cap the maximum length we map to a sane size to keep the chunks
	 * of work done here somewhat symmetric with the work writeback does.
	 * This is a completely arbitrary number pulled out of thin air as a
	 * best guess for initial testing.
	 *
	 * Note that the values need to be less than 32 bits wide until the
	 * lower level functions are updated.
	 */
	length = min_t(loff_t, length, 1024 * PAGE_SIZE);
	end_fsb = xfs_iomap_end_fsb(mp, offset, length);

	if (offset + length > XFS_ISIZE(ip))
		end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
	else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
		end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
	xfs_iunlock(ip, lockmode);

	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
			flags, &imap, &seq);
	if (error)
		return error;

	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
				 iomap_flags | IOMAP_F_NEW, seq);

out_found_cow:
	length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
	trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
	if (imap.br_startblock != HOLESTARTBLOCK) {
		seq = xfs_iomap_inode_sequence(ip, 0);
		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
		if (error)
			goto out_unlock;
	}
	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
	xfs_iunlock(ip, lockmode);
	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);

out_unlock:
	if (lockmode)
		xfs_iunlock(ip, lockmode);
	return error;
}

const struct iomap_ops xfs_direct_write_iomap_ops = {
	.iomap_begin		= xfs_direct_write_iomap_begin,
};
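
/*
 * Usage note (editorial): the direct I/O read and write paths in xfs_file.c
 * pass these ops to the generic iomap direct I/O code (e.g. iomap_dio_rw()),
 * which calls ->iomap_begin() above for each mapping it needs while carrying
 * out the user's request.
 */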

static int
xfs_dax_write_iomap_end(
	struct inode		*inode,
	loff_t			pos,
	loff_t			length,
	ssize_t			written,
	unsigned		flags,
	struct iomap		*iomap)
{
	struct xfs_inode	*ip = XFS_I(inode);

	if (!xfs_is_cow_inode(ip))
		return 0;

	if (!written) {
		xfs_reflink_cancel_cow_range(ip, pos, length, true);
		return 0;
	}

	return xfs_reflink_end_cow(ip, pos, written);
}

const struct iomap_ops xfs_dax_write_iomap_ops = {
	.iomap_begin	= xfs_direct_write_iomap_begin,
	.iomap_end	= xfs_dax_write_iomap_end,
};

static int
xfs_buffered_write_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			count,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, count);
	struct xfs_bmbt_irec	imap, cmap;
	struct xfs_iext_cursor	icur, ccur;
	xfs_fsblock_t		prealloc_blocks = 0;
	bool			eof = false, cow_eof = false, shared = false;
	int			allocfork = XFS_DATA_FORK;
	int			error = 0;
	unsigned int		lockmode = XFS_ILOCK_EXCL;
	u64			seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* we can't use delayed allocations when using extent size hints */
	if (xfs_get_extsz_hint(ip))
		return xfs_direct_write_iomap_begin(inode, offset, count,
				flags, iomap, srcmap);

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
	if (error)
		return error;

	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
		error = -EFSCORRUPTED;
		goto out_unlock;
	}

	XFS_STATS_INC(mp, xs_blk_mapw);

	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	/*
	 * Search the data fork first to look up our source mapping.  We
	 * always need the data fork map, as we have to return it to the
	 * iomap code so that the higher level write code can read data in to
	 * perform read-modify-write cycles for unaligned writes.
	 */
	eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
	if (eof)
		imap.br_startoff = end_fsb; /* fake hole until the end */

	/* We never need to allocate blocks for zeroing or unsharing a hole. */
	if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) &&
	    imap.br_startoff > offset_fsb) {
		xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
		goto out_unlock;
	}

	/*
	 * For zeroing, trim a delalloc extent that extends beyond the EOF
	 * block.  If it starts beyond the EOF block, convert it to an
	 * unwritten extent.
	 */
	if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
	    isnullstartblock(imap.br_startblock)) {
		xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

		if (offset_fsb >= eof_fsb)
			goto convert_delay;
		if (end_fsb > eof_fsb) {
			end_fsb = eof_fsb;
			xfs_trim_extent(&imap, offset_fsb,
					end_fsb - offset_fsb);
		}
	}

	/*
	 * Search the COW fork extent list even if we did not find a data fork
	 * extent.  This serves two purposes: first this implements the
	 * speculative preallocation using cowextsize, so that we also unshare
	 * blocks adjacent to shared blocks instead of just the shared blocks
	 * themselves.  Second the lookup in the extent list is generally faster
	 * than going out to the shared extent tree.
	 */
	if (xfs_is_cow_inode(ip)) {
		if (!ip->i_cowfp) {
			ASSERT(!xfs_is_reflink_inode(ip));
			xfs_ifork_init_cow(ip);
		}
		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
				&ccur, &cmap);
		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
			trace_xfs_reflink_cow_found(ip, &cmap);
			goto found_cow;
		}
	}

	if (imap.br_startoff <= offset_fsb) {
		/*
		 * For reflink files we may need a delalloc reservation when
		 * overwriting shared extents.  This includes zeroing of
		 * existing extents that contain data.
		 */
		if (!xfs_is_cow_inode(ip) ||
		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
					&imap);
			goto found_imap;
		}

		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);

		/* Trim the mapping to the nearest shared extent boundary. */
		error = xfs_bmap_trim_cow(ip, &imap, &shared);
		if (error)
			goto out_unlock;

		/* Not shared?  Just report the (potentially capped) extent. */
		if (!shared) {
			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
					&imap);
			goto found_imap;
		}

		/*
		 * Fork all the shared blocks from our write offset until the
		 * end of the extent.
		 */
		allocfork = XFS_COW_FORK;
		end_fsb = imap.br_startoff + imap.br_blockcount;
	} else {
		/*
		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
		 * pages to keep the chunks of work done here somewhat
		 * symmetric with the work writeback does.  This is a
		 * completely arbitrary number pulled out of thin air.
		 *
		 * Note that the values need to be less than 32 bits wide until
		 * the lower level functions are updated.
		 */
		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
		end_fsb = xfs_iomap_end_fsb(mp, offset, count);

		if (xfs_is_always_cow_inode(ip))
			allocfork = XFS_COW_FORK;
	}

	if (eof && offset + count > XFS_ISIZE(ip)) {
		/*
		 * Determine the initial size of the preallocation.
		 * We clean up any extra preallocation when the file is closed.
		 */
		if (xfs_has_allocsize(mp))
			prealloc_blocks = mp->m_allocsize_blocks;
		else if (allocfork == XFS_DATA_FORK)
			prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
						offset, count, &icur);
		else
			prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
						offset, count, &ccur);
		if (prealloc_blocks) {
			xfs_extlen_t	align;
			xfs_off_t	end_offset;
			xfs_fileoff_t	p_end_fsb;

			end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
			p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
					prealloc_blocks;

			align = xfs_eof_alignment(ip);
			if (align)
				p_end_fsb = roundup_64(p_end_fsb, align);

			p_end_fsb = min(p_end_fsb,
				XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
			ASSERT(p_end_fsb > offset_fsb);
			prealloc_blocks = p_end_fsb - end_fsb;
		}
	}

	if (allocfork == XFS_COW_FORK) {
		error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
				end_fsb - offset_fsb, prealloc_blocks, &cmap,
				&ccur, cow_eof);
		if (error)
			goto out_unlock;

		trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
		goto found_cow;
	}

	error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
			end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
			eof);
	if (error)
		goto out_unlock;

	/*
	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
	 * them out if the write happens to fail.
	 */
	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
	xfs_iunlock(ip, lockmode);
	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);

found_imap:
	seq = xfs_iomap_inode_sequence(ip, 0);
	xfs_iunlock(ip, lockmode);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);

convert_delay:
	xfs_iunlock(ip, lockmode);
	truncate_pagecache(inode, offset);
	error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
					   iomap, NULL);
	if (error)
		return error;

	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
	return 0;

found_cow:
	seq = xfs_iomap_inode_sequence(ip, 0);
	if (imap.br_startoff <= offset_fsb) {
		error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
		if (error)
			goto out_unlock;
		seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
		xfs_iunlock(ip, lockmode);
		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
					 IOMAP_F_SHARED, seq);
	}

	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
	xfs_iunlock(ip, lockmode);
	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}
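
/*
 * Editorial summary of the function above: a buffered write ends up in one of
 * four places. Either an existing data fork mapping is returned as is
 * (found_imap), an existing or freshly reserved COW fork mapping is returned
 * (found_cow), a brand new delalloc reservation is made in the data fork, or
 * a delalloc extent that sits entirely beyond EOF is converted in place for
 * zeroing (convert_delay).
 */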

static void
xfs_buffered_write_delalloc_punch(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	struct iomap		*iomap)
{
	xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
}

static int
xfs_buffered_write_iomap_end(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	ssize_t			written,
	unsigned		flags,
	struct iomap		*iomap)
{
	iomap_file_buffered_write_punch_delalloc(inode, offset, length, written,
			flags, iomap, &xfs_buffered_write_delalloc_punch);
	return 0;
}

const struct iomap_ops xfs_buffered_write_iomap_ops = {
	.iomap_begin		= xfs_buffered_write_iomap_begin,
	.iomap_end		= xfs_buffered_write_iomap_end,
};

/*
 * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
 * that it allocated to be revoked. Hence we do not need an .iomap_end method
 * for this operation.
 */
const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
	.iomap_begin		= xfs_buffered_write_iomap_begin,
};

static int
xfs_read_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_bmbt_irec	imap;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
	int			nimaps = 1, error = 0;
	bool			shared = false;
	unsigned int		lockmode = XFS_ILOCK_SHARED;
	u64			seq;

	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));

	if (xfs_is_shutdown(mp))
		return -EIO;

	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
	if (error)
		return error;
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, 0);
	if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
	seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
	xfs_iunlock(ip, lockmode);

	if (error)
		return error;
	trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
				 shared ? IOMAP_F_SHARED : 0, seq);
}

const struct iomap_ops xfs_read_iomap_ops = {
	.iomap_begin		= xfs_read_iomap_begin,
};

static int
xfs_seek_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
	xfs_fileoff_t		cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	imap, cmap;
	int			error = 0;
	unsigned		lockmode;
	u64			seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	lockmode = xfs_ilock_data_map_shared(ip);
	error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
	if (error)
		goto out_unlock;

	if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
		/*
		 * If we found a data extent we are done.
		 */
		if (imap.br_startoff <= offset_fsb)
			goto done;
		data_fsb = imap.br_startoff;
	} else {
		/*
		 * Fake a hole until the end of the file.
		 */
		data_fsb = xfs_iomap_end_fsb(mp, offset, length);
	}

	/*
	 * If a COW fork extent covers the hole, report it - capped to the next
	 * data fork extent:
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
		cow_fsb = cmap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		if (data_fsb < cow_fsb + cmap.br_blockcount)
			end_fsb = min(end_fsb, data_fsb);
		xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
		seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
				IOMAP_F_SHARED, seq);
		/*
		 * This is a COW extent, so we must probe the page cache
		 * because there could be dirty page cache being backed
		 * by this extent.
		 */
		iomap->type = IOMAP_UNWRITTEN;
		goto out_unlock;
	}

	/*
	 * Else report a hole, capped to the next found data or COW extent.
	 */
	if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
		imap.br_blockcount = cow_fsb - offset_fsb;
	else
		imap.br_blockcount = data_fsb - offset_fsb;
	imap.br_startoff = offset_fsb;
	imap.br_startblock = HOLESTARTBLOCK;
	imap.br_state = XFS_EXT_NORM;
done:
	seq = xfs_iomap_inode_sequence(ip, 0);
	xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

const struct iomap_ops xfs_seek_iomap_ops = {
	.iomap_begin		= xfs_seek_iomap_begin,
};
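
/*
 * Usage note (editorial): these ops back the SEEK_HOLE/SEEK_DATA
 * implementation, where the generic iomap_seek_hole() and iomap_seek_data()
 * helpers walk the mappings reported by xfs_seek_iomap_begin() above.
 */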

static int
xfs_xattr_iomap_begin(
	struct inode		*inode,
	loff_t			offset,
	loff_t			length,
	unsigned		flags,
	struct iomap		*iomap,
	struct iomap		*srcmap)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + length);
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1, error = 0;
	unsigned		lockmode;
	u64			seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	lockmode = xfs_ilock_attr_map_shared(ip);

	/* if there is no attribute fork or no extents, return ENOENT */
	if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
		error = -ENOENT;
		goto out_unlock;
	}

	ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
	xfs_iunlock(ip, lockmode);

	if (error)
		return error;
	ASSERT(nimaps);
	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
}

const struct iomap_ops xfs_xattr_iomap_ops = {
	.iomap_begin		= xfs_xattr_iomap_begin,
};

int
xfs_zero_range(
	struct xfs_inode	*ip,
	loff_t			pos,
	loff_t			len,
	bool			*did_zero)
{
	struct inode		*inode = VFS_I(ip);

	if (IS_DAX(inode))
		return dax_zero_range(inode, pos, len, did_zero,
				      &xfs_dax_write_iomap_ops);
	return iomap_zero_range(inode, pos, len, did_zero,
				&xfs_buffered_write_iomap_ops);
}

int
xfs_truncate_page(
	struct xfs_inode	*ip,
	loff_t			pos,
	bool			*did_zero)
{
	struct inode		*inode = VFS_I(ip);

	if (IS_DAX(inode))
		return dax_truncate_page(inode, pos, did_zero,
					 &xfs_dax_write_iomap_ops);
	return iomap_truncate_page(inode, pos, did_zero,
				   &xfs_buffered_write_iomap_ops);
}
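
/*
 * Usage note (editorial): xfs_truncate_page() zeroes the remainder of the
 * block that contains the new EOF during a truncate, while xfs_zero_range()
 * handles arbitrary byte ranges; both pick the DAX or buffered iomap ops
 * depending on whether the inode is DAX backed.
 */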