// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"

#define XFS_ALLOC_ALIGN(mp, off) \
        (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
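
/*
 * Illustrative example (not part of the original source): with a 64k
 * "-o allocsize" mount option mp->m_allocsize_log is 16, so
 * XFS_ALLOC_ALIGN(mp, 0x12345) shifts the low 16 bits away and back,
 * yielding 0x10000, i.e. the byte offset rounded down to the previous
 * allocsize boundary.
 */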

static int
xfs_alert_fsblock_zero(
        xfs_inode_t     *ip,
        xfs_bmbt_irec_t *imap)
{
        xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
                        "Access to block zero in inode %llu "
                        "start_block: %llx start_off: %llx "
                        "blkcnt: %llx extent-state: %x",
                (unsigned long long)ip->i_ino,
                (unsigned long long)imap->br_startblock,
                (unsigned long long)imap->br_startoff,
                (unsigned long long)imap->br_blockcount,
                imap->br_state);
        return -EFSCORRUPTED;
}

int
xfs_bmbt_to_iomap(
        struct xfs_inode        *ip,
        struct iomap            *iomap,
        struct xfs_bmbt_irec    *imap,
        u16                     flags)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_buftarg      *target = xfs_inode_buftarg(ip);

        if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
                return xfs_alert_fsblock_zero(ip, imap);

        if (imap->br_startblock == HOLESTARTBLOCK) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_HOLE;
        } else if (imap->br_startblock == DELAYSTARTBLOCK ||
                   isnullstartblock(imap->br_startblock)) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_DELALLOC;
        } else {
                iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
                if (imap->br_state == XFS_EXT_UNWRITTEN)
                        iomap->type = IOMAP_UNWRITTEN;
                else
                        iomap->type = IOMAP_MAPPED;
        }
        iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
        iomap->bdev = target->bt_bdev;
        iomap->dax_dev = target->bt_daxdev;
        iomap->flags = flags;

        if (xfs_ipincount(ip) &&
            (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                iomap->flags |= IOMAP_F_DIRTY;
        return 0;
}
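
/*
 * Illustrative example (values assumed, not part of the original source): on
 * a 4k-block filesystem, a written extent with br_startoff = 8 and
 * br_blockcount = 4 is reported as IOMAP_MAPPED with iomap->offset = 32768
 * and iomap->length = 16384, while a delalloc or hole extent is reported
 * with addr = IOMAP_NULL_ADDR and type IOMAP_DELALLOC or IOMAP_HOLE.
 */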

static void
xfs_hole_to_iomap(
        struct xfs_inode        *ip,
        struct iomap            *iomap,
        xfs_fileoff_t           offset_fsb,
        xfs_fileoff_t           end_fsb)
{
        struct xfs_buftarg      *target = xfs_inode_buftarg(ip);

        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_HOLE;
        iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
        iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
        iomap->bdev = target->bt_bdev;
        iomap->dax_dev = target->bt_daxdev;
}

static inline xfs_fileoff_t
xfs_iomap_end_fsb(
        struct xfs_mount        *mp,
        loff_t                  offset,
        loff_t                  count)
{
        ASSERT(offset <= mp->m_super->s_maxbytes);
        return min(XFS_B_TO_FSB(mp, offset + count),
                   XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
}

static xfs_extlen_t
xfs_eof_alignment(
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_extlen_t            align = 0;

        if (!XFS_IS_REALTIME_INODE(ip)) {
                /*
                 * Round up the allocation request to a stripe unit
                 * (m_dalign) boundary if the file size is >= stripe unit
                 * size, and we are allocating past the allocation eof.
                 *
                 * If mounted with the "-o swalloc" option the alignment is
                 * increased from the stripe unit size to the stripe width.
                 */
                if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
                        align = mp->m_swidth;
                else if (mp->m_dalign)
                        align = mp->m_dalign;

                if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
                        align = 0;
        }

        return align;
}
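
/*
 * Illustrative example (assumed geometry, not part of the original source):
 * with a stripe unit of 128 blocks and a stripe width of 512 blocks
 * (m_dalign = 128, m_swidth = 512), a default mount yields an alignment of
 * 128 blocks, while "-o swalloc" raises it to 512; either way the alignment
 * is dropped back to zero until the file has grown to at least that size.
 */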

/*
 * Check if last_fsb is outside the last extent, and if so grow it to the next
 * stripe unit boundary.
 */
static xfs_fileoff_t
xfs_iomap_eof_align_last_fsb(
        struct xfs_inode        *ip,
        xfs_fileoff_t           end_fsb)
{
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        xfs_extlen_t            extsz = xfs_get_extsz_hint(ip);
        xfs_extlen_t            align = xfs_eof_alignment(ip);
        struct xfs_bmbt_irec    irec;
        struct xfs_iext_cursor  icur;

        ASSERT(ifp->if_flags & XFS_IFEXTENTS);

        /*
         * Always round up the allocation request to the extent hint boundary.
         */
        if (extsz) {
                if (align)
                        align = roundup_64(align, extsz);
                else
                        align = extsz;
        }

        if (align) {
                xfs_fileoff_t   aligned_end_fsb = roundup_64(end_fsb, align);

                xfs_iext_last(ifp, &icur);
                if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
                    aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
                        return aligned_end_fsb;
        }

        return end_fsb;
}

int
xfs_iomap_write_direct(
        struct xfs_inode        *ip,
        xfs_fileoff_t           offset_fsb,
        xfs_fileoff_t           count_fsb,
        struct xfs_bmbt_irec    *imap)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        xfs_filblks_t           resaligned;
        int                     nimaps;
        int                     quota_flag;
        uint                    qblocks, resblks;
        unsigned int            resrtextents = 0;
        int                     error;
        int                     bmapi_flags = XFS_BMAPI_PREALLOC;
        uint                    tflags = 0;

        ASSERT(count_fsb > 0);

        resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
                                           xfs_get_extsz_hint(ip));
        if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
                resrtextents = qblocks = resaligned;
                resrtextents /= mp->m_sb.sb_rextsize;
                resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                quota_flag = XFS_QMOPT_RES_RTBLKS;
        } else {
                resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
                quota_flag = XFS_QMOPT_RES_REGBLKS;
        }

        error = xfs_qm_dqattach(ip);
        if (error)
                return error;

        /*
         * For DAX, we do not allocate unwritten extents, but instead we zero
         * the block before we commit the transaction.  Ideally we'd like to do
         * this outside the transaction context, but if we commit and then crash
         * we may not have zeroed the blocks and this will be exposed on
         * recovery of the allocation. Hence we must zero before commit.
         *
         * Further, if we are mapping unwritten extents here, we need to zero
         * and convert them to written so that we don't need an unwritten extent
         * callback for DAX. This also means that we need to be able to dip into
         * the reserve block pool for bmbt block allocation if there is no space
         * left but we need to do unwritten extent conversion.
         */
        if (IS_DAX(VFS_I(ip))) {
                bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
                if (imap->br_state == XFS_EXT_UNWRITTEN) {
                        tflags |= XFS_TRANS_RESERVE;
                        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
                }
        }
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
                        tflags, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);

        error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
        if (error)
                goto out_trans_cancel;

        xfs_trans_ijoin(tp, ip, 0);

        /*
         * From this point onwards we overwrite the imap pointer that the
         * caller gave to us.
         */
        nimaps = 1;
        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
                                imap, &nimaps);
        if (error)
                goto out_res_cancel;

        /*
         * Complete the transaction
         */
        error = xfs_trans_commit(tp);
        if (error)
                goto out_unlock;

        /*
         * Copy any maps to caller's array and return any error.
         */
        if (nimaps == 0) {
                error = -ENOSPC;
                goto out_unlock;
        }

        if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
                error = xfs_alert_fsblock_zero(ip, imap);

out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;

out_res_cancel:
        xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
        xfs_trans_cancel(tp);
        goto out_unlock;
}
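
/*
 * Usage sketch (for illustration only): the allocate_blocks path of
 * xfs_direct_write_iomap_begin() below drops its ilock and then calls
 *
 *      error = xfs_iomap_write_direct(ip, offset_fsb,
 *                      end_fsb - offset_fsb, &imap);
 *
 * before translating the returned imap with xfs_bmbt_to_iomap(); the
 * XFS_ILOCK_EXCL needed for the allocation is taken inside this function.
 */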

static bool
xfs_quota_need_throttle(
        struct xfs_inode        *ip,
        int                     type,
        xfs_fsblock_t           alloc_blocks)
{
        struct xfs_dquot        *dq = xfs_inode_dquot(ip, type);

        if (!dq || !xfs_this_quota_on(ip->i_mount, type))
                return false;

        /* no hi watermark, no throttle */
        if (!dq->q_prealloc_hi_wmark)
                return false;

        /* under the lo watermark, no throttle */
        if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
                return false;

        return true;
}

static void
xfs_quota_calc_throttle(
        struct xfs_inode        *ip,
        int                     type,
        xfs_fsblock_t           *qblocks,
        int                     *qshift,
        int64_t                 *qfreesp)
{
        int64_t                 freesp;
        int                     shift = 0;
        struct xfs_dquot        *dq = xfs_inode_dquot(ip, type);

        /* no dq, or over hi wmark, squash the prealloc completely */
        if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
                *qblocks = 0;
                *qfreesp = 0;
                return;
        }

        freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
        if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
                shift = 2;
                if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
                        shift += 2;
                if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
                        shift += 2;
        }

        if (freesp < *qfreesp)
                *qfreesp = freesp;

        /* only overwrite the throttle values if we are more aggressive */
        if ((freesp >> shift) < (*qblocks >> *qshift)) {
                *qblocks = freesp;
                *qshift = shift;
        }
}
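
/*
 * Worked example (assumed numbers, not part of the original source): with
 * 100 blocks of headroom left below the dquot's high watermark, sitting
 * below the 5% and 3% low-space thresholds but above the 1% one, shift
 * becomes 2 + 2 = 4, so the caller's preallocation is later cut to 1/16th;
 * *qblocks and *qshift are only overwritten if 100 >> 4 is smaller than the
 * throttle the caller already carries.
 */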

/*
 * If we are doing a write at the end of the file and there are no allocations
 * past this one, then extend the allocation out to the file system's write
 * iosize.
 *
 * If we don't have a user specified preallocation size, dynamically increase
 * the preallocation size as the size of the file grows.  Cap the maximum size
 * at a single extent or less if the filesystem is near full. The closer the
 * filesystem is to full, the smaller the maximum preallocation.
 *
 * As an exception we don't do any preallocation at all if the file is smaller
 * than the minimum preallocation and we are using the default dynamic
 * preallocation scheme, as it is likely this is the only write to the file
 * that is going to be done.
 *
 * We clean up any extra space left over when the file is closed in
 * xfs_inactive().
 */
static xfs_fsblock_t
xfs_iomap_prealloc_size(
        struct xfs_inode        *ip,
        int                     whichfork,
        loff_t                  offset,
        loff_t                  count,
        struct xfs_iext_cursor  *icur)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        struct xfs_bmbt_irec    prev;
        int                     shift = 0;
        int64_t                 freesp;
        xfs_fsblock_t           qblocks;
        int                     qshift = 0;
        xfs_fsblock_t           alloc_blocks = 0;

        if (offset + count <= XFS_ISIZE(ip))
                return 0;

        if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
            (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)))
                return 0;

        /*
         * If an explicit allocsize is set, the file is small, or we
         * are writing behind a hole, then use the minimum prealloc:
         */
        if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) ||
            XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
            !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
            prev.br_startoff + prev.br_blockcount < offset_fsb)
                return mp->m_allocsize_blocks;

        /*
         * Determine the initial size of the preallocation. We are beyond the
         * current EOF here, but we need to take into account whether this is
         * a sparse write or an extending write when determining the
         * preallocation size.  Hence we need to look up the extent that ends
         * at the current write offset and use the result to determine the
         * preallocation size.
         *
         * If the extent is a hole, then preallocation is essentially disabled.
         * Otherwise we take the size of the preceding data extent as the basis
         * for the preallocation size. If the size of the extent is greater than
         * half the maximum extent length, then use the current offset as the
         * basis. This ensures that for large files the preallocation size
         * always extends to MAXEXTLEN rather than falling short due to things
         * like stripe unit/width alignment of real extents.
         */
        if (prev.br_blockcount <= (MAXEXTLEN >> 1))
                alloc_blocks = prev.br_blockcount << 1;
        else
                alloc_blocks = XFS_B_TO_FSB(mp, offset);
        if (!alloc_blocks)
                goto check_writeio;
        qblocks = alloc_blocks;

        /*
         * MAXEXTLEN is not a power of two value but we round the prealloc down
         * to the nearest power of two value after throttling. To prevent the
         * round down from unconditionally reducing the maximum supported
         * prealloc size, we round up first, apply appropriate throttling,
         * round down and cap the value to MAXEXTLEN.
         */
        alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
                                       alloc_blocks);

        freesp = percpu_counter_read_positive(&mp->m_fdblocks);
        if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
                shift = 2;
                if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
                        shift++;
                if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
                        shift++;
                if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
                        shift++;
                if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
                        shift++;
        }

        /*
         * Check each quota to cap the prealloc size, provide a shift value to
         * throttle with and adjust amount of available space.
         */
        if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
                xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
                                        &freesp);
        if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
                xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
                                        &freesp);
        if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
                xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
                                        &freesp);

        /*
         * The final prealloc size is set to the minimum of free space available
         * in each of the quotas and the overall filesystem.
         *
         * The shift throttle value is set to the maximum value as determined by
         * the global low free space values and per-quota low free space values.
         */
        alloc_blocks = min(alloc_blocks, qblocks);
        shift = max(shift, qshift);

        if (shift)
                alloc_blocks >>= shift;
        /*
         * rounddown_pow_of_two() returns an undefined result if we pass in
         * alloc_blocks = 0.
         */
        if (alloc_blocks)
                alloc_blocks = rounddown_pow_of_two(alloc_blocks);
        if (alloc_blocks > MAXEXTLEN)
                alloc_blocks = MAXEXTLEN;

        /*
         * If we are still trying to allocate more space than is
         * available, squash the prealloc hard. This can happen if we
         * have a large file on a small filesystem and the above
         * lowspace thresholds are smaller than MAXEXTLEN.
         */
        while (alloc_blocks && alloc_blocks >= freesp)
                alloc_blocks >>= 4;
check_writeio:
        if (alloc_blocks < mp->m_allocsize_blocks)
                alloc_blocks = mp->m_allocsize_blocks;
        trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
                                      mp->m_allocsize_blocks);
        return alloc_blocks;
}
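
/*
 * Worked example (assumed numbers, not part of the original source): an
 * extending write whose preceding extent is 256 blocks long starts with
 * alloc_blocks = 512 (double the previous extent).  With ample free space no
 * shift is applied and rounddown_pow_of_two() leaves it at 512 blocks.  If
 * the filesystem had dropped below the 5% and 4% low-space thresholds, the
 * shift would be 3 and the preallocation would shrink to 64 blocks, but it
 * is never allowed to fall below mp->m_allocsize_blocks.
 */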

int
xfs_iomap_write_unwritten(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        xfs_off_t       count,
        bool            update_isize)
{
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
        xfs_filblks_t   count_fsb;
        xfs_filblks_t   numblks_fsb;
        int             nimaps;
        xfs_trans_t     *tp;
        xfs_bmbt_irec_t imap;
        struct inode    *inode = VFS_I(ip);
        xfs_fsize_t     i_size;
        uint            resblks;
        int             error;

        trace_xfs_unwritten_convert(ip, offset, count);

        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
        count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);

        /*
         * Reserve enough blocks in this transaction for two complete extent
         * btree splits.  We may be converting the middle part of an unwritten
         * extent and in this case we will insert two new extents in the btree
         * each of which could cause a full split.
         *
         * This reservation amount will be used in the first call to
         * xfs_bmbt_split() to select an AG with enough space to satisfy the
         * rest of the operation.
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;

        /* Attach dquots so that bmbt splits are accounted correctly. */
        error = xfs_qm_dqattach(ip);
        if (error)
                return error;

        do {
                /*
                 * Set up a transaction to convert the range of extents
                 * from unwritten to real. Do allocations in a loop until
                 * we have covered the range passed in.
                 *
                 * Note that we can't risk recursing back into the filesystem
                 * here as we might be asked to write out the same inode that we
                 * complete here and might deadlock on the iolock.
                 */
                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
                                XFS_TRANS_RESERVE, &tp);
                if (error)
                        return error;

                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);

                error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
                                XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error_on_bmapi_transaction;

                /*
                 * Modify the unwritten extent state of the buffer.
                 */
                nimaps = 1;
                error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
                                        XFS_BMAPI_CONVERT, resblks, &imap,
                                        &nimaps);
                if (error)
                        goto error_on_bmapi_transaction;

                /*
                 * Log the updated inode size as we go.  We have to be careful
                 * to only log it up to the actual write offset if it is
                 * halfway into a block.
                 */
                i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
                if (i_size > offset + count)
                        i_size = offset + count;
                if (update_isize && i_size > i_size_read(inode))
                        i_size_write(inode, i_size);
                i_size = xfs_new_eof(ip, i_size);
                if (i_size) {
                        ip->i_d.di_size = i_size;
                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                }

                error = xfs_trans_commit(tp);
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                if (error)
                        return error;

                if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
                        return xfs_alert_fsblock_zero(ip, &imap);

                if ((numblks_fsb = imap.br_blockcount) == 0) {
                        /*
                         * The numblks_fsb value should always get
                         * smaller, otherwise the loop is stuck.
                         */
                        ASSERT(imap.br_blockcount);
                        break;
                }
                offset_fsb += numblks_fsb;
                count_fsb -= numblks_fsb;
        } while (count_fsb > 0);

        return 0;

error_on_bmapi_transaction:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}
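
/*
 * Usage sketch (for illustration only, the call sites live outside this
 * file): I/O completion handlers for direct I/O and writeback call this to
 * convert unwritten extents once the data has been written, roughly:
 *
 *      error = xfs_iomap_write_unwritten(ip, offset, size, true);
 */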

static inline bool
imap_needs_alloc(
        struct inode            *inode,
        unsigned                flags,
        struct xfs_bmbt_irec    *imap,
        int                     nimaps)
{
        /* don't allocate blocks when just zeroing */
        if (flags & IOMAP_ZERO)
                return false;
        if (!nimaps ||
            imap->br_startblock == HOLESTARTBLOCK ||
            imap->br_startblock == DELAYSTARTBLOCK)
                return true;
        /* we convert unwritten extents before copying the data for DAX */
        if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
                return true;
        return false;
}

static inline bool
imap_needs_cow(
        struct xfs_inode        *ip,
        unsigned int            flags,
        struct xfs_bmbt_irec    *imap,
        int                     nimaps)
{
        if (!xfs_is_cow_inode(ip))
                return false;

        /* when zeroing we don't have to COW holes or unwritten extents */
        if (flags & IOMAP_ZERO) {
                if (!nimaps ||
                    imap->br_startblock == HOLESTARTBLOCK ||
                    imap->br_state == XFS_EXT_UNWRITTEN)
                        return false;
        }

        return true;
}

static int
xfs_ilock_for_iomap(
        struct xfs_inode        *ip,
        unsigned                flags,
        unsigned                *lockmode)
{
        unsigned                mode = XFS_ILOCK_SHARED;
        bool                    is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);

        /*
         * COW writes may allocate delalloc space or convert unwritten COW
         * extents, so we need to make sure to take the lock exclusively here.
         */
        if (xfs_is_cow_inode(ip) && is_write)
                mode = XFS_ILOCK_EXCL;

        /*
         * Extents not yet cached require exclusive access, don't block.  This
         * is an opencoded xfs_ilock_data_map_shared() call but with
         * non-blocking behaviour.
         */
        if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
                if (flags & IOMAP_NOWAIT)
                        return -EAGAIN;
                mode = XFS_ILOCK_EXCL;
        }

relock:
        if (flags & IOMAP_NOWAIT) {
                if (!xfs_ilock_nowait(ip, mode))
                        return -EAGAIN;
        } else {
                xfs_ilock(ip, mode);
        }

        /*
         * The reflink iflag could have changed since the earlier unlocked
         * check, so if we got ILOCK_SHARED for a write but we're now a
         * reflink inode we have to switch to ILOCK_EXCL and relock.
         */
        if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
                xfs_iunlock(ip, mode);
                mode = XFS_ILOCK_EXCL;
                goto relock;
        }

        *lockmode = mode;
        return 0;
}

static int
xfs_direct_write_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        unsigned                flags,
        struct iomap            *iomap,
        struct iomap            *srcmap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_bmbt_irec    imap, cmap;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, length);
        int                     nimaps = 1, error = 0;
        bool                    shared = false;
        u16                     iomap_flags = 0;
        unsigned                lockmode;

        ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        /*
         * Writes that span EOF might trigger an IO size update on completion,
         * so consider them to be dirty for the purposes of O_DSYNC even if
         * no other metadata changes are pending or have been made here.
         */
        if (offset + length > i_size_read(inode))
                iomap_flags |= IOMAP_F_DIRTY;

        error = xfs_ilock_for_iomap(ip, flags, &lockmode);
        if (error)
                return error;

        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
                               &nimaps, 0);
        if (error)
                goto out_unlock;

        if (imap_needs_cow(ip, flags, &imap, nimaps)) {
                error = -EAGAIN;
                if (flags & IOMAP_NOWAIT)
                        goto out_unlock;

                /* may drop and re-acquire the ilock */
                error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
                                &lockmode, flags & IOMAP_DIRECT);
                if (error)
                        goto out_unlock;
                if (shared)
                        goto out_found_cow;
                end_fsb = imap.br_startoff + imap.br_blockcount;
                length = XFS_FSB_TO_B(mp, end_fsb) - offset;
        }

        if (imap_needs_alloc(inode, flags, &imap, nimaps))
                goto allocate_blocks;

        xfs_iunlock(ip, lockmode);
        trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);

allocate_blocks:
        error = -EAGAIN;
        if (flags & IOMAP_NOWAIT)
                goto out_unlock;

        /*
         * We cap the maximum length we map to a sane size to keep the chunks
         * of work done here somewhat symmetric with the work writeback does.
         * This is a completely arbitrary number pulled out of thin air as a
         * best guess for initial testing.
         *
         * Note that the value needs to be less than 32 bits wide until the
         * lower level functions are updated.
         */
        length = min_t(loff_t, length, 1024 * PAGE_SIZE);
        end_fsb = xfs_iomap_end_fsb(mp, offset, length);

        if (offset + length > XFS_ISIZE(ip))
                end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
        else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
                end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
        xfs_iunlock(ip, lockmode);

        error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
                        &imap);
        if (error)
                return error;

        trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);

out_found_cow:
        xfs_iunlock(ip, lockmode);
        length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
        trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
        if (imap.br_startblock != HOLESTARTBLOCK) {
                error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
                if (error)
                        return error;
        }
        return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);

out_unlock:
        xfs_iunlock(ip, lockmode);
        return error;
}

const struct iomap_ops xfs_direct_write_iomap_ops = {
        .iomap_begin            = xfs_direct_write_iomap_begin,
};
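
/*
 * Usage sketch (for illustration only): the direct I/O write path in
 * xfs_file.c hands this table to the generic iomap code, roughly:
 *
 *      ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 *                      &xfs_dio_write_ops, ...);
 */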

static int
xfs_buffered_write_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  count,
        unsigned                flags,
        struct iomap            *iomap,
        struct iomap            *srcmap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, count);
        struct xfs_bmbt_irec    imap, cmap;
        struct xfs_iext_cursor  icur, ccur;
        xfs_fsblock_t           prealloc_blocks = 0;
        bool                    eof = false, cow_eof = false, shared = false;
        int                     allocfork = XFS_DATA_FORK;
        int                     error = 0;

        /* we can't use delayed allocations when using extent size hints */
        if (xfs_get_extsz_hint(ip))
                return xfs_direct_write_iomap_begin(inode, offset, count,
                                flags, iomap, srcmap);

        ASSERT(!XFS_IS_REALTIME_INODE(ip));

        xfs_ilock(ip, XFS_ILOCK_EXCL);

        if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) ||
            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
                error = -EFSCORRUPTED;
                goto out_unlock;
        }

        XFS_STATS_INC(mp, xs_blk_mapw);

        if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
                if (error)
                        goto out_unlock;
        }

        /*
         * Search the data fork first to look up our source mapping.  We
         * always need the data fork map, as we have to return it to the
         * iomap code so that the higher level write code can read data in to
         * perform read-modify-write cycles for unaligned writes.
         */
        eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
        if (eof)
                imap.br_startoff = end_fsb; /* fake hole until the end */

        /* We never need to allocate blocks for zeroing a hole. */
        if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
                xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
                goto out_unlock;
        }

        /*
         * Search the COW fork extent list even if we did not find a data fork
         * extent.  This serves two purposes: first this implements the
         * speculative preallocation using cowextsize, so that we also unshare
         * blocks adjacent to shared blocks instead of just the shared blocks
         * themselves. Second the lookup in the extent list is generally faster
         * than going out to the shared extent tree.
         */
        if (xfs_is_cow_inode(ip)) {
                if (!ip->i_cowfp) {
                        ASSERT(!xfs_is_reflink_inode(ip));
                        xfs_ifork_init_cow(ip);
                }
                cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
                                &ccur, &cmap);
                if (!cow_eof && cmap.br_startoff <= offset_fsb) {
                        trace_xfs_reflink_cow_found(ip, &cmap);
                        goto found_cow;
                }
        }

        if (imap.br_startoff <= offset_fsb) {
                /*
                 * For reflink files we may need a delalloc reservation when
                 * overwriting shared extents.  This includes zeroing of
                 * existing extents that contain data.
                 */
                if (!xfs_is_cow_inode(ip) ||
                    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
                        trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
                                        &imap);
                        goto found_imap;
                }

                xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);

                /* Trim the mapping to the nearest shared extent boundary. */
                error = xfs_bmap_trim_cow(ip, &imap, &shared);
                if (error)
                        goto out_unlock;

                /* Not shared?  Just report the (potentially capped) extent. */
                if (!shared) {
                        trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
                                        &imap);
                        goto found_imap;
                }

                /*
                 * Fork all the shared blocks from our write offset until the
                 * end of the extent.
                 */
                allocfork = XFS_COW_FORK;
                end_fsb = imap.br_startoff + imap.br_blockcount;
        } else {
                /*
                 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
                 * pages to keep the chunks of work done here somewhat
                 * symmetric with the work writeback does.  This is a completely
                 * arbitrary number pulled out of thin air.
                 *
                 * Note that the value needs to be less than 32 bits wide until
                 * the lower level functions are updated.
                 */
                count = min_t(loff_t, count, 1024 * PAGE_SIZE);
                end_fsb = xfs_iomap_end_fsb(mp, offset, count);

                if (xfs_is_always_cow_inode(ip))
                        allocfork = XFS_COW_FORK;
        }

        error = xfs_qm_dqattach_locked(ip, false);
        if (error)
                goto out_unlock;

        if (eof) {
                prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset,
                                count, &icur);
                if (prealloc_blocks) {
                        xfs_extlen_t    align;
                        xfs_off_t       end_offset;
                        xfs_fileoff_t   p_end_fsb;

                        end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
                        p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
                                    prealloc_blocks;

                        align = xfs_eof_alignment(ip);
                        if (align)
                                p_end_fsb = roundup_64(p_end_fsb, align);

                        p_end_fsb = min(p_end_fsb,
                                XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
                        ASSERT(p_end_fsb > offset_fsb);
                        prealloc_blocks = p_end_fsb - end_fsb;
                }
        }

retry:
        error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
                        end_fsb - offset_fsb, prealloc_blocks,
                        allocfork == XFS_DATA_FORK ? &imap : &cmap,
                        allocfork == XFS_DATA_FORK ? &icur : &ccur,
                        allocfork == XFS_DATA_FORK ? eof : cow_eof);
        switch (error) {
        case 0:
                break;
        case -ENOSPC:
        case -EDQUOT:
                /* retry without any preallocation */
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (prealloc_blocks) {
                        prealloc_blocks = 0;
                        goto retry;
                }
                /*FALLTHRU*/
        default:
                goto out_unlock;
        }

        if (allocfork == XFS_COW_FORK) {
                trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
                goto found_cow;
        }

        /*
         * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
         * them out if the write happens to fail.
         */
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);

found_imap:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);

found_cow:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        if (imap.br_startoff <= offset_fsb) {
                error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
                if (error)
                        return error;
        } else {
                xfs_trim_extent(&cmap, offset_fsb,
                                imap.br_startoff - offset_fsb);
        }
        return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);

out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
}

static int
xfs_buffered_write_iomap_end(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        ssize_t                 written,
        unsigned                flags,
        struct iomap            *iomap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           start_fsb;
        xfs_fileoff_t           end_fsb;
        int                     error = 0;

        if (iomap->type != IOMAP_DELALLOC)
                return 0;

        /*
         * Behave as if the write failed if drop writes is enabled. Set the NEW
         * flag to force delalloc cleanup.
         */
        if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
                iomap->flags |= IOMAP_F_NEW;
                written = 0;
        }

        /*
         * start_fsb refers to the first unused block after a short write. If
         * nothing was written, round offset down to point at the first block
         * in the range.
         */
        if (unlikely(!written))
                start_fsb = XFS_B_TO_FSBT(mp, offset);
        else
                start_fsb = XFS_B_TO_FSB(mp, offset + written);
        end_fsb = XFS_B_TO_FSB(mp, offset + length);

        /*
         * Trim delalloc blocks if they were allocated by this write and we
         * didn't manage to write the whole range.
         *
         * We don't need to care about racing delalloc as we hold i_mutex
         * across the reserve/allocate/unreserve calls. If there are delalloc
         * blocks in the range, they are ours.
         */
        if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
                truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
                                         XFS_FSB_TO_B(mp, end_fsb) - 1);

                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                                               end_fsb - start_fsb);
                if (error && !XFS_FORCED_SHUTDOWN(mp)) {
                        xfs_alert(mp, "%s: unable to clean up ino %lld",
                                __func__, ip->i_ino);
                        return error;
                }
        }

        return 0;
}

const struct iomap_ops xfs_buffered_write_iomap_ops = {
        .iomap_begin            = xfs_buffered_write_iomap_begin,
        .iomap_end              = xfs_buffered_write_iomap_end,
};
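
/*
 * Usage sketch (for illustration only): the buffered write path in xfs_file.c
 * drives this table through the generic iomap helper, roughly:
 *
 *      ret = iomap_file_buffered_write(iocb, from,
 *                      &xfs_buffered_write_iomap_ops);
 */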

static int
xfs_read_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        unsigned                flags,
        struct iomap            *iomap,
        struct iomap            *srcmap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_bmbt_irec    imap;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, length);
        int                     nimaps = 1, error = 0;
        bool                    shared = false;
        unsigned                lockmode;

        ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        error = xfs_ilock_for_iomap(ip, flags, &lockmode);
        if (error)
                return error;
        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
                               &nimaps, 0);
        if (!error && (flags & IOMAP_REPORT))
                error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
        xfs_iunlock(ip, lockmode);

        if (error)
                return error;
        trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
}

const struct iomap_ops xfs_read_iomap_ops = {
        .iomap_begin            = xfs_read_iomap_begin,
};
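
/*
 * Usage sketch (for illustration only): read-side callers such as buffered
 * readpage/readahead, DAX reads and direct reads pass this table to the
 * generic iomap helpers, e.g. roughly:
 *
 *      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, ...);
 */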

static int
xfs_seek_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        unsigned                flags,
        struct iomap            *iomap,
        struct iomap            *srcmap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    imap, cmap;
        int                     error = 0;
        unsigned                lockmode;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        lockmode = xfs_ilock_data_map_shared(ip);
        if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
                if (error)
                        goto out_unlock;
        }

        if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
                /*
                 * If we found a data extent we are done.
                 */
                if (imap.br_startoff <= offset_fsb)
                        goto done;
                data_fsb = imap.br_startoff;
        } else {
                /*
                 * Fake a hole until the end of the file.
                 */
                data_fsb = xfs_iomap_end_fsb(mp, offset, length);
        }

        /*
         * If a COW fork extent covers the hole, report it - capped to the next
         * data fork extent:
         */
        if (xfs_inode_has_cow_data(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
                cow_fsb = cmap.br_startoff;
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                if (data_fsb < cow_fsb + cmap.br_blockcount)
                        end_fsb = min(end_fsb, data_fsb);
                xfs_trim_extent(&cmap, offset_fsb, end_fsb);
                error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
                /*
                 * This is a COW extent, so we must probe the page cache
                 * because there could be dirty page cache being backed
                 * by this extent.
                 */
                iomap->type = IOMAP_UNWRITTEN;
                goto out_unlock;
        }

        /*
         * Else report a hole, capped to the next found data or COW extent.
         */
        if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
                imap.br_blockcount = cow_fsb - offset_fsb;
        else
                imap.br_blockcount = data_fsb - offset_fsb;
        imap.br_startoff = offset_fsb;
        imap.br_startblock = HOLESTARTBLOCK;
        imap.br_state = XFS_EXT_NORM;
done:
        xfs_trim_extent(&imap, offset_fsb, end_fsb);
        error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
out_unlock:
        xfs_iunlock(ip, lockmode);
        return error;
}

const struct iomap_ops xfs_seek_iomap_ops = {
        .iomap_begin            = xfs_seek_iomap_begin,
};
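
/*
 * Usage sketch (for illustration only): SEEK_HOLE and SEEK_DATA in
 * xfs_file_llseek() are built on this table, roughly:
 *
 *      offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
 */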

static int
xfs_xattr_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        unsigned                flags,
        struct iomap            *iomap,
        struct iomap            *srcmap)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
        struct xfs_bmbt_irec    imap;
        int                     nimaps = 1, error = 0;
        unsigned                lockmode;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        lockmode = xfs_ilock_attr_map_shared(ip);

        /* if there is no attribute fork or no extents, return ENOENT */
        if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
                error = -ENOENT;
                goto out_unlock;
        }

        ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
                               &nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
        xfs_iunlock(ip, lockmode);

        if (error)
                return error;
        ASSERT(nimaps);
        return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
}

const struct iomap_ops xfs_xattr_iomap_ops = {
        .iomap_begin            = xfs_xattr_iomap_begin,
};
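
/*
 * Usage sketch (for illustration only): the FIEMAP_FLAG_XATTR case of
 * ->fiemap walks the attribute fork through this table, roughly:
 *
 *      error = iomap_fiemap(inode, fieinfo, start, length,
 *                      &xfs_xattr_iomap_ops);
 */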