// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_errortag.h"
#include "xfs_health.h"
#include "xfs_exchmaps_item.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_symlink_remote.h"

struct kmem_cache	*xfs_exchmaps_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_exchmaps_adjacent {
	struct xfs_bmbt_irec	left1;
	struct xfs_bmbt_irec	right1;
	struct xfs_bmbt_irec	left2;
	struct xfs_bmbt_irec	right2;
};

#define ADJACENT_INIT { \
	.left1  = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2  = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}
/* Information to reset reflink flag / CoW fork state after an exchange. */

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them.  If there's a CoW fork and
 * it has mappings in it, make sure the inodes are tagged appropriately so
 * that speculative preallocations can be GC'd if we run low on space.
 */
static inline void
xfs_exchmaps_ensure_cowfork(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*cfork;

	if (xfs_is_reflink_inode(ip))
		xfs_ifork_init_cow(ip);

	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (!cfork)
		return;
	if (cfork->if_bytes > 0)
		xfs_inode_set_cowblocks_tag(ip);
	else
		xfs_inode_clear_cowblocks_tag(ip);
}
/*
 * Adjust the on-disk inode size upwards if needed so that we never add
 * mappings into the file past EOF.  This is crucial so that log recovery
 * won't get confused by the sudden appearance of post-eof mappings.
 */
static void
xfs_exchmaps_update_size(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fsize_t		new_isize)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_fsize_t		len;

	if (new_isize < 0)
		return;

	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
		  new_isize);

	if (len <= ip->i_disk_size)
		return;

	trace_xfs_exchmaps_update_inode_size(ip, len);

	ip->i_disk_size = len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
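
/*
 * For example, with 4k fsblocks, exchanging a mapping with br_startoff 16
 * and br_blockcount 4 into a file whose new_isize is 70000 bytes computes
 * len = min(XFS_FSB_TO_B(mp, 20), 70000) = min(81920, 70000) = 70000, so
 * i_disk_size grows to cover the new mapping but never beyond the requested
 * size.
 */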
/* Advance the incore state tracking after exchanging a mapping. */
static inline void
xmi_advance(
	struct xfs_exchmaps_intent	*xmi,
	const struct xfs_bmbt_irec	*irec)
{
	xmi->xmi_startoff1 += irec->br_blockcount;
	xmi->xmi_startoff2 += irec->br_blockcount;
	xmi->xmi_blockcount -= irec->br_blockcount;
}

/* Do we still have more mappings to exchange? */
static inline bool
xmi_has_more_exchange_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_blockcount > 0;
}

/* Do we have post-operation cleanups to perform? */
static inline bool
xmi_has_postop_work(const struct xfs_exchmaps_intent *xmi)
{
	return xmi->xmi_flags & (XFS_EXCHMAPS_CLEAR_INO1_REFLINK |
				 XFS_EXCHMAPS_CLEAR_INO2_REFLINK |
				 __XFS_EXCHMAPS_INO2_SHORTFORM);
}
/* Check all mappings to make sure we can actually exchange them. */
int
xfs_exchmaps_check_forks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_ifork		*ifp1, *ifp2;
	int				whichfork = xfs_exchmaps_reqfork(req);

	/* No fork? */
	ifp1 = xfs_ifork_ptr(req->ip1, whichfork);
	ifp2 = xfs_ifork_ptr(req->ip2, whichfork);
	if (!ifp1 || !ifp2)
		return -EINVAL;

	/* We don't know how to exchange local format forks. */
	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	return 0;
}
#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_exchmaps_update_quota(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int64_t			ip1_delta = 0, ip2_delta = 0;
	unsigned int		qflag;

	qflag = XFS_IS_REALTIME_INODE(xmi->xmi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
						      XFS_TRANS_DQ_BCOUNT;

	if (xfs_bmap_is_real_extent(irec1)) {
		ip1_delta -= irec1->br_blockcount;
		ip2_delta += irec1->br_blockcount;
	}

	if (xfs_bmap_is_real_extent(irec2)) {
		ip1_delta += irec2->br_blockcount;
		ip2_delta -= irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, xmi->xmi_ip2, qflag, ip2_delta);
}
#else
# define xfs_exchmaps_update_quota(tp, xmi, irec1, irec2)	((void)0)
#endif /* CONFIG_XFS_QUOTA */
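
/*
 * For example, if irec1 maps 8 written blocks in file1 and irec2 maps a
 * hole in file2, the exchange moves 8 blocks from file1 to file2, so
 * ip1_delta ends up at -8 and ip2_delta at +8.  When both mappings are real
 * and (after trimming) the same length, the deltas cancel to zero and
 * neither inode's quota block count changes.
 */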
/* Decide if we want to skip this mapping from file1. */
static bool
xfs_exchmaps_can_skip_mapping(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec)
{
	struct xfs_mount	*mp = xmi->xmi_ip1->i_mount;

	/* Do not skip this mapping if the caller did not tell us to. */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_INO1_WRITTEN))
		return false;

	/* Do not skip mapped, written mappings. */
	if (xfs_bmap_is_written_extent(irec))
		return false;

	/*
	 * The mapping is unwritten or a hole.  It cannot be a delalloc
	 * reservation because we already excluded those.  It cannot be an
	 * unwritten extent with dirty page cache because we flushed the page
	 * cache.  For files where the allocation unit is 1FSB (files on the
	 * data dev, rt files if the extent size is 1FSB), we can safely
	 * skip this mapping.
	 */
	if (!xfs_inode_has_bigrtalloc(xmi->xmi_ip1))
		return true;

	/*
	 * For a realtime file with a multi-fsb allocation unit, the decision
	 * is trickier because we can only swap full allocation units.
	 * Unwritten mappings can appear in the middle of an rtx if the rtx is
	 * partially written, but they can also appear for preallocations.
	 *
	 * If the mapping is a hole, skip it entirely.  Holes should align
	 * with rtx boundaries.
	 */
	if (!xfs_bmap_is_real_extent(irec))
		return true;

	/*
	 * All mappings below this point are unwritten.
	 *
	 * - If the beginning is not aligned to an rtx, trim the end of the
	 *   mapping so that it does not cross an rtx boundary, and swap it.
	 *
	 * - If both ends are aligned to an rtx, skip the entire mapping.
	 */
	if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
		xfs_fileoff_t	new_end;

		new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
		irec->br_blockcount = min(irec->br_blockcount,
					  new_end - irec->br_startoff);
		return false;
	}

	if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
		return true;

	/*
	 * All mappings below this point are unwritten, start on an rtx
	 * boundary, and do not end on an rtx boundary.
	 *
	 * - If the mapping is longer than one rtx, trim the end of the
	 *   mapping down to an rtx boundary and skip it.
	 *
	 * - The mapping is shorter than one rtx.  Swap it.
	 */
	if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
		xfs_fileoff_t	new_end;

		new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
				       mp->m_sb.sb_rextsize);
		irec->br_blockcount = new_end - irec->br_startoff;
		return true;
	}

	return false;
}
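
/*
 * For example, with sb_rextsize = 8: an unwritten mapping at br_startoff 10
 * with br_blockcount 20 is not rtx-aligned at its start, so it is trimmed
 * to 6 blocks (ending at the rtx boundary at offset 16) and exchanged.  A
 * later pass covering offsets 16-29 starts rtx-aligned and is longer than
 * one rtx, so it is trimmed to the fully unwritten rtx at offsets 16-23 and
 * skipped, leaving the short 6-block tail at offset 24 to be exchanged.
 */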
/*
 * Walk forward through the file ranges in @xmi until we find two different
 * mappings to exchange.  If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and xmi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 */
static int
xfs_exchmaps_find_mappings(
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2,
	struct xfs_exchmaps_adjacent	*adj)
{
	int				nimaps;
	int				bmap_flags;
	int				error;

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_whichfork(xmi));

	for (; xmi_has_more_exchange_work(xmi); xmi_advance(xmi, irec1)) {
		/* Read mapping from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip1, xmi->xmi_startoff1,
				xmi->xmi_blockcount, irec1, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != xmi->xmi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		if (xfs_exchmaps_can_skip_mapping(xmi, irec1)) {
			trace_xfs_exchmaps_mapping1_skip(xmi->xmi_ip1, irec1);
			continue;
		}

		/* Read mapping from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(xmi->xmi_ip2, xmi->xmi_startoff2,
				irec1->br_blockcount, irec2, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != xmi->xmi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc mapping
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * We can only exchange as many blocks as the smaller of the
		 * two mapping lengths.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_exchmaps_mapping1(xmi->xmi_ip1, irec1);
		trace_xfs_exchmaps_mapping2(xmi->xmi_ip2, irec2);

		/* We found something to exchange, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two mappings pointing to the same physical block must not
		 * have different states; that's filesystem corruption.  Move
		 * on to the next mapping if they're both holes or both point
		 * to the same physical space extent.
		 */
		if (irec1->br_state != irec2->br_state) {
			xfs_bmap_mark_sick(xmi->xmi_ip1,
					xfs_exchmaps_whichfork(xmi));
			xfs_bmap_mark_sick(xmi->xmi_ip2,
					xfs_exchmaps_whichfork(xmi));
			return -EFSCORRUPTED;
		}

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}
/* Exchange these two mappings. */
static void
xfs_exchmaps_one_step(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi,
	struct xfs_bmbt_irec		*irec1,
	struct xfs_bmbt_irec		*irec2)
{
	int				whichfork = xfs_exchmaps_whichfork(xmi);

	xfs_exchmaps_update_quota(tp, xmi, irec1, irec2);

	/* Remove both mappings. */
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, xmi->xmi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings.  We exchange the file offsets between the two
	 * maps and add the opposite map, which has the effect of filling the
	 * logical offsets we just unmapped, but with the physical mapping
	 * information exchanged.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, xmi->xmi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, xmi->xmi_ip2, whichfork, irec1);

	/* Make sure we're not adding mappings past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_exchmaps_update_size(tp, xmi->xmi_ip1, irec2,
				xmi->xmi_isize1);
		xfs_exchmaps_update_size(tp, xmi->xmi_ip2, irec1,
				xmi->xmi_isize2);
	}

	/*
	 * Advance our cursor and exit.  The caller (either defer ops or log
	 * recovery) will log the XMD item, and if *blockcount is nonzero, it
	 * will log a new XMI item for the remainder and call us back.
	 */
	xmi_advance(xmi, irec1);
}
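
/*
 * For example, if irec1 maps file1's offset 100 to dblock 500 and irec2
 * maps file2's offset 300 to dblock 900, both ranges are unmapped, the
 * startoffs are swapped, and the re-map step adds (offset 100, dblock 900)
 * to file1 and (offset 300, dblock 500) to file2 -- the same logical ranges
 * now carry each other's physical space.
 */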
/* Convert inode2's leaf attr fork back to shortform, if possible. */
static int
xfs_exchmaps_attr_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_attr_geo,
		.whichfork	= XFS_ATTR_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_buf		*bp;
	int			forkoff;
	int			error;

	if (!xfs_attr_is_leaf(xmi->xmi_ip2))
		return 0;

	error = xfs_attr3_leaf_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, 0,
			&bp);
	if (error)
		return error;

	forkoff = xfs_attr_shortform_allfit(bp, xmi->xmi_ip2);
	if (forkoff == 0)
		return 0;

	return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}
/* Convert inode2's block dir fork back to shortform, if possible. */
static int
xfs_exchmaps_dir_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_da_args	args = {
		.dp		= xmi->xmi_ip2,
		.geo		= tp->t_mountp->m_dir_geo,
		.whichfork	= XFS_DATA_FORK,
		.trans		= tp,
		.owner		= xmi->xmi_ip2->i_ino,
	};
	struct xfs_dir2_sf_hdr	sfh;
	struct xfs_buf		*bp;
	int			size;
	int			error = 0;

	if (xfs_dir2_format(&args, &error) != XFS_DIR2_FMT_BLOCK)
		return error;

	error = xfs_dir3_block_read(tp, xmi->xmi_ip2, xmi->xmi_ip2->i_ino, &bp);
	if (error)
		return error;

	size = xfs_dir2_block_sfsize(xmi->xmi_ip2, bp->b_addr, &sfh);
	if (size > xfs_inode_data_fork_size(xmi->xmi_ip2))
		return 0;

	return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}
/* Convert inode2's remote symlink target back to shortform, if possible. */
static int
xfs_exchmaps_link_to_sf(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_inode	*ip = xmi->xmi_ip2;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	char			*buf;
	int			error;

	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    ip->i_disk_size > xfs_inode_data_fork_size(ip))
		return 0;

	/* Read the current symlink target into a buffer. */
	buf = kmalloc(ip->i_disk_size + 1,
			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);

	error = xfs_symlink_remote_read(ip, buf);
	if (error)
		goto free;

	/* Remove the blocks. */
	error = xfs_symlink_remote_truncate(tp, ip);
	if (error)
		goto free;

	/* Convert fork to local format and log our changes. */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	ifp->if_format = XFS_DINODE_FMT_LOCAL;
	xfs_init_local_fork(ip, XFS_DATA_FORK, buf, ip->i_disk_size);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);

free:
	kfree(buf);
	return error;
}
/* Clear the reflink flag after an exchange. */
static inline void
xfs_exchmaps_clear_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_unset_inode_flag(ip);

	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
/* Finish whatever work might come after an exchange operation. */
static int
xfs_exchmaps_do_postop_work(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	if (xmi->xmi_flags & __XFS_EXCHMAPS_INO2_SHORTFORM) {
		int			error = 0;

		if (xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)
			error = xfs_exchmaps_attr_to_sf(tp, xmi);
		else if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_dir_to_sf(tp, xmi);
		else if (S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
			error = xfs_exchmaps_link_to_sf(tp, xmi);
		xmi->xmi_flags &= ~__XFS_EXCHMAPS_INO2_SHORTFORM;
		if (error)
			return error;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO1_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip1);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
	}

	if (xmi->xmi_flags & XFS_EXCHMAPS_CLEAR_INO2_REFLINK) {
		xfs_exchmaps_clear_reflink(tp, xmi->xmi_ip2);
		xmi->xmi_flags &= ~XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	return 0;
}
/* Finish one step in a mapping exchange operation, possibly relogging. */
int
xfs_exchmaps_finish_one(
	struct xfs_trans		*tp,
	struct xfs_exchmaps_intent	*xmi)
{
	struct xfs_bmbt_irec		irec1, irec2;
	int				error;

	if (xmi_has_more_exchange_work(xmi)) {
		/*
		 * If the operation state says that some range of the files
		 * has not yet been exchanged, look for mappings in that
		 * range to exchange.  If we find some mappings, exchange
		 * them.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, NULL);
		if (error)
			return error;

		if (xmi_has_more_exchange_work(xmi))
			xfs_exchmaps_one_step(tp, xmi, &irec1, &irec2);

		/*
		 * If the caller asked us to exchange the file sizes after the
		 * exchange and either we just exchanged the last mappings in
		 * the range or we didn't find anything to exchange, update
		 * the ondisk file sizes.
		 */
		if ((xmi->xmi_flags & XFS_EXCHMAPS_SET_SIZES) &&
		    !xmi_has_more_exchange_work(xmi)) {
			xmi->xmi_ip1->i_disk_size = xmi->xmi_isize1;
			xmi->xmi_ip2->i_disk_size = xmi->xmi_isize2;

			xfs_trans_log_inode(tp, xmi->xmi_ip1, XFS_ILOG_CORE);
			xfs_trans_log_inode(tp, xmi->xmi_ip2, XFS_ILOG_CORE);
		}
	} else if (xmi_has_postop_work(xmi)) {
		/*
		 * Now that we're finished with the exchange operation,
		 * complete the post-op cleanup work.
		 */
		error = xfs_exchmaps_do_postop_work(tp, xmi);
		if (error)
			return error;
	}

	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
		return -EIO;

	/* If we still have work to do, ask for a new transaction. */
	if (xmi_has_more_exchange_work(xmi) || xmi_has_postop_work(xmi)) {
		trace_xfs_exchmaps_defer(tp->t_mountp, xmi);
		return -EAGAIN;
	}

	/*
	 * If we reach here, we've finished all the exchange work and the post
	 * operation work.  The last thing we need to do before returning to
	 * the caller is to make sure that COW forks are set up correctly.
	 */
	if (!(xmi->xmi_flags & XFS_EXCHMAPS_ATTR_FORK)) {
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip1);
		xfs_exchmaps_ensure_cowfork(xmi->xmi_ip2);
	}

	return 0;
}
/*
 * Compute the number of bmbt blocks we should reserve for each file.  In the
 * worst case, each exchange will fill a hole with a new mapping, which could
 * result in a btree split every time we add a new leaf block.
 */
static inline uint64_t
xfs_exchmaps_bmbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp)) *
		XFS_EXTENTADD_SPACE_RES(mp, xfs_exchmaps_reqfork(req));
}
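
/*
 * For example, if a single bmbt block could hold 125 contiguous records
 * (the real figure comes from XFS_MAX_CONTIG_BMAPS_PER_BLOCK) and the
 * estimate calls for 300 exchanges, we reserve howmany_64(300, 125) = 3
 * times XFS_EXTENTADD_SPACE_RES worth of blocks per file, enough to absorb
 * a split for every leaf block we might fill.
 */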
/* Compute the space we should reserve for the rmap btree expansions. */
static inline uint64_t
xfs_exchmaps_rmapbt_blocks(
	struct xfs_mount		*mp,
	const struct xfs_exchmaps_req	*req)
{
	if (!xfs_has_rmapbt(mp))
		return 0;
	if (XFS_IS_REALTIME_INODE(req->ip1))
		return 0;

	return howmany_64(req->nr_exchanges,
			  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *
		XFS_RMAPADD_SPACE_RES(mp);
}
/* Estimate the bmbt and rmapbt overhead required to exchange mappings. */
static int
xfs_exchmaps_estimate_overhead(
	struct xfs_exchmaps_req		*req)
{
	struct xfs_mount	*mp = req->ip1->i_mount;
	xfs_filblks_t		bmbt_blocks;
	xfs_filblks_t		rmapbt_blocks;
	xfs_filblks_t		resblks = req->resblks;

	/*
	 * Compute the number of bmbt and rmapbt blocks we might need to
	 * handle the estimated number of exchanges.
	 */
	bmbt_blocks = xfs_exchmaps_bmbt_blocks(mp, req);
	rmapbt_blocks = xfs_exchmaps_rmapbt_blocks(mp, req);

	trace_xfs_exchmaps_overhead(mp, bmbt_blocks, rmapbt_blocks);

	/* Make sure the change in file block count doesn't overflow. */
	if (check_add_overflow(req->ip1_bcount, bmbt_blocks, &req->ip1_bcount))
		return -EFBIG;
	if (check_add_overflow(req->ip2_bcount, bmbt_blocks, &req->ip2_bcount))
		return -EFBIG;

	/*
	 * Add together the number of blocks we need to handle btree growth,
	 * then add it to the number of blocks we need to reserve to this
	 * transaction.
	 */
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, bmbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;
	if (check_add_overflow(resblks, rmapbt_blocks, &resblks))
		return -ENOSPC;

	/* Can't actually reserve more than UINT_MAX blocks. */
	if (resblks > UINT_MAX)
		return -ENOSPC;

	req->resblks = resblks;
	trace_xfs_exchmaps_final_estimate(req);
	return 0;
}
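
/*
 * Note that the bmbt and rmapbt overheads are each added to resblks twice
 * above because there are two files participating in the exchange, and each
 * of them might grow its btrees by that amount.
 */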
/* Decide if we can merge two real mappings. */
static inline bool
xmi_can_merge(
	const struct xfs_bmbt_irec	*b1,
	const struct xfs_bmbt_irec	*b2)
{
	/* Don't merge holes. */
	if (b1->br_startblock == HOLESTARTBLOCK ||
	    b2->br_startblock == HOLESTARTBLOCK)
		return false;

	/* Don't merge delalloc reservations either. */
	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
		return false;

	if (b1->br_startoff   + b1->br_blockcount == b2->br_startoff &&
	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
	    b1->br_state			  == b2->br_state &&
	    b1->br_blockcount + b2->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		return true;

	return false;
}
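
/*
 * For example, a mapping of (startoff 0, startblock 40, 16 blocks) can
 * merge with (startoff 16, startblock 56, 4 blocks) in the same state: both
 * the logical and physical ranges butt up against each other, and the
 * combined length stays within XFS_MAX_BMBT_EXTLEN.
 */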
/*
 * Decide if we can merge three mappings.  The caller must ensure that none
 * of the three mappings is a hole or a delalloc reservation.
 */
static inline bool
xmi_can_merge_all(
	const struct xfs_bmbt_irec	*l,
	const struct xfs_bmbt_irec	*m,
	const struct xfs_bmbt_irec	*r)
{
	xfs_filblks_t	new_len;

	new_len = l->br_blockcount + m->br_blockcount + r->br_blockcount;
	return new_len <= XFS_MAX_BMBT_EXTLEN;
}
#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)
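
/*
 * The C* bits describe how the current (old) mapping relates to its
 * neighbors, and the N* bits describe the incoming (new) mapping; e.g.
 * CLEFT_CONTIG | CRIGHT_CONTIG means the old mapping is contiguous with
 * both neighbors, so removing it splits one bmbt record into two.
 */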
/* Estimate the effect of a single exchange on mapping count. */
static inline int
xmi_delta_nextents_step(
	struct xfs_mount		*mp,
	const struct xfs_bmbt_irec	*left,
	const struct xfs_bmbt_irec	*curr,
	const struct xfs_bmbt_irec	*new,
	const struct xfs_bmbt_irec	*right)
{
	bool				lhole, rhole, chole, nhole;
	unsigned int			state = 0;
	int				ret = 0;

	lhole = left->br_startblock == HOLESTARTBLOCK;
	rhole = right->br_startblock == HOLESTARTBLOCK;
	chole = curr->br_startblock == HOLESTARTBLOCK;
	nhole = new->br_startblock == HOLESTARTBLOCK;
	if (chole)
		state |= CHOLE;

	if (!lhole && !chole && xmi_can_merge(left, curr))
		state |= CLEFT_CONTIG;
	if (!rhole && !chole && xmi_can_merge(curr, right))
		state |= CRIGHT_CONTIG;
	if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
	    !xmi_can_merge_all(left, curr, right))
		state &= ~CRIGHT_CONTIG;

	if (nhole)
		state |= NHOLE;

	if (!lhole && !nhole && xmi_can_merge(left, new))
		state |= NLEFT_CONTIG;
	if (!rhole && !nhole && xmi_can_merge(new, right))
		state |= NRIGHT_CONTIG;
	if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
	    !xmi_can_merge_all(left, new, right))
		state &= ~NRIGHT_CONTIG;

	switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
	case CLEFT_CONTIG | CRIGHT_CONTIG:
		/*
		 * left/curr/right are the same mapping, so deleting curr
		 * causes 2 new mappings to be created.
		 */
		ret += 2;
		break;
	case 0:
		/*
		 * curr is not contiguous with any mapping, so we remove curr
		 * completely
		 */
		ret--;
		break;
	case CHOLE:
		/* hole, do nothing */
		break;
	case CLEFT_CONTIG:
	case CRIGHT_CONTIG:
		/* trim either left or right, no change */
		break;
	}

	switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
	case NLEFT_CONTIG | NRIGHT_CONTIG:
		/*
		 * left/curr/right will become the same mapping, so adding
		 * curr causes the deletion of right.
		 */
		ret--;
		break;
	case 0:
		/* new is not contiguous with any mapping */
		ret++;
		break;
	case NHOLE:
		/* hole, do nothing. */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_exchmaps_delta_nextents_step(mp, left, curr, new, right, ret,
			state);
	return ret;
}
/* Make sure we don't overflow the extent (mapping) counters. */
static inline int
xmi_ensure_delta_nextents(
	struct xfs_exchmaps_req	*req,
	struct xfs_inode	*ip,
	int64_t			delta)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			whichfork = xfs_exchmaps_reqfork(req);
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	uint64_t		new_nextents;
	xfs_extnum_t		max_nextents;

	if (delta < 0)
		return 0;

	/*
	 * It's always an error if the delta causes integer overflow.  delta
	 * needs an explicit cast here to avoid warnings about implicit casts
	 * coded into the overflow check.
	 */
	if (check_add_overflow(ifp->if_nextents, (uint64_t)delta,
			       &new_nextents))
		return -EFBIG;

	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
	    new_nextents > 10)
		return -EFBIG;

	/*
	 * We always promote both inodes to have large extent counts if the
	 * superblock feature is enabled, so we only need to check against the
	 * theoretical maximum.
	 */
	max_nextents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
					     whichfork);
	if (new_nextents > max_nextents)
		return -EFBIG;

	return 0;
}
/* Find the next mapping after irec. */
static inline int
xmi_next(
	struct xfs_inode		*ip,
	int				bmap_flags,
	const struct xfs_bmbt_irec	*irec,
	struct xfs_bmbt_irec		*nrec)
{
	xfs_fileoff_t			off;
	xfs_filblks_t			blockcount;
	int				nimaps = 1;
	int				error;

	off = irec->br_startoff + irec->br_blockcount;
	blockcount = XFS_MAX_FILEOFF - off;
	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
	if (error)
		return error;
	if (nrec->br_startblock == DELAYSTARTBLOCK ||
	    nrec->br_startoff != off) {
		/*
		 * If we don't get the mapping we want, return a zero-length
		 * mapping, which our estimator function will pretend is a
		 * hole.  We shouldn't get delalloc reservations.
		 */
		nrec->br_startblock = HOLESTARTBLOCK;
		nrec->br_blockcount = 0;
	}

	return 0;
}
int __init
xfs_exchmaps_intent_init_cache(void)
{
	xfs_exchmaps_intent_cache = kmem_cache_create("xfs_exchmaps_intent",
			sizeof(struct xfs_exchmaps_intent),
			0, 0, NULL);

	return xfs_exchmaps_intent_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_exchmaps_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_exchmaps_intent_cache);
	xfs_exchmaps_intent_cache = NULL;
}
/*
 * Decide if we will exchange the reflink flags between the two files after
 * the exchange.  The only time we want to do this is if we're exchanging all
 * mappings under EOF and the inode reflink flags have different states.
 */
static inline bool
xmi_can_exchange_reflink_flags(
	const struct xfs_exchmaps_req	*req,
	unsigned int			reflink_state)
{
	struct xfs_mount	*mp = req->ip1->i_mount;

	if (hweight32(reflink_state) != 1)
		return false;
	if (req->startoff1 != 0 || req->startoff2 != 0)
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip1->i_disk_size))
		return false;
	if (req->blockcount != XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
		return false;

	return true;
}
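
/*
 * In other words, if exactly one of the two files is marked reflink and the
 * request covers both files from offset 0 through EOF, clearing the flag on
 * the donor (while xfs_exchmaps_ensure_reflink sets it on the other file)
 * effectively moves the reflink state along with the shared mappings.
 */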
/* Allocate and initialize a new incore intent item from a request. */
struct xfs_exchmaps_intent *
xfs_exchmaps_init_intent(
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;
	unsigned int			rs = 0;

	xmi = kmem_cache_zalloc(xfs_exchmaps_intent_cache,
			GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&xmi->xmi_list);
	xmi->xmi_ip1 = req->ip1;
	xmi->xmi_ip2 = req->ip2;
	xmi->xmi_startoff1 = req->startoff1;
	xmi->xmi_startoff2 = req->startoff2;
	xmi->xmi_blockcount = req->blockcount;
	xmi->xmi_isize1 = xmi->xmi_isize2 = -1;
	xmi->xmi_flags = req->flags & XFS_EXCHMAPS_PARAMS;

	if (xfs_exchmaps_whichfork(xmi) == XFS_ATTR_FORK) {
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;
		return xmi;
	}

	if (req->flags & XFS_EXCHMAPS_SET_SIZES) {
		xmi->xmi_flags |= XFS_EXCHMAPS_SET_SIZES;
		xmi->xmi_isize1 = req->ip2->i_disk_size;
		xmi->xmi_isize2 = req->ip1->i_disk_size;
	}

	/* Record the state of each inode's reflink flag before the op. */
	if (xfs_is_reflink_inode(req->ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(req->ip2))
		rs |= 2;

	/*
	 * Figure out if we're clearing the reflink flags (which effectively
	 * exchanges them) after the operation.
	 */
	if (xmi_can_exchange_reflink_flags(req, rs)) {
		if (rs & 1)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO1_REFLINK;
		if (rs & 2)
			xmi->xmi_flags |= XFS_EXCHMAPS_CLEAR_INO2_REFLINK;
	}

	if (S_ISDIR(VFS_I(xmi->xmi_ip2)->i_mode) ||
	    S_ISLNK(VFS_I(xmi->xmi_ip2)->i_mode))
		xmi->xmi_flags |= __XFS_EXCHMAPS_INO2_SHORTFORM;

	return xmi;
}
/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_exchmaps_estimate(
	struct xfs_exchmaps_req		*req)
{
	struct xfs_exchmaps_intent	*xmi;
	struct xfs_bmbt_irec		irec1, irec2;
	struct xfs_exchmaps_adjacent	adj = ADJACENT_INIT;
	xfs_filblks_t			ip1_blocks = 0, ip2_blocks = 0;
	int64_t				d_nexts1, d_nexts2;
	int				bmap_flags;
	int				error;

	ASSERT(!(req->flags & ~XFS_EXCHMAPS_PARAMS));

	bmap_flags = xfs_bmapi_aflag(xfs_exchmaps_reqfork(req));
	xmi = xfs_exchmaps_init_intent(req);

	/*
	 * To guard against the possibility of overflowing the extent counters,
	 * we have to estimate an upper bound on the potential increase in that
	 * counter.  We can split the mapping at each end of the range, and for
	 * each step of the exchange we can split the mapping that we're
	 * working on if the mappings do not align.
	 */
	d_nexts1 = d_nexts2 = 3;

	while (xmi_has_more_exchange_work(xmi)) {
		/*
		 * Walk through the file ranges until we find something to
		 * exchange.  Because we're simulating the exchange, pass in
		 * adj to capture skipped mappings for correct estimation of
		 * bmbt record merges.
		 */
		error = xfs_exchmaps_find_mappings(xmi, &irec1, &irec2, &adj);
		if (error)
			goto out_free;
		if (!xmi_has_more_exchange_work(xmi))
			break;

		/* Update accounting. */
		if (xfs_bmap_is_real_extent(&irec1))
			ip1_blocks += irec1.br_blockcount;
		if (xfs_bmap_is_real_extent(&irec2))
			ip2_blocks += irec2.br_blockcount;
		req->nr_exchanges++;

		/* Read the next mappings from both files. */
		error = xmi_next(req->ip1, bmap_flags, &irec1, &adj.right1);
		if (error)
			goto out_free;

		error = xmi_next(req->ip2, bmap_flags, &irec2, &adj.right2);
		if (error)
			goto out_free;

		/* Update extent count deltas. */
		d_nexts1 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left1, &irec1, &irec2, &adj.right1);

		d_nexts2 += xmi_delta_nextents_step(req->ip1->i_mount,
				&adj.left2, &irec2, &irec1, &adj.right2);

		/* Now pretend we exchanged the mappings. */
		if (xmi_can_merge(&adj.left2, &irec1))
			adj.left2.br_blockcount += irec1.br_blockcount;
		else
			memcpy(&adj.left2, &irec1, sizeof(irec1));

		if (xmi_can_merge(&adj.left1, &irec2))
			adj.left1.br_blockcount += irec2.br_blockcount;
		else
			memcpy(&adj.left1, &irec2, sizeof(irec2));

		xmi_advance(xmi, &irec1);
	}

	/* Account for the blocks that are being exchanged. */
	if (XFS_IS_REALTIME_INODE(req->ip1) &&
	    xfs_exchmaps_reqfork(req) == XFS_DATA_FORK) {
		req->ip1_rtbcount = ip1_blocks;
		req->ip2_rtbcount = ip2_blocks;
	} else {
		req->ip1_bcount = ip1_blocks;
		req->ip2_bcount = ip2_blocks;
	}

	/*
	 * Make sure that both forks have enough slack left in their extent
	 * counters that the exchange operation will not overflow.
	 */
	trace_xfs_exchmaps_delta_nextents(req, d_nexts1, d_nexts2);
	if (req->ip1 == req->ip2) {
		error = xmi_ensure_delta_nextents(req, req->ip1,
				d_nexts1 + d_nexts2);
	} else {
		error = xmi_ensure_delta_nextents(req, req->ip1, d_nexts1);
		if (error)
			goto out_free;
		error = xmi_ensure_delta_nextents(req, req->ip2, d_nexts2);
	}
	if (error)
		goto out_free;

	trace_xfs_exchmaps_initial_estimate(req);
	error = xfs_exchmaps_estimate_overhead(req);
out_free:
	kmem_cache_free(xfs_exchmaps_intent_cache, xmi);
	return error;
}
/* Set the reflink flag before an operation. */
static inline void
xfs_exchmaps_set_reflink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	trace_xfs_reflink_set_inode_flag(ip);

	ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
/*
 * If either file has shared blocks and we're exchanging data forks, we must
 * flag the other file as having shared blocks so that we get the shared-block
 * rmap functions if we need to fix up the rmaps.
 */
static void
xfs_exchmaps_ensure_reflink(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	unsigned int				rs = 0;

	if (xfs_is_reflink_inode(xmi->xmi_ip1))
		rs |= 1;
	if (xfs_is_reflink_inode(xmi->xmi_ip2))
		rs |= 2;

	if ((rs & 1) && !xfs_is_reflink_inode(xmi->xmi_ip2))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip2);

	if ((rs & 2) && !xfs_is_reflink_inode(xmi->xmi_ip1))
		xfs_exchmaps_set_reflink(tp, xmi->xmi_ip1);
}
/* Set the large extent count flag before an operation if needed. */
static inline void
xfs_exchmaps_ensure_large_extent_counts(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	if (xfs_inode_has_large_extent_counts(ip))
		return;

	ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
/* Widen the extent counter fields of both inodes if necessary. */
static void
xfs_exchmaps_upgrade_extent_counts(
	struct xfs_trans			*tp,
	const struct xfs_exchmaps_intent	*xmi)
{
	if (!xfs_has_large_extent_counts(tp->t_mountp))
		return;

	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip1);
	xfs_exchmaps_ensure_large_extent_counts(tp, xmi->xmi_ip2);
}
/*
 * Schedule an exchange of a range of mappings from one inode to another.
 *
 * The use of file mapping exchange log intent items ensures the operation can
 * be resumed even if the system goes down.  The caller must commit the
 * transaction to start the work.
 *
 * The caller must ensure the inodes are joined to the transaction and ILOCKd;
 * they will still be joined to the transaction at exit.
 */
void
xfs_exchange_mappings(
	struct xfs_trans		*tp,
	const struct xfs_exchmaps_req	*req)
{
	struct xfs_exchmaps_intent	*xmi;

	BUILD_BUG_ON(XFS_EXCHMAPS_INTERNAL_FLAGS & XFS_EXCHMAPS_LOGGED_FLAGS);

	xfs_assert_ilocked(req->ip1, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(req->ip2, XFS_ILOCK_EXCL);
	ASSERT(!(req->flags & ~XFS_EXCHMAPS_LOGGED_FLAGS));
	if (req->flags & XFS_EXCHMAPS_SET_SIZES)
		ASSERT(!(req->flags & XFS_EXCHMAPS_ATTR_FORK));
	ASSERT(xfs_has_exchange_range(tp->t_mountp));

	if (req->blockcount == 0)
		return;

	xmi = xfs_exchmaps_init_intent(req);
	xfs_exchmaps_defer_add(tp, xmi);
	xfs_exchmaps_ensure_reflink(tp, xmi);
	xfs_exchmaps_upgrade_extent_counts(tp, xmi);
}
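
/*
 * A sketch of the expected caller pattern (not part of this file): estimate
 * the work and space reservation first, then allocate a transaction with
 * req->resblks, ILOCK and join both inodes, schedule the exchange, and
 * commit.  Deferred-ops processing relogs the intent item until
 * xfs_exchmaps_finish_one stops returning -EAGAIN.
 *
 *	error = xfs_exchmaps_estimate(&req);
 *	...allocate tp with req.resblks, then ilock and join req.ip1/req.ip2...
 *	xfs_exchange_mappings(tp, &req);
 *	error = xfs_trans_commit(tp);
 */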