// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2012 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
/* Kernel only BMAP related definitions and functions */
/*
 * Convert the given file system block to a disk block. We have to treat it
 * differently based on whether the file is a real time file or not, because
 * the bmap code does.
 */
xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
	if (XFS_IS_REALTIME_INODE(ip))
		return xfs_rtb_to_daddr(ip->i_mount, fsb);
	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
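
/*
 * Example: callers typically pair the returned daddr with a basic-block
 * length when talking to the block layer. A minimal sketch, assuming @irec
 * holds a real (non-hole, non-delalloc) mapping; the variable names are
 * illustrative only:
 *
 *	xfs_daddr_t	daddr = xfs_fsb_to_db(ip, irec.br_startblock);
 *	sector_t	nsect = XFS_FSB_TO_BB(ip->i_mount, irec.br_blockcount);
 *
 * Note that for a realtime inode the address is relative to the realtime
 * device, not the data device.
 */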
/*
 * Routine to zero an extent on disk allocated to the specific inode.
 */
int
xfs_zero_extent(
	struct xfs_inode	*ip,
	xfs_fsblock_t		start_fsb,
	xfs_off_t		count_fsb)
{
	return blkdev_issue_zeroout(xfs_inode_buftarg(ip)->bt_bdev,
			xfs_fsb_to_db(ip, start_fsb),
			XFS_FSB_TO_BB(ip->i_mount, count_fsb),
			GFP_KERNEL, 0);
}
/*
 * Extent tree block counting routines.
 */

/*
 * Count leaf blocks given a range of extent records. Delayed allocation
 * extents are not counted towards the totals.
 */
xfs_extnum_t
xfs_bmap_count_leaves(
	struct xfs_ifork	*ifp,
	xfs_filblks_t		*count)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		numrecs = 0;

	for_each_xfs_iext(ifp, &icur, &got) {
		if (!isnullstartblock(got.br_startblock)) {
			*count += got.br_blockcount;
			numrecs++;
		}
	}

	return numrecs;
}
/*
 * Count fsblocks of the given fork. Delayed allocation extents are
 * not counted towards the totals.
 */
int
xfs_bmap_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_extnum_t		*nextents,
	xfs_filblks_t		*count)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_btree_cur	*cur;
	xfs_extlen_t		btblocks = 0;
	int			error;

	*nextents = 0;
	*count = 0;

	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iread_extents(tp, ip, whichfork);
		if (error)
			return error;

		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		error = xfs_btree_count_blocks(cur, &btblocks);
		xfs_btree_del_cursor(cur, error);
		if (error)
			return error;

		/*
		 * xfs_btree_count_blocks includes the root block contained in
		 * the inode fork in @btblocks, so subtract one because we're
		 * only interested in allocated disk blocks.
		 */
		*count += btblocks - 1;
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		*nextents = xfs_bmap_count_leaves(ifp, count);
		break;
	}

	return 0;
}
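
/*
 * Example: fetching the allocated block and extent counts of the data fork.
 * A minimal sketch, assuming the caller already holds the ILOCK and handles
 * errors; @tp may be NULL if no transaction is running:
 *
 *	xfs_extnum_t	nextents;
 *	xfs_filblks_t	count;
 *	int		error;
 *
 *	error = xfs_bmap_count_blocks(tp, ip, XFS_DATA_FORK, &nextents,
 *			&count);
 */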
static int
xfs_getbmap_report_one(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	struct xfs_bmbt_irec	*got)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;
	bool			shared = false;
	int			error;

	error = xfs_reflink_trim_around_shared(ip, got, &shared);
	if (error)
		return error;

	if (isnullstartblock(got->br_startblock) ||
	    got->br_startblock == DELAYSTARTBLOCK) {
		/*
		 * Take the flush completion as being a point-in-time snapshot
		 * where there are no delalloc extents, and if any new ones
		 * have been created racily, just skip them as being 'after'
		 * the flush and so don't get reported.
		 */
		if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
			return 0;

		p->bmv_oflags |= BMV_OF_DELALLOC;
		p->bmv_block = -2;
	} else {
		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
	}

	if (got->br_state == XFS_EXT_UNWRITTEN &&
	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
		p->bmv_oflags |= BMV_OF_PREALLOC;

	if (shared)
		p->bmv_oflags |= BMV_OF_SHARED;

	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
	return 0;
}
static void
xfs_getbmap_report_hole(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	xfs_fileoff_t		bno,
	xfs_fileoff_t		end)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;

	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
		return;

	p->bmv_block = -1;
	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
}
static inline bool
xfs_getbmap_full(
	struct getbmapx		*bmv)
{
	return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
}
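
/*
 * Note on the "- 1" above: by getbmap convention bmv_count includes the slot
 * used for the header, so a caller passing bmv_count = 129 receives at most
 * 128 extent records.
 */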
static bool
xfs_getbmap_next_rec(
	struct xfs_bmbt_irec	*rec,
	xfs_fileoff_t		total_end)
{
	xfs_fileoff_t		end = rec->br_startoff + rec->br_blockcount;

	if (end == total_end)
		return false;

	rec->br_startoff += rec->br_blockcount;
	if (!isnullstartblock(rec->br_startblock) &&
	    rec->br_startblock != DELAYSTARTBLOCK)
		rec->br_startblock += rec->br_blockcount;
	rec->br_blockcount = total_end - end;
	return true;
}
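
/*
 * Worked example: if a single bmbt record covers blocks [0, 100) and only
 * [0, 40) of it is shared, xfs_reflink_trim_around_shared() trims the record
 * reported by xfs_getbmap_report_one() to [0, 40), and
 * xfs_getbmap_next_rec() then advances the same record to [40, 100) for the
 * next pass, so userspace sees two getbmapx entries that differ only in
 * BMV_OF_SHARED.
 */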
/*
 * Get inode's extents as described in bmv, and format for output.
 * Calls formatter to fill the user's buffer until all extents
 * are mapped, until the passed-in bmv->bmv_count slots have
 * been filled, or until the formatter short-circuits the loop,
 * if it is tracking filled-in extents on its own.
 */
int
xfs_getbmap(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,	/* user bmap structure */
	struct kgetbmap		*out)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			iflags = bmv->bmv_iflags;
	int			whichfork, lock, error = 0;
	int64_t			bmv_end, max_len;
	xfs_fileoff_t		bno, first_bno;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_irec	got, rec;
	xfs_filblks_t		len;
	struct xfs_iext_cursor	icur;

	if (bmv->bmv_iflags & ~BMV_IF_VALID)
		return -EINVAL;
#ifndef DEBUG
	/* Only allow CoW fork queries if we're debugging. */
	if (iflags & BMV_IF_COWFORK)
		return -EINVAL;
#endif
	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
		return -EINVAL;

	if (bmv->bmv_length < -1)
		return -EINVAL;
	bmv->bmv_entries = 0;
	if (bmv->bmv_length == 0)
		return 0;

	if (iflags & BMV_IF_ATTRFORK)
		whichfork = XFS_ATTR_FORK;
	else if (iflags & BMV_IF_COWFORK)
		whichfork = XFS_COW_FORK;
	else
		whichfork = XFS_DATA_FORK;

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	switch (whichfork) {
	case XFS_ATTR_FORK:
		lock = xfs_ilock_attr_map_shared(ip);
		if (!xfs_inode_has_attr_fork(ip))
			goto out_unlock_ilock;

		max_len = 1LL << 32;
		break;
	case XFS_COW_FORK:
		lock = XFS_ILOCK_SHARED;
		xfs_ilock(ip, lock);

		/* No CoW fork? Just return */
		if (!xfs_ifork_ptr(ip, whichfork))
			goto out_unlock_ilock;

		if (xfs_get_cowextsz_hint(ip))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);
		break;
	case XFS_DATA_FORK:
		if (!(iflags & BMV_IF_DELALLOC) &&
		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (error)
				goto out_unlock_iolock;

			/*
			 * Even after flushing the inode, there can still be
			 * delalloc blocks on the inode beyond EOF due to
			 * speculative preallocation. These are not removed
			 * until the release function is called or the inode
			 * is inactivated. Hence we cannot assert here that
			 * ip->i_delayed_blks == 0.
			 */
		}

		if (xfs_get_extsz_hint(ip) ||
		    (ip->i_diflags & XFS_DIFLAG_PREALLOC))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);

		lock = xfs_ilock_data_map_shared(ip);
		break;
	}

	ifp = xfs_ifork_ptr(ip, whichfork);

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		break;
	case XFS_DINODE_FMT_LOCAL:
		/* Local format inode forks report no extents. */
		goto out_unlock_ilock;
	default:
		error = -EINVAL;
		goto out_unlock_ilock;
	}

	if (bmv->bmv_length == -1) {
		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
	}

	bmv_end = bmv->bmv_offset + bmv->bmv_length;

	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);

	error = xfs_iread_extents(NULL, ip, whichfork);
	if (error)
		goto out_unlock_ilock;

	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
		/*
		 * Report a whole-file hole if the delalloc flag is set to
		 * stay compatible with the old implementation.
		 */
		if (iflags & BMV_IF_DELALLOC)
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
		goto out_unlock_ilock;
	}

	while (!xfs_getbmap_full(bmv)) {
		xfs_trim_extent(&got, first_bno, len);

		/*
		 * Report an entry for a hole if this extent doesn't directly
		 * follow the previous one.
		 */
		if (got.br_startoff > bno) {
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					got.br_startoff);
			if (xfs_getbmap_full(bmv))
				break;
		}

		/*
		 * In order to report shared extents accurately, we report each
		 * distinct shared / unshared part of a single bmbt record with
		 * an individual getbmapx record.
		 */
		bno = got.br_startoff + got.br_blockcount;
		rec = got;
		do {
			error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
					&rec);
			if (error || xfs_getbmap_full(bmv))
				goto out_unlock_ilock;
		} while (xfs_getbmap_next_rec(&rec, bno));

		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

			if (bmv->bmv_entries > 0)
				out[bmv->bmv_entries - 1].bmv_oflags |=
						BMV_OF_LAST;

			if (whichfork != XFS_ATTR_FORK && bno < end &&
			    !xfs_getbmap_full(bmv)) {
				xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
						bno, end);
			}
			break;
		}

		if (bno >= first_bno + len)
			break;
	}

out_unlock_ilock:
	xfs_iunlock(ip, lock);
out_unlock_iolock:
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}
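
/*
 * Example of the unit convention at this boundary: bmv_offset and bmv_length
 * are measured in 512-byte basic blocks, not bytes or filesystem blocks. A
 * hedged sketch of how a userspace caller of the XFS_IOC_GETBMAPX ioctl
 * would phrase a query over the first megabyte of a file (nslots and fd are
 * illustrative):
 *
 *	struct getbmapx	bmv = { 0 };
 *
 *	bmv.bmv_offset = 0;			// basic blocks
 *	bmv.bmv_length = (1024 * 1024) / 512;	// 1MiB == 2048 BBs
 *	bmv.bmv_count = nslots;			// includes the header slot
 *	ioctl(fd, XFS_IOC_GETBMAPX, &bmv);
 */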
/*
 * Dead simple method of punching delayed allocation blocks from a range in
 * the inode. This will always punch out both the start and end blocks, even
 * if the ranges only partially overlap them, so it is up to the caller to
 * ensure that partial blocks are not passed in.
 */
void
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_off_t		start_byte,
	xfs_off_t		end_byte)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, start_byte);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end_byte);
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;

	ASSERT(!xfs_need_iread_extents(ifp));

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		goto out_unlock;

	while (got.br_startoff + got.br_blockcount > start_fsb) {
		del = got;
		xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);

		/*
		 * A delete can push the cursor forward. Step back to the
		 * previous extent on non-delalloc or extents outside the
		 * target range.
		 */
		if (!del.br_blockcount ||
		    !isnullstartblock(del.br_startblock)) {
			if (!xfs_iext_prev_extent(ifp, &icur, &got))
				break;
			continue;
		}

		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	if (whichfork == XFS_COW_FORK && !ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
/*
 * Test whether it is appropriate to check an inode for and free post EOF
 * blocks.
 */
bool
xfs_can_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			found_blocks = false;
	xfs_fileoff_t		end_fsb;
	xfs_fileoff_t		last_fsb;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;

	/*
	 * Caller must either hold the exclusive io lock; or be inactivating
	 * the inode, which guarantees there are no other users of the inode.
	 */
	if (!(VFS_I(ip)->i_state & I_FREEING))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);

	/* prealloc/delalloc exists only on regular files */
	if (!S_ISREG(VFS_I(ip)->i_mode))
		return false;

	/*
	 * Zero sized files with no cached pages and delalloc blocks will not
	 * have speculative prealloc/delalloc blocks to remove.
	 */
	if (VFS_I(ip)->i_size == 0 &&
	    VFS_I(ip)->i_mapping->nrpages == 0 &&
	    ip->i_delayed_blks == 0)
		return false;

	/* If we haven't read in the extent list, then don't do it now. */
	if (xfs_need_iread_extents(&ip->i_df))
		return false;

	/*
	 * Do not free real extents in preallocated files unless the file has
	 * delalloc blocks and we are forced to remove them.
	 */
	if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks)
		return false;

	/*
	 * Do not try to free post-EOF blocks if EOF is beyond the end of the
	 * range supported by the page cache, because the truncation will loop
	 * forever.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	if (xfs_inode_has_bigrtalloc(ip))
		end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb);
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return false;

	/*
	 * Check if there is a post-EOF extent to free. If there are any
	 * delalloc blocks attached to the inode (data fork delalloc
	 * reservations or CoW extents of any kind), we need to free them so
	 * that inactivation doesn't fail to erase them.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (ip->i_delayed_blks ||
	    xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
		found_blocks = true;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return found_blocks;
}
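
/*
 * Example: the check/free pairing as used on the release and inactivation
 * paths (and by xfs_prepare_shift() below). A minimal sketch, assuming the
 * locking rules described above are satisfied:
 *
 *	if (xfs_can_free_eofblocks(ip))
 *		error = xfs_free_eofblocks(ip);
 */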
/*
 * This is called to free any blocks beyond eof. The caller must hold
 * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
 * reference to the inode.
 */
int
xfs_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/* Attach the dquots to the inode up front. */
	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	/* Wait on dio to ensure i_size has settled. */
	inode_dio_wait(VFS_I(ip));

	/*
	 * For preallocated files only free delayed allocations.
	 *
	 * Note that this means we also leave speculative preallocations in
	 * place for preallocated files.
	 */
	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
		if (ip->i_delayed_blks) {
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
				LLONG_MAX);
		}
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Do not update the on-disk file size. If we update the on-disk file
	 * size and then the system crashes before the contents of the file are
	 * flushed to disk then the files may be full of holes (ie NULL files
	 * bug).
	 */
	error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
				XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
	if (error)
		goto err_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_unlock;

	xfs_inode_clear_eofblocks_tag(ip);
	goto out_unlock;

err_cancel:
	/*
	 * If we get an error at this point we simply don't
	 * bother truncating the file.
	 */
	xfs_trans_cancel(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
int
xfs_alloc_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	xfs_mount_t		*mp = ip->i_mount;
	xfs_off_t		count;
	xfs_filblks_t		allocatesize_fsb;
	xfs_extlen_t		extsz, temp;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			rt;
	xfs_trans_t		*tp;
	xfs_bmbt_irec_t		imaps[1], *imapp;
	int			error;

	if (xfs_is_always_cow_inode(ip))
		return 0;

	trace_xfs_alloc_file_space(ip);

	if (xfs_is_shutdown(mp))
		return -EIO;

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)
		return -EINVAL;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	count = len;
	imapp = &imaps[0];
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
	allocatesize_fsb = endoffset_fsb - startoffset_fsb;

	/*
	 * Allocate file space until done or until there is an error
	 */
	while (allocatesize_fsb && !error) {
		xfs_fileoff_t	s, e;
		unsigned int	dblocks, rblocks, resblks;
		int		nimaps = 1;

		/*
		 * Determine space reservations for data/realtime.
		 */
		if (unlikely(extsz)) {
			s = startoffset_fsb;
			do_div(s, extsz);
			s *= extsz;
			e = startoffset_fsb + allocatesize_fsb;
			div_u64_rem(startoffset_fsb, extsz, &temp);
			if (temp)
				e += temp;
			div_u64_rem(e, extsz, &temp);
			if (temp)
				e += extsz - temp;
		} else {
			s = 0;
			e = allocatesize_fsb;
		}

		/*
		 * The transaction reservation is limited to a 32-bit block
		 * count, hence we need to limit the number of blocks we are
		 * trying to reserve to avoid an overflow. We can't allocate
		 * more than @nimaps extents, and an extent is limited on disk
		 * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
		 * limit.
		 */
		resblks = min_t(xfs_fileoff_t, (e - s),
				(XFS_MAX_BMBT_EXTLEN * nimaps));
		if (unlikely(rt)) {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			rblocks = resblks;
		} else {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
			rblocks = 0;
		}

		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				dblocks, rblocks, false, &tp);
		if (error)
			break;

		error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
				XFS_IEXT_ADD_NOSPLIT_CNT);
		if (error)
			goto error;

		/*
		 * If the allocator cannot find a single free extent large
		 * enough to cover the start block of the requested range,
		 * xfs_bmapi_write will return -ENOSR.
		 *
		 * In that case we simply need to keep looping with the same
		 * startoffset_fsb so that one of the following allocations
		 * will eventually reach the requested range.
		 */
		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
				&nimaps);
		if (error) {
			if (error != -ENOSR)
				goto error;
			error = 0;
		} else {
			startoffset_fsb += imapp->br_blockcount;
			allocatesize_fsb -= imapp->br_blockcount;
		}

		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	return error;

error:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
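
/*
 * Worked example of the extent size hint rounding above: with extsz = 16,
 * startoffset_fsb = 10 and allocatesize_fsb = 20, s rounds down to 0, and e
 * starts at 30, gains the start offset's remainder (10) to become 40, and is
 * then rounded up to the next extsz multiple, 48. The reservation therefore
 * covers whole extent-size chunks even though the request straddles them.
 */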
static int
xfs_unmap_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		startoffset_fsb,
	xfs_filblks_t		len_fsb,
	int			*done)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	int			error;

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
			false, &tp);
	if (error)
		return error;

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
/* Caller must first wait for the completion of any pending DIOs if required. */
int
xfs_flush_unmap_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		rounding, start, end;
	int			error;

	/*
	 * Make sure we extend the flush out to extent alignment
	 * boundaries so any extent range overlapping the start/end
	 * of the modification we are about to do is clean and idle.
	 */
	rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE);
	start = rounddown_64(offset, rounding);
	end = roundup_64(offset + len, rounding) - 1;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;
	truncate_pagecache_range(inode, start, end);
	return 0;
}
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			done = 0, error;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)	/* if nothing being freed */
		return 0;

	/*
	 * Now AIO and DIO have drained we flush and (if necessary) invalidate
	 * the cached range over the first operation we are about to run.
	 */
	error = xfs_flush_unmap_range(ip, offset, len);
	if (error)
		return error;

	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/* We can only free complete realtime extents. */
	if (xfs_inode_has_bigrtalloc(ip)) {
		startoffset_fsb = xfs_fileoff_roundup_rtx(mp, startoffset_fsb);
		endoffset_fsb = xfs_fileoff_rounddown_rtx(mp, endoffset_fsb);
	}

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 */
	if (endoffset_fsb > startoffset_fsb) {
		while (!done) {
			error = xfs_unmap_extent(ip, startoffset_fsb,
					endoffset_fsb - startoffset_fsb, &done);
			if (error)
				return error;
		}
	}

	/*
	 * Now that we've unmapped all full blocks we'll have to zero out any
	 * partial block at the beginning and/or end. xfs_zero_range is smart
	 * enough to skip any holes, including those we just created, but we
	 * must take care not to zero beyond EOF and enlarge i_size.
	 */
	if (offset >= XFS_ISIZE(ip))
		return 0;
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	error = xfs_zero_range(ip, offset, len, NULL);
	if (error)
		return error;

	/*
	 * If we zeroed right up to EOF and EOF straddles a page boundary we
	 * must make sure that the post-EOF area is also zeroed because the
	 * page could be mmap'd and xfs_zero_range doesn't do that for us.
	 * Writeback of the eof page will do this, albeit clumsily.
	 */
	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
	}

	return error;
}
static int
xfs_prepare_shift(
	struct xfs_inode	*ip,
	loff_t			offset)
{
	unsigned int		rounding;
	int			error;

	/*
	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip)) {
		error = xfs_free_eofblocks(ip);
		if (error)
			return error;
	}

	/*
	 * Shift operations must stabilize the start block offset boundary along
	 * with the full range of the operation. If we don't, a COW writeback
	 * completion could race with an insert, front merge with the start
	 * extent (after split) during the shift and corrupt the file. Start
	 * with the allocation unit just prior to the start to stabilize the
	 * boundary.
	 */
	rounding = xfs_inode_alloc_unitsize(ip);
	offset = rounddown_64(offset, rounding);
	if (offset)
		offset -= rounding;

	/*
	 * Writeback and invalidate cache for the remainder of the file as we're
	 * about to shift down every extent from offset to EOF.
	 */
	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
	if (error)
		return error;

	/*
	 * Clean out anything hanging around in the cow fork now that
	 * we've flushed all the dirty data out to disk to avoid having
	 * CoW extents at the wrong offsets.
	 */
	if (xfs_inode_has_cow_data(ip)) {
		error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
				true);
		if (error)
			return error;
	}

	return 0;
}
/*
 * xfs_collapse_file_space()
 *	This routine frees disk space and shifts extents for the given file.
 *	The first thing we do is free data blocks in the specified range by
 *	calling xfs_free_file_space(). It also syncs dirty data and
 *	invalidates the page cache over the region on which the collapse
 *	range is working, then shifts extent records to the left to cover
 *	the hole.
 * RETURNS:
 *	0 on success
 *	errno on error
 */
int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		next_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_collapse_file_space(ip);

	error = xfs_free_file_space(ip, offset, len);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	while (!done) {
		error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
				&done);
		if (error)
			goto out_trans_cancel;
		if (done)
			break;

		/* finish any deferred frees and roll the transaction */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
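
/*
 * Example of the resulting layout: collapsing [4096, 8192) out of a 16KiB
 * file leaves a 12KiB file in which the data formerly at byte 8192 now lives
 * at byte 4096. This is the backend for
 * fallocate(FALLOC_FL_COLLAPSE_RANGE); i_size is updated by the caller.
 */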
/*
 * xfs_insert_file_space()
 *	This routine creates hole space by shifting extents for the given
 *	file. The first thing we do is sync dirty data and invalidate the
 *	page cache over the region on which the insert range is working,
 *	then split an extent into two extents at the given offset by calling
 *	xfs_bmap_split_extent, and finally shift all extent records lying
 *	between [offset, last allocated extent] to the right to make room
 *	for the hole.
 * RETURNS:
 *	0 on success
 *	errno on error
 */
int
xfs_insert_file_space(
	struct xfs_inode	*ip,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		stop_fsb = XFS_B_TO_FSB(mp, offset);
	xfs_fileoff_t		next_fsb = NULLFSBLOCK;
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_insert_file_space(ip);

	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	/*
	 * The extent shifting code works on extent granularity. So, if stop_fsb
	 * is not the starting block of extent, we need to split the extent at
	 * stop_fsb.
	 */
	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
	if (error)
		goto out_trans_cancel;

	do {
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;

		error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb);
		if (error)
			goto out_trans_cancel;
	} while (!done);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
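
/*
 * Example of the resulting layout: inserting 4096 bytes at offset 4096 in a
 * 12KiB file produces a 16KiB file with a hole at [4096, 8192); the data
 * formerly at byte 4096 now lives at byte 8192. This is the backend for
 * fallocate(FALLOC_FL_INSERT_RANGE); growing i_size is left to the caller.
 */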
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6. If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. Basically we are putting the responsibility on
 * userspace to get this right.
 */
static int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_projid != tip->i_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * If the target inode has fewer extents than the temporary inode then
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_has_rmapbt(ip->i_mount))
		return 0;

	/*
	 * If the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are fewer extents in the temp inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will fit
	 * in the target and that it has enough extents to be in btree format
	 * in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(ip) &&
		    xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
			return -EINVAL;
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(tip) &&
		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}
static int
xfs_swap_extent_flush(
	struct xfs_inode	*ip)
{
	int			error;

	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	if (error)
		return error;
	truncate_pagecache_range(VFS_I(ip), 0, -1);

	/* Verify O_DIRECT for ftmp */
	if (VFS_I(ip)->i_mapping->nrpages)
		return -EINVAL;
	return 0;
}
/*
 * Move extents from one file to another, when rmap is enabled.
 */
STATIC int
xfs_swap_extent_rmap(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tip)
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_bmbt_irec	irec;
	struct xfs_bmbt_irec	uirec;
	struct xfs_bmbt_irec	tirec;
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	xfs_filblks_t		count_fsb;
	int			error;
	xfs_filblks_t		ilen;
	xfs_filblks_t		rlen;
	int			nimaps;
	uint64_t		tip_flags2;

	/*
	 * If the source file has shared blocks, we must flag the donor
	 * file as having shared blocks so that we get the shared-block
	 * rmap functions when we go to fix up the rmaps. The flags
	 * will be switched for real later.
	 */
	tip_flags2 = tip->i_diflags2;
	if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
		tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;

	offset_fsb = 0;
	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);

	while (count_fsb) {
		/* Read extent from the donor file */
		nimaps = 1;
		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
				&nimaps, 0);
		if (error)
			goto out;
		ASSERT(nimaps == 1);
		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);

		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
		ilen = tirec.br_blockcount;

		/* Unmap the old blocks in the source file. */
		while (tirec.br_blockcount) {
			ASSERT(tp->t_highest_agno == NULLAGNUMBER);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);

			/* Read extent from the source file */
			nimaps = 1;
			error = xfs_bmapi_read(ip, tirec.br_startoff,
					tirec.br_blockcount, &irec,
					&nimaps, 0);
			if (error)
				goto out;
			ASSERT(nimaps == 1);
			ASSERT(tirec.br_startoff == irec.br_startoff);
			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);

			/* Trim the extent. */
			uirec = tirec;
			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
					tirec.br_blockcount,
					irec.br_blockcount);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);

			if (xfs_bmap_is_real_extent(&uirec)) {
				error = xfs_iext_count_extend(tp, ip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			if (xfs_bmap_is_real_extent(&irec)) {
				error = xfs_iext_count_extend(tp, tip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			/* Remove the mapping from the donor file. */
			xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);

			/* Remove the mapping from the source file. */
			xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);

			/* Map the donor file's blocks into the source file. */
			xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);

			/* Map the source file's blocks into the donor file. */
			xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);

			error = xfs_defer_finish(tpp);
			tp = *tpp;
			if (error)
				goto out;

			tirec.br_startoff += rlen;
			if (tirec.br_startblock != HOLESTARTBLOCK &&
			    tirec.br_startblock != DELAYSTARTBLOCK)
				tirec.br_startblock += rlen;
			tirec.br_blockcount -= rlen;
		}

		/* Roll on... */
		count_fsb -= ilen;
		offset_fsb += ilen;
	}

	tip->i_diflags2 = tip_flags2;
	return 0;

out:
	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
	tip->i_diflags2 = tip_flags2;
	return error;
}
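
/*
 * Worked example of the trim above: if the donor mapping @tirec spans 100
 * blocks but the source mapping @irec found at the same offset spans only
 * 60, then rlen = 60: one 60-block piece is unmapped from both files and
 * cross-mapped, @tirec is advanced by 60 blocks, and the remaining 40 blocks
 * are processed on the next pass of the inner loop.
 */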
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tip,
	int			*src_log_flags,
	int			*target_log_flags)
{
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	int			error;

	/*
	 * Count the number of extended attribute blocks
	 */
	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
	    ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
	    tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in the
	 * event of a crash. Set the owner change log flags now and leave the
	 * bmbt scan as the last step.
	 */
	if (xfs_has_v3inodes(ip->i_mount)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*target_log_flags) |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*src_log_flags) |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values
	 */
	tmp = (uint64_t)ip->i_nblocks;
	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
	tip->i_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*src_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*src_log_flags & XFS_ILOG_DOWNER));
		(*src_log_flags) |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*target_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		(*target_log_flags) |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	return 0;
}
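
/*
 * Worked example of the i_nblocks fixup above: the data forks swap but the
 * attr forks stay in place. If ip had i_nblocks = 1000 with aforkblks = 100
 * and tip had i_nblocks = 500 with taforkblks = 10, then afterwards
 * ip->i_nblocks = 500 - 10 + 100 = 590 (its own attr blocks plus tip's old
 * data blocks) and tip->i_nblocks = 1000 + 10 - 100 = 910.
 */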
/*
 * Fix up the owners of the bmbt blocks to refer to the current inode. The
 * change owner scan attempts to order all modified buffers in the current
 * transaction. In the event of ordered buffer failure, the offending buffer is
 * physically logged as a fallback and the scan returns -EAGAIN. We must roll
 * the transaction in this case to replenish the fallback log reservation and
 * restart the scan. This process repeats until the scan completes.
 */
static int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	int			error;
	struct xfs_trans	*tp = *tpp;

	do {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
					      NULL);
		/* success or fatal error */
		if (error != -EAGAIN)
			break;

		error = xfs_trans_roll(tpp);
		if (error)
			break;
		tp = *tpp;

		/*
		 * Redirty both inodes so they can relog and keep the log tail
		 * moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	} while (true);

	return error;
}
int
xfs_swap_extents(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	struct xfs_swapext	*sxp)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bstat	*sbp = &sxp->sx_stat;
	int			src_log_flags, target_log_flags;
	int			error = 0;
	uint64_t		f;
	int			resblks = 0;
	unsigned int		flags = 0;
	struct timespec64	ctime, mtime;

	/*
	 * Lock the inodes against other IO, page faults and truncate to
	 * begin with. Then we can ensure the inodes are flushed and have no
	 * page cache safely. Once we have done this we can take the ilocks and
	 * do the rest of the checks.
	 */
	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
				    VFS_I(tip)->i_mapping);

	/* Verify that both files have the same format */
	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/*
	 * The rmapbt implementation is unable to resume a swapext operation
	 * after a crash if the allocation unit size is larger than a block.
	 * This (deprecated) interface will not be upgraded to handle this
	 * situation. Defragmentation must be performed with the commit range
	 * ioctl.
	 */
	if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(ip->i_mount)) {
		error = -EOPNOTSUPP;
		goto out_unlock;
	}

	error = xfs_qm_dqattach(ip);
	if (error)
		goto out_unlock;

	error = xfs_qm_dqattach(tip);
	if (error)
		goto out_unlock;

	error = xfs_swap_extent_flush(ip);
	if (error)
		goto out_unlock;
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	if (xfs_inode_has_cow_data(tip)) {
		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
		if (error)
			goto out_unlock;
	}

	/*
	 * Extent "swapping" with rmap requires a permanent reservation and
	 * a block reservation because it's really just a remap operation
	 * performed with log redo items!
	 */
	if (xfs_has_rmapbt(mp)) {
		int		w = XFS_DATA_FORK;
		uint32_t	ipnext = ip->i_df.if_nextents;
		uint32_t	tipnext	= tip->i_df.if_nextents;

		/*
		 * Conceptually this shouldn't affect the shape of either bmbt,
		 * but since we atomically move extents one by one, we reserve
		 * enough space to rebuild both trees.
		 */
		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
		resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

		/*
		 * If either inode straddles a bmapbt block allocation boundary,
		 * the rmapbt algorithm triggers repeated allocs and frees as
		 * extents are remapped. This can exhaust the block reservation
		 * prematurely and cause shutdown. Return freed blocks to the
		 * transaction reservation to counter this behavior.
		 */
		flags |= XFS_TRANS_RES_FDBLKS;
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
				&tp);
	if (error)
		goto out_unlock;

	/*
	 * Lock and join the inodes to the transaction so that transaction
	 * commit or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ijoin(tp, tip, 0);

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_disk_size ||
	    sxp->sx_length != tip->i_disk_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in. If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	ctime = inode_get_ctime(VFS_I(ip));
	mtime = inode_get_mtime(VFS_I(ip));
	if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	if (xfs_has_rmapbt(mp))
		error = xfs_swap_extent_rmap(&tp, ip, tip);
	else
		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
				&target_log_flags);
	if (error)
		goto out_trans_cancel;

	/* Do we have to swap reflink flags? */
	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
		f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_has_reflink(mp)) {
		ASSERT(!ip->i_cowfp ||
		       ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(!tip->i_cowfp ||
		       tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cowfp, tip->i_cowfp);

		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(tp, ip, src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, ip, tip);
		if (error)
			goto out_trans_cancel;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, tip, ip);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

out_unlock_ilock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
	filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
				      VFS_I(tip)->i_mapping);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock_ilock;
}