// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2012 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
/* Kernel only BMAP related definitions and functions */

/*
 * Convert the given file system block to a disk block. We have to treat it
 * differently based on whether the file is a real time file or not, because
 * the bmap code does.
 */
xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
	if (XFS_IS_REALTIME_INODE(ip))
		return XFS_FSB_TO_BB(ip->i_mount, fsb);
	return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
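/*
 * Editor's worked example (not part of the original source, assuming a
 * 4096-byte block size): realtime file block numbers are linear, so
 * XFS_FSB_TO_BB() is a plain shift and fsblock 100 maps to basic block
 * 800 (8 512-byte sectors per block). Data device fsblocks are
 * AG-encoded, which is why XFS_FSB_TO_DADDR() must decode the AG number
 * and block offset before shifting.
 */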
/*
 * Routine to zero an extent on disk allocated to the specific inode.
 *
 * The VFS functions take a linearised filesystem block offset, so we have to
 * convert the sparse xfs fsb to the right format first.
 * VFS types are real funky, too.
 */
int
xfs_zero_extent(
	struct xfs_inode	*ip,
	xfs_fsblock_t		start_fsb,
	xfs_off_t		count_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	xfs_daddr_t		sector = xfs_fsb_to_db(ip, start_fsb);
	sector_t		block = XFS_BB_TO_FSBT(mp, sector);

	return blkdev_issue_zeroout(target->bt_bdev,
		block << (mp->m_super->s_blocksize_bits - 9),
		count_fsb << (mp->m_super->s_blocksize_bits - 9),
		GFP_KERNEL, 0);
}
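/*
 * Editor's note with a worked example (assumes a 4096-byte block size):
 * s_blocksize_bits is 12, so the shift above is 12 - 9 = 3, i.e. one
 * fsblock covers 2^3 = 8 512-byte sectors. Zeroing count_fsb = 4 blocks
 * starting at fsblock 100 therefore issues a zeroout over sectors
 * [800, 832).
 */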
/*
 * Extent tree block counting routines.
 */

/*
 * Count leaf blocks given a range of extent records.  Delayed allocation
 * extents are not counted towards the totals.
 */
xfs_extnum_t
xfs_bmap_count_leaves(
	struct xfs_ifork	*ifp,
	xfs_filblks_t		*count)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		numrecs = 0;

	for_each_xfs_iext(ifp, &icur, &got) {
		if (!isnullstartblock(got.br_startblock)) {
			*count += got.br_blockcount;
			numrecs++;
		}
	}

	return numrecs;
}
/*
 * Count fsblocks of the given fork.  Delayed allocation extents are
 * not counted towards the totals.
 */
int
xfs_bmap_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_extnum_t		*nextents,
	xfs_filblks_t		*count)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_btree_cur	*cur;
	xfs_extlen_t		btblocks = 0;
	int			error;

	*nextents = 0;
	*count = 0;

	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iread_extents(tp, ip, whichfork);
		if (error)
			return error;

		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		error = xfs_btree_count_blocks(cur, &btblocks);
		xfs_btree_del_cursor(cur, error);
		if (error)
			return error;

		/*
		 * xfs_btree_count_blocks includes the root block contained in
		 * the inode fork in @btblocks, so subtract one because we're
		 * only interested in allocated disk blocks.
		 */
		*count += btblocks - 1;
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		*nextents = xfs_bmap_count_leaves(ifp, count);
		break;
	}

	return 0;
}
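/*
 * Editor's usage sketch (hypothetical caller, not in the original
 * source): counting the data fork without a transaction context:
 *
 *	xfs_extnum_t	nextents;
 *	xfs_filblks_t	count;
 *	int		error;
 *
 *	error = xfs_bmap_count_blocks(NULL, ip, XFS_DATA_FORK,
 *			&nextents, &count);
 *
 * On success @count holds the allocated fsblocks (bmbt blocks included,
 * minus the incore root) and @nextents the non-delalloc extent records.
 */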
static int
xfs_getbmap_report_one(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	struct xfs_bmbt_irec	*got)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;
	bool			shared = false;
	int			error;

	error = xfs_reflink_trim_around_shared(ip, got, &shared);
	if (error)
		return error;

	if (isnullstartblock(got->br_startblock) ||
	    got->br_startblock == DELAYSTARTBLOCK) {
		/*
		 * Take the flush completion as being a point-in-time snapshot
		 * where there are no delalloc extents, and if any new ones
		 * have been created racily, just skip them as being 'after'
		 * the flush and so don't get reported.
		 */
		if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
			return 0;

		p->bmv_oflags |= BMV_OF_DELALLOC;
		p->bmv_block = -2;
	} else {
		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
	}

	if (got->br_state == XFS_EXT_UNWRITTEN &&
	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
		p->bmv_oflags |= BMV_OF_PREALLOC;

	if (shared)
		p->bmv_oflags |= BMV_OF_SHARED;

	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
	return 0;
}
static void
xfs_getbmap_report_hole(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	xfs_fileoff_t		bno,
	xfs_fileoff_t		end)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;

	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
		return;

	p->bmv_block = -1;
	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
}
static inline bool
xfs_getbmap_full(
	struct getbmapx		*bmv)
{
	return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
}
static bool
xfs_getbmap_next_rec(
	struct xfs_bmbt_irec	*rec,
	xfs_fileoff_t		total_end)
{
	xfs_fileoff_t		end = rec->br_startoff + rec->br_blockcount;

	if (end == total_end)
		return false;

	rec->br_startoff += rec->br_blockcount;
	if (!isnullstartblock(rec->br_startblock) &&
	    rec->br_startblock != DELAYSTARTBLOCK)
		rec->br_startblock += rec->br_blockcount;
	rec->br_blockcount = total_end - end;
	return true;
}
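/*
 * Editor's illustration (not in the original source): report_one trims
 * the record to its leading shared or unshared part and next_rec then
 * advances past what was reported. For a 100-block record whose first
 * 40 blocks are shared, the do/while loop in xfs_getbmap() below emits
 * two getbmapx entries:
 *
 *	[br_startoff, br_startoff + 40)		with BMV_OF_SHARED set
 *	[br_startoff + 40, br_startoff + 100)	without it
 */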
/*
 * Get inode's extents as described in bmv, and format for output.
 * Calls formatter to fill the user's buffer until all extents
 * are mapped, until the passed-in bmv->bmv_count slots have
 * been filled, or until the formatter short-circuits the loop,
 * if it is tracking filled-in extents on its own.
 */
int
xfs_getbmap(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,		/* user bmap structure */
	struct kgetbmap		*out)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			iflags = bmv->bmv_iflags;
	int			whichfork, lock, error = 0;
	int64_t			bmv_end, max_len;
	xfs_fileoff_t		bno, first_bno;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_irec	got, rec;
	xfs_filblks_t		len;
	struct xfs_iext_cursor	icur;

	if (bmv->bmv_iflags & ~BMV_IF_VALID)
		return -EINVAL;
#ifndef DEBUG
	/* Only allow CoW fork queries if we're debugging. */
	if (iflags & BMV_IF_COWFORK)
		return -EINVAL;
#endif
	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
		return -EINVAL;

	if (bmv->bmv_length < -1)
		return -EINVAL;
	bmv->bmv_entries = 0;
	if (bmv->bmv_length == 0)
		return 0;

	if (iflags & BMV_IF_ATTRFORK)
		whichfork = XFS_ATTR_FORK;
	else if (iflags & BMV_IF_COWFORK)
		whichfork = XFS_COW_FORK;
	else
		whichfork = XFS_DATA_FORK;

	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	switch (whichfork) {
	case XFS_ATTR_FORK:
		lock = xfs_ilock_attr_map_shared(ip);
		if (!xfs_inode_has_attr_fork(ip))
			goto out_unlock_ilock;

		max_len = 1LL << 32;
		break;
	case XFS_COW_FORK:
		lock = XFS_ILOCK_SHARED;
		xfs_ilock(ip, lock);

		/* No CoW fork? Just return */
		if (!xfs_ifork_ptr(ip, whichfork))
			goto out_unlock_ilock;

		if (xfs_get_cowextsz_hint(ip))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);
		break;
	case XFS_DATA_FORK:
		if (!(iflags & BMV_IF_DELALLOC) &&
		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (error)
				goto out_unlock_iolock;

			/*
			 * Even after flushing the inode, there can still be
			 * delalloc blocks on the inode beyond EOF due to
			 * speculative preallocation. These are not removed
			 * until the release function is called or the inode
			 * is inactivated. Hence we cannot assert here that
			 * ip->i_delayed_blks == 0.
			 */
		}

		if (xfs_get_extsz_hint(ip) ||
		    (ip->i_diflags & XFS_DIFLAG_PREALLOC))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);

		lock = xfs_ilock_data_map_shared(ip);
		break;
	}

	ifp = xfs_ifork_ptr(ip, whichfork);

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		break;
	case XFS_DINODE_FMT_LOCAL:
		/* Local format inode forks report no extents. */
		goto out_unlock_ilock;
	default:
		error = -EINVAL;
		goto out_unlock_ilock;
	}

	if (bmv->bmv_length == -1) {
		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
	}

	bmv_end = bmv->bmv_offset + bmv->bmv_length;

	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);

	error = xfs_iread_extents(NULL, ip, whichfork);
	if (error)
		goto out_unlock_ilock;

	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
		/*
		 * Report a whole-file hole if the delalloc flag is set to
		 * stay compatible with the old implementation.
		 */
		if (iflags & BMV_IF_DELALLOC)
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
		goto out_unlock_ilock;
	}

	while (!xfs_getbmap_full(bmv)) {
		xfs_trim_extent(&got, first_bno, len);

		/*
		 * Report an entry for a hole if this extent doesn't directly
		 * follow the previous one.
		 */
		if (got.br_startoff > bno) {
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					got.br_startoff);
			if (xfs_getbmap_full(bmv))
				break;
		}

		/*
		 * In order to report shared extents accurately, we report each
		 * distinct shared / unshared part of a single bmbt record with
		 * an individual getbmapx record.
		 */
		bno = got.br_startoff + got.br_blockcount;
		rec = got;
		do {
			error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
					&rec);
			if (error || xfs_getbmap_full(bmv))
				goto out_unlock_ilock;
		} while (xfs_getbmap_next_rec(&rec, bno));

		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

			if (bmv->bmv_entries > 0)
				out[bmv->bmv_entries - 1].bmv_oflags |=
								BMV_OF_LAST;

			if (whichfork != XFS_ATTR_FORK && bno < end &&
			    !xfs_getbmap_full(bmv)) {
				xfs_getbmap_report_hole(ip, bmv, out,
						bmv_end, bno, end);
			}
			break;
		}

		if (bno >= first_bno + len)
			break;
	}

out_unlock_ilock:
	xfs_iunlock(ip, lock);
out_unlock_iolock:
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}
/*
 * Dead simple method of punching delayed allocation blocks from a range in
 * the inode.  This will always punch out both the start and end blocks, even
 * if the ranges only partially overlap them, so it is up to the caller to
 * ensure that partial blocks are not passed in.
 */
void
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	xfs_off_t		start_byte,
	xfs_off_t		end_byte)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = &ip->i_df;
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, start_byte);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end_byte);
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;

	ASSERT(!xfs_need_iread_extents(ifp));

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		goto out_unlock;

	while (got.br_startoff + got.br_blockcount > start_fsb) {
		del = got;
		xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);

		/*
		 * A delete can push the cursor forward. Step back to the
		 * previous extent on non-delalloc or extents outside the
		 * target range.
		 */
		if (!del.br_blockcount ||
		    !isnullstartblock(del.br_startblock)) {
			if (!xfs_iext_prev_extent(ifp, &icur, &got))
				break;
			continue;
		}

		xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del);
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
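/*
 * Editor's note with a hedged example (assumes 4096-byte blocks): a
 * caller punching the byte range [1000, 9000) maps to fsblocks [0, 3),
 * so blocks 0 and 2 are punched in full even though only bytes
 * 1000-4095 and 8192-8999 of them fall inside the range - hence the
 * warning above about partial blocks.
 */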
/*
 * Test whether it is appropriate to check an inode for and free post EOF
 * blocks.
 */
bool
xfs_can_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			found_blocks = false;
	xfs_fileoff_t		end_fsb;
	xfs_fileoff_t		last_fsb;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;

	/*
	 * Caller must either hold the exclusive io lock or be inactivating
	 * the inode, which guarantees there are no other users of the inode.
	 */
	if (!(VFS_I(ip)->i_state & I_FREEING))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);

	/* prealloc/delalloc exists only on regular files */
	if (!S_ISREG(VFS_I(ip)->i_mode))
		return false;

	/*
	 * Zero sized files with no cached pages and delalloc blocks will not
	 * have speculative prealloc/delalloc blocks to remove.
	 */
	if (VFS_I(ip)->i_size == 0 &&
	    VFS_I(ip)->i_mapping->nrpages == 0 &&
	    ip->i_delayed_blks == 0)
		return false;

	/* If we haven't read in the extent list, then don't do it now. */
	if (xfs_need_iread_extents(&ip->i_df))
		return false;

	/*
	 * Do not free real extents in preallocated files unless the file has
	 * delalloc blocks and we are forced to remove them.
	 */
	if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks)
		return false;

	/*
	 * Do not try to free post-EOF blocks if EOF is beyond the end of the
	 * range supported by the page cache, because the truncation will loop
	 * forever.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	if (xfs_inode_has_bigrtalloc(ip))
		end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return false;

	/*
	 * Check if there is a post-EOF extent to free.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
		found_blocks = true;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return found_blocks;
}
/*
 * This is called to free any blocks beyond eof. The caller must hold
 * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
 * reference to the inode.
 */
int
xfs_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/* Attach the dquots to the inode up front. */
	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	/* Wait on dio to ensure i_size has settled. */
	inode_dio_wait(VFS_I(ip));

	/*
	 * For preallocated files only free delayed allocations.
	 *
	 * Note that this means we also leave speculative preallocations in
	 * place for preallocated files.
	 */
	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
		if (ip->i_delayed_blks) {
			xfs_bmap_punch_delalloc_range(ip,
				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
				LLONG_MAX);
		}
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Do not update the on-disk file size.  If we update the on-disk file
	 * size and then the system crashes before the contents of the file are
	 * flushed to disk then the files may be full of holes (ie NULL files
	 * bug).
	 */
	error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
				XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
	if (error)
		goto err_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_unlock;

	xfs_inode_clear_eofblocks_tag(ip);
	goto out_unlock;

err_cancel:
	/*
	 * If we get an error at this point we simply don't
	 * bother truncating the file.
	 */
	xfs_trans_cancel(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
int
xfs_alloc_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	xfs_mount_t		*mp = ip->i_mount;
	xfs_off_t		count;
	xfs_filblks_t		allocatesize_fsb;
	xfs_extlen_t		extsz, temp;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			rt;
	xfs_trans_t		*tp;
	xfs_bmbt_irec_t		imaps[1], *imapp;
	int			error;

	if (xfs_is_always_cow_inode(ip))
		return 0;

	trace_xfs_alloc_file_space(ip);

	if (xfs_is_shutdown(mp))
		return -EIO;

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)
		return -EINVAL;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	count = len;
	imapp = &imaps[0];
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
	allocatesize_fsb = endoffset_fsb - startoffset_fsb;

	/*
	 * Allocate file space until done or until there is an error
	 */
	while (allocatesize_fsb && !error) {
		xfs_fileoff_t	s, e;
		unsigned int	dblocks, rblocks, resblks;
		int		nimaps = 1;

		/*
		 * Determine space reservations for data/realtime.
		 */
		if (unlikely(extsz)) {
			s = startoffset_fsb;
			do_div(s, extsz);
			s *= extsz;
			e = startoffset_fsb + allocatesize_fsb;
			div_u64_rem(startoffset_fsb, extsz, &temp);
			if (temp)
				e += temp;
			div_u64_rem(e, extsz, &temp);
			if (temp)
				e += extsz - temp;
		} else {
			s = 0;
			e = allocatesize_fsb;
		}

		/*
		 * The transaction reservation is limited to a 32-bit block
		 * count, hence we need to limit the number of blocks we are
		 * trying to reserve to avoid an overflow. We can't allocate
		 * more than @nimaps extents, and an extent is limited on disk
		 * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
		 * limit.
		 */
		resblks = min_t(xfs_fileoff_t, (e - s),
				(XFS_MAX_BMBT_EXTLEN * nimaps));
		if (unlikely(rt)) {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			rblocks = resblks;
		} else {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
			rblocks = 0;
		}

		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				dblocks, rblocks, false, &tp);
		if (error)
			break;

		error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
				XFS_IEXT_ADD_NOSPLIT_CNT);
		if (error)
			goto error;

		/*
		 * If the allocator cannot find a single free extent large
		 * enough to cover the start block of the requested range,
		 * xfs_bmapi_write will return -ENOSR.
		 *
		 * In that case we simply need to keep looping with the same
		 * startoffset_fsb so that one of the following allocations
		 * will eventually reach the requested range.
		 */
		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
				&nimaps);
		if (error) {
			if (error != -ENOSR)
				goto error;
			error = 0;
		} else {
			startoffset_fsb += imapp->br_blockcount;
			allocatesize_fsb -= imapp->br_blockcount;
		}

		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	return error;

error:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
static int
xfs_unmap_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		startoffset_fsb,
	xfs_filblks_t		len_fsb,
	int			*done)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	int			error;

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
			false, &tp);
	if (error)
		return error;

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
/* Caller must first wait for the completion of any pending DIOs if required. */
int
xfs_flush_unmap_range(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct inode		*inode = VFS_I(ip);
	xfs_off_t		rounding, start, end;
	int			error;

	/*
	 * Make sure we extend the flush out to extent alignment
	 * boundaries so any extent range overlapping the start/end
	 * of the modification we are about to do is clean and idle.
	 */
	rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE);
	start = rounddown_64(offset, rounding);
	end = roundup_64(offset + len, rounding) - 1;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;
	truncate_pagecache_range(inode, start, end);
	return 0;
}
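/*
 * Editor's worked example (assumes 4096-byte blocks and pages and no
 * extent size hint, so rounding == 4096): a call with offset = 6000 and
 * len = 100 flushes and invalidates bytes [4096, 8191], covering the
 * whole allocation unit around the modification.
 */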
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			done = 0, error;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)	/* if nothing being freed */
		return 0;

	/*
	 * Now AIO and DIO have drained, we flush and (if necessary) invalidate
	 * the cached range over the first operation we are about to run.
	 */
	error = xfs_flush_unmap_range(ip, offset, len);
	if (error)
		return error;

	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/* We can only free complete realtime extents. */
	if (xfs_inode_has_bigrtalloc(ip)) {
		startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
		endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
	}

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 */
	if (endoffset_fsb > startoffset_fsb) {
		while (!done) {
			error = xfs_unmap_extent(ip, startoffset_fsb,
					endoffset_fsb - startoffset_fsb, &done);
			if (error)
				return error;
		}
	}

	/*
	 * Now that we've unmapped all full blocks, we'll have to zero out any
	 * partial block at the beginning and/or end.  xfs_zero_range is smart
	 * enough to skip any holes, including those we just created, but we
	 * must take care not to zero beyond EOF and enlarge i_size.
	 */
	if (offset >= XFS_ISIZE(ip))
		return 0;
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	error = xfs_zero_range(ip, offset, len, NULL);
	if (error)
		return error;

	/*
	 * If we zeroed right up to EOF and EOF straddles a page boundary we
	 * must make sure that the post-EOF area is also zeroed because the
	 * page could be mmap'd and xfs_zero_range doesn't do that for us.
	 * Writeback of the eof page will do this, albeit clumsily.
	 */
	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
	}

	return error;
}
static int
xfs_prepare_shift(
	struct xfs_inode	*ip,
	loff_t			offset)
{
	unsigned int		rounding;
	int			error;

	/*
	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip)) {
		error = xfs_free_eofblocks(ip);
		if (error)
			return error;
	}

	/*
	 * Shift operations must stabilize the start block offset boundary along
	 * with the full range of the operation. If we don't, a COW writeback
	 * completion could race with an insert, front merge with the start
	 * extent (after split) during the shift and corrupt the file. Start
	 * with the allocation unit just prior to the start to stabilize the
	 * boundary.
	 */
	rounding = xfs_inode_alloc_unitsize(ip);
	offset = rounddown_64(offset, rounding);
	if (offset)
		offset -= rounding;

	/*
	 * Writeback and invalidate cache for the remainder of the file as we're
	 * about to shift down every extent from offset to EOF.
	 */
	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
	if (error)
		return error;

	/*
	 * Clean out anything hanging around in the cow fork now that
	 * we've flushed all the dirty data out to disk to avoid having
	 * CoW extents at the wrong offsets.
	 */
	if (xfs_inode_has_cow_data(ip)) {
		error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
				true);
		if (error)
			return error;
	}

	return 0;
}
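/*
 * Editor's illustration (assumed values, not in the original source):
 * with a 4096-byte allocation unit, a shift starting at offset 10000
 * rounds down to 8192 and then steps back one more unit to 4096, so
 * the flush also stabilizes the extent just before the shift boundary.
 */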
/*
 * xfs_collapse_file_space()
 *	This routine frees disk space and shifts extents for the given file.
 *	The first thing we do is to free data blocks in the specified range
 *	by calling xfs_free_file_space(). It also syncs dirty data and
 *	invalidates page cache over the region on which the collapse range
 *	is working. Extent records are then shifted to the left to cover
 *	the hole.
 */
int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		next_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_collapse_file_space(ip);

	error = xfs_free_file_space(ip, offset, len);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	while (!done) {
		error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
				&done);
		if (error)
			goto out_trans_cancel;
		if (done)
			break;

		/* finish any deferred frees and roll the transaction */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * xfs_insert_file_space()
 *	This routine creates hole space by shifting extents for the given file.
 *	The first thing we do is to sync dirty data and invalidate page cache
 *	over the region on which the insert range is working. Then an extent is
 *	split into two extents at the given offset by calling
 *	xfs_bmap_split_extent. Finally all extent records lying between
 *	[offset, last allocated extent] are shifted to the right to make room
 *	for the hole range.
 */
int
xfs_insert_file_space(
	struct xfs_inode	*ip,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		stop_fsb = XFS_B_TO_FSB(mp, offset);
	xfs_fileoff_t		next_fsb = NULLFSBLOCK;
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_insert_file_space(ip);

	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
	if (error)
		return error;

	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	/*
	 * The extent shifting code works on extent granularity. So, if stop_fsb
	 * is not the starting block of extent, we need to split the extent at
	 * stop_fsb.
	 */
	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
	if (error)
		goto out_trans_cancel;

	do {
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;

		error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb);
		if (error)
			goto out_trans_cancel;
	} while (!done);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6.  If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt; basically we are putting the responsibility on
 * userspace to get this right.
 */
static int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_projid != tip->i_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * If the target inode has fewer extents than the temporary inode,
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_has_rmapbt(ip->i_mount))
		return 0;

	/*
	 * If the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are fewer extents in the temp inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will fit
	 * in the target and that it has enough extents to be in btree format
	 * in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format.
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(ip) &&
		    xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
			return -EINVAL;
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(tip) &&
		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}
static int
xfs_swap_extent_flush(
	struct xfs_inode	*ip)
{
	int	error;

	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
	if (error)
		return error;
	truncate_pagecache_range(VFS_I(ip), 0, -1);

	/* Verify O_DIRECT for ftmp */
	if (VFS_I(ip)->i_mapping->nrpages)
		return -EINVAL;
	return 0;
}
/*
 * Move extents from one file to another, when rmap is enabled.
 */
STATIC int
xfs_swap_extent_rmap(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tip)
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_bmbt_irec	irec;
	struct xfs_bmbt_irec	uirec;
	struct xfs_bmbt_irec	tirec;
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	xfs_filblks_t		count_fsb;
	int			error;
	xfs_filblks_t		ilen;
	xfs_filblks_t		rlen;
	int			nimaps;
	uint64_t		tip_flags2;

	/*
	 * If the source file has shared blocks, we must flag the donor
	 * file as having shared blocks so that we get the shared-block
	 * rmap functions when we go to fix up the rmaps.  The flags
	 * will be switched for real later.
	 */
	tip_flags2 = tip->i_diflags2;
	if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
		tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;

	offset_fsb = 0;
	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);

	while (count_fsb) {
		/* Read extent from the donor file */
		nimaps = 1;
		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
				&nimaps, 0);
		if (error)
			goto out;
		ASSERT(nimaps == 1);
		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);

		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
		ilen = tirec.br_blockcount;

		/* Unmap the old blocks in the source file. */
		while (tirec.br_blockcount) {
			ASSERT(tp->t_highest_agno == NULLAGNUMBER);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);

			/* Read extent from the source file */
			nimaps = 1;
			error = xfs_bmapi_read(ip, tirec.br_startoff,
					tirec.br_blockcount, &irec,
					&nimaps, 0);
			if (error)
				goto out;
			ASSERT(nimaps == 1);
			ASSERT(tirec.br_startoff == irec.br_startoff);
			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);

			/* Trim the extent. */
			uirec = tirec;
			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
					tirec.br_blockcount,
					irec.br_blockcount);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);

			if (xfs_bmap_is_real_extent(&uirec)) {
				error = xfs_iext_count_extend(tp, ip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			if (xfs_bmap_is_real_extent(&irec)) {
				error = xfs_iext_count_extend(tp, tip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			/* Remove the mapping from the donor file. */
			xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);

			/* Remove the mapping from the source file. */
			xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);

			/* Map the donor file's blocks into the source file. */
			xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);

			/* Map the source file's blocks into the donor file. */
			xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);

			error = xfs_defer_finish(tpp);
			tp = *tpp;
			if (error)
				goto out;

			tirec.br_startoff += rlen;
			if (tirec.br_startblock != HOLESTARTBLOCK &&
			    tirec.br_startblock != DELAYSTARTBLOCK)
				tirec.br_startblock += rlen;
			tirec.br_blockcount -= rlen;
		}

		/* Roll on... */
		count_fsb -= ilen;
		offset_fsb += ilen;
	}

	tip->i_diflags2 = tip_flags2;
	return 0;

out:
	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
	tip->i_diflags2 = tip_flags2;
	return error;
}
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tip,
	int			*src_log_flags,
	int			*target_log_flags)
{
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	int			error;

	/*
	 * Count the number of extended attribute blocks
	 */
	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
	    ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
	    tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in the
	 * event of a crash. Set the owner change log flags now and leave the
	 * bmbt scan as the last step.
	 */
	if (xfs_has_v3inodes(ip->i_mount)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*target_log_flags) |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*src_log_flags) |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values
	 */
	tmp = (uint64_t)ip->i_nblocks;
	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
	tip->i_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*src_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*src_log_flags & XFS_ILOG_DOWNER));
		(*src_log_flags) |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*target_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		(*target_log_flags) |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	return 0;
}
/*
 * Fix up the owners of the bmbt blocks to refer to the current inode. The
 * change owner scan attempts to order all modified buffers in the current
 * transaction. In the event of ordered buffer failure, the offending buffer is
 * physically logged as a fallback and the scan returns -EAGAIN. We must roll
 * the transaction in this case to replenish the fallback log reservation and
 * restart the scan. This process repeats until the scan completes.
 */
static int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	int			error;
	struct xfs_trans	*tp = *tpp;

	do {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
					      NULL);
		/* success or fatal error */
		if (error != -EAGAIN)
			break;

		error = xfs_trans_roll(tpp);
		if (error)
			break;
		tp = *tpp;

		/*
		 * Redirty both inodes so they can relog and keep the log tail
		 * moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	} while (true);

	return error;
}
int
xfs_swap_extents(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	struct xfs_swapext	*sxp)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bstat	*sbp = &sxp->sx_stat;
	int			src_log_flags, target_log_flags;
	int			error = 0;
	uint64_t		f;
	int			resblks = 0;
	unsigned int		flags = 0;
	struct timespec64	ctime, mtime;

	/*
	 * Lock the inodes against other IO, page faults and truncate to
	 * begin with.  Then we can ensure the inodes are flushed and have no
	 * page cache safely. Once we have done this we can take the ilocks and
	 * do the rest of the checks.
	 */
	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
				    VFS_I(tip)->i_mapping);

	/* Verify that both files have the same format */
	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = -EINVAL;
		goto out_unlock;
	}

	error = xfs_qm_dqattach(ip);
	if (error)
		goto out_unlock;

	error = xfs_qm_dqattach(tip);
	if (error)
		goto out_unlock;

	error = xfs_swap_extent_flush(ip);
	if (error)
		goto out_unlock;
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	if (xfs_inode_has_cow_data(tip)) {
		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
		if (error)
			goto out_unlock;
	}

	/*
	 * Extent "swapping" with rmap requires a permanent reservation and
	 * a block reservation because it's really just a remap operation
	 * performed with log redo items!
	 */
	if (xfs_has_rmapbt(mp)) {
		int		w = XFS_DATA_FORK;
		uint32_t	ipnext = ip->i_df.if_nextents;
		uint32_t	tipnext = tip->i_df.if_nextents;

		/*
		 * Conceptually this shouldn't affect the shape of either bmbt,
		 * but since we atomically move extents one by one, we reserve
		 * enough space to rebuild both trees.
		 */
		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
		resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

		/*
		 * If either inode straddles a bmapbt block allocation boundary,
		 * the rmapbt algorithm triggers repeated allocs and frees as
		 * extents are remapped. This can exhaust the block reservation
		 * prematurely and cause shutdown. Return freed blocks to the
		 * transaction reservation to counter this behavior.
		 */
		flags |= XFS_TRANS_RES_FDBLKS;
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
				&tp);
	if (error)
		goto out_unlock;

	/*
	 * Lock and join the inodes to the transaction so that transaction commit
	 * or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ijoin(tp, tip, 0);

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_disk_size ||
	    sxp->sx_length != tip->i_disk_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	ctime = inode_get_ctime(VFS_I(ip));
	mtime = inode_get_mtime(VFS_I(ip));
	if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	if (xfs_has_rmapbt(mp))
		error = xfs_swap_extent_rmap(&tp, ip, tip);
	else
		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
				&target_log_flags);
	if (error)
		goto out_trans_cancel;

	/* Do we have to swap reflink flags? */
	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
		f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_has_reflink(mp)) {
		ASSERT(!ip->i_cowfp ||
		       ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(!tip->i_cowfp ||
		       tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cowfp, tip->i_cowfp);

		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(tp, ip, src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, ip, tip);
		if (error)
			goto out_trans_cancel;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, tip, ip);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

out_unlock_ilock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
	filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
				      VFS_I(tip)->i_mapping);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock_ilock;
}