2 * Copyright (C) 2016 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
22 #include "xfs_shared.h"
23 #include "xfs_format.h"
24 #include "xfs_log_format.h"
25 #include "xfs_trans_resv.h"
26 #include "xfs_mount.h"
27 #include "xfs_defer.h"
28 #include "xfs_da_format.h"
29 #include "xfs_da_btree.h"
30 #include "xfs_inode.h"
31 #include "xfs_trans.h"
32 #include "xfs_inode_item.h"
34 #include "xfs_bmap_util.h"
35 #include "xfs_error.h"
37 #include "xfs_dir2_priv.h"
38 #include "xfs_ioctl.h"
39 #include "xfs_trace.h"
41 #include "xfs_icache.h"
43 #include "xfs_btree.h"
44 #include "xfs_refcount_btree.h"
45 #include "xfs_refcount.h"
46 #include "xfs_bmap_btree.h"
47 #include "xfs_trans_space.h"
49 #include "xfs_alloc.h"
50 #include "xfs_quota_defs.h"
51 #include "xfs_quota.h"
52 #include "xfs_btree.h"
53 #include "xfs_bmap_btree.h"
54 #include "xfs_reflink.h"
55 #include "xfs_iomap.h"
56 #include "xfs_rmap_btree.h"
58 #include "xfs_ag_resv.h"
61 * Copy on Write of Shared Blocks
63 * XFS must preserve "the usual" file semantics even when two files share
64 * the same physical blocks. This means that a write to one file must not
65 * alter the blocks in a different file; the way that we'll do that is
66 * through the use of a copy-on-write mechanism. At a high level, that
67 * means that when we want to write to a shared block, we allocate a new
68 * block, write the data to the new block, and if that succeeds we map the
69 * new block into the file.
71 * XFS provides a "delayed allocation" mechanism that defers the allocation
72 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
73 * possible. This reduces fragmentation by enabling the filesystem to ask
74 * for bigger chunks less often, which is exactly what we want for CoW.
76 * The delalloc mechanism begins when the kernel wants to make a block
77 * writable (write_begin or page_mkwrite). If the offset is not mapped, we
78 * create a delalloc mapping, which is a regular in-core extent, but without
79 * a real startblock. (For delalloc mappings, the startblock encodes both
80 * a flag that this is a delalloc mapping, and a worst-case estimate of how
81 * many blocks might be required to put the mapping into the BMBT.) Delalloc
82 * mappings are a reservation against the free space in the filesystem;
83 * adjacent mappings can also be combined into fewer larger mappings.
85 * As an optimization, the CoW extent size hint (cowextsz) creates
86 * outsized aligned delalloc reservations in the hope of landing out of
87 * order nearby CoW writes in a single extent on disk, thereby reducing
88 * fragmentation and improving future performance.
90 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
91 * C: ------DDDDDDD--------- (CoW fork)
93 * When dirty pages are being written out (typically in writepage), the
94 * delalloc reservations are converted into unwritten mappings by
95 * allocating blocks and replacing the delalloc mapping with real ones.
96 * A delalloc mapping can be replaced by several unwritten ones if the
97 * free space is fragmented.
99 * D: --RRRRRRSSSRRRRRRRR---
100 * C: ------UUUUUUU---------
102 * We want to adapt the delalloc mechanism for copy-on-write, since the
103 * write paths are similar. The first two steps (creating the reservation
104 * and allocating the blocks) are exactly the same as delalloc except that
105 * the mappings must be stored in a separate CoW fork because we do not want
106 * to disturb the mapping in the data fork until we're sure that the write
107 * succeeded. IO completion in this case is the process of removing the old
108 * mapping from the data fork and moving the new mapping from the CoW fork to
109 * the data fork. This will be discussed shortly.
111 * For now, unaligned directio writes will be bounced back to the page cache.
112 * Block-aligned directio writes will use the same mechanism as buffered
115 * Just prior to submitting the actual disk write requests, we convert
116 * the extents representing the range of the file actually being written
117 * (as opposed to extra pieces created for the cowextsize hint) to real
118 * extents. This will become important in the next step:
120 * D: --RRRRRRSSSRRRRRRRR---
121 * C: ------UUrrUUU---------
123 * CoW remapping must be done after the data block write completes,
124 * because we don't want to destroy the old data fork map until we're sure
125 * the new block has been written. Since the new mappings are kept in a
126 * separate fork, we can simply iterate these mappings to find the ones
127 * that cover the file blocks that we just CoW'd. For each extent, simply
128 * unmap the corresponding range in the data fork, map the new range into
129 * the data fork, and remove the extent from the CoW fork. Because of
130 * the presence of the cowextsize hint, however, we must be careful
131 * only to remap the blocks that we've actually written out -- we must
132 * never remap delalloc reservations nor CoW staging blocks that have
133 * yet to be written. This corresponds exactly to the real extents in
136 * D: --RRRRRRrrSRRRRRRRR---
137 * C: ------UU--UUU---------
139 * Since the remapping operation can be applied to an arbitrary file
140 * range, we record the need for the remap step as a flag in the ioend
141 * instead of declaring a new IO type. This is required for direct io
142 * because we only have one ioend for the whole dio, and we have to be able to
143 * remember the presence of unwritten blocks and CoW blocks with a single
144 * ioend structure. Better yet, the more ground we can cover with one
149 * Given an AG extent, find the lowest-numbered run of shared blocks
150 * within that range and return the range in fbno/flen. If
151 * find_end_of_shared is true, return the longest contiguous extent of
152 * shared blocks. If there are no shared extents, fbno and flen will
153 * be set to NULLAGBLOCK and 0, respectively.
156 xfs_reflink_find_shared(
157 struct xfs_mount
*mp
,
163 bool find_end_of_shared
)
165 struct xfs_buf
*agbp
;
166 struct xfs_btree_cur
*cur
;
169 error
= xfs_alloc_read_agf(mp
, NULL
, agno
, 0, &agbp
);
175 cur
= xfs_refcountbt_init_cursor(mp
, NULL
, agbp
, agno
, NULL
);
177 error
= xfs_refcount_find_shared(cur
, agbno
, aglen
, fbno
, flen
,
180 xfs_btree_del_cursor(cur
, error
? XFS_BTREE_ERROR
: XFS_BTREE_NOERROR
);
187 * Trim the mapping to the next block where there's a change in the
188 * shared/unshared status. More specifically, this means that we
189 * find the lowest-numbered extent of shared blocks that coincides with
190 * the given block mapping. If the shared extent overlaps the start of
191 * the mapping, trim the mapping to the end of the shared extent. If
192 * the shared region intersects the mapping, trim the mapping to the
193 * start of the shared extent. If there are no shared regions that
194 * overlap, just return the original extent.
197 xfs_reflink_trim_around_shared(
198 struct xfs_inode
*ip
,
199 struct xfs_bmbt_irec
*irec
,
210 /* Holes, unwritten, and delalloc extents cannot be shared */
211 if (!xfs_is_reflink_inode(ip
) ||
213 irec
->br_startblock
== HOLESTARTBLOCK
||
214 irec
->br_startblock
== DELAYSTARTBLOCK
||
215 isnullstartblock(irec
->br_startblock
)) {
220 trace_xfs_reflink_trim_around_shared(ip
, irec
);
222 agno
= XFS_FSB_TO_AGNO(ip
->i_mount
, irec
->br_startblock
);
223 agbno
= XFS_FSB_TO_AGBNO(ip
->i_mount
, irec
->br_startblock
);
224 aglen
= irec
->br_blockcount
;
226 error
= xfs_reflink_find_shared(ip
->i_mount
, agno
, agbno
,
227 aglen
, &fbno
, &flen
, true);
231 *shared
= *trimmed
= false;
232 if (fbno
== NULLAGBLOCK
) {
233 /* No shared blocks at all. */
235 } else if (fbno
== agbno
) {
237 * The start of this extent is shared. Truncate the
238 * mapping at the end of the shared region so that a
239 * subsequent iteration starts at the start of the
242 irec
->br_blockcount
= flen
;
249 * There's a shared extent midway through this extent.
250 * Truncate the mapping at the start of the shared
251 * extent so that a subsequent iteration starts at the
252 * start of the shared region.
254 irec
->br_blockcount
= fbno
- agbno
;
261 * Trim the passed in imap to the next shared/unshared extent boundary, and
262 * if imap->br_startoff points to a shared extent, reserve space for it in the
263 * COW fork. In this case *shared is set to true, else to false.
265 * Note that imap will always contain the block numbers for the existing blocks
266 * in the data fork, as the upper layers need them for read-modify-write
270 xfs_reflink_reserve_cow(
271 struct xfs_inode
*ip
,
272 struct xfs_bmbt_irec
*imap
,
275 struct xfs_ifork
*ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
276 struct xfs_bmbt_irec got
;
278 bool eof
= false, trimmed
;
282 * Search the COW fork extent list first. This serves two purposes:
283 * first, this implements the speculative preallocation using cowextsize,
284 * so that we also unshare blocks adjacent to shared blocks instead
285 * of just the shared blocks themselves. Second, the lookup in the
286 * extent list is generally faster than going out to the shared extent
290 if (!xfs_iext_lookup_extent(ip
, ifp
, imap
->br_startoff
, &idx
, &got
))
292 if (!eof
&& got
.br_startoff
<= imap
->br_startoff
) {
293 trace_xfs_reflink_cow_found(ip
, imap
);
294 xfs_trim_extent(imap
, got
.br_startoff
, got
.br_blockcount
);
300 /* Trim the mapping to the nearest shared extent boundary. */
301 error
= xfs_reflink_trim_around_shared(ip
, imap
, shared
, &trimmed
);
305 /* Not shared? Just report the (potentially capped) extent. */
310 * Fork all the shared blocks from our write offset until the end of
313 error
= xfs_qm_dqattach_locked(ip
, 0);
317 error
= xfs_bmapi_reserve_delalloc(ip
, XFS_COW_FORK
, imap
->br_startoff
,
318 imap
->br_blockcount
, 0, &got
, &idx
, eof
);
319 if (error
== -ENOSPC
|| error
== -EDQUOT
)
320 trace_xfs_reflink_cow_enospc(ip
, imap
);
324 trace_xfs_reflink_cow_alloc(ip
, &got
);
328 /* Convert part of an unwritten CoW extent to a real one. */
330 xfs_reflink_convert_cow_extent(
331 struct xfs_inode
*ip
,
332 struct xfs_bmbt_irec
*imap
,
333 xfs_fileoff_t offset_fsb
,
334 xfs_filblks_t count_fsb
,
335 struct xfs_defer_ops
*dfops
)
337 struct xfs_bmbt_irec irec
= *imap
;
338 xfs_fsblock_t first_block
= NULLFSBLOCK
;
341 if (imap
->br_state
== XFS_EXT_NORM
)
344 xfs_trim_extent(&irec
, offset_fsb
, count_fsb
);
345 trace_xfs_reflink_convert_cow(ip
, &irec
);
346 if (irec
.br_blockcount
== 0)
348 return xfs_bmapi_write(NULL
, ip
, irec
.br_startoff
, irec
.br_blockcount
,
349 XFS_BMAPI_COWFORK
| XFS_BMAPI_CONVERT
, &first_block
,
350 0, &irec
, &nimaps
, dfops
);
353 /* Convert all of the unwritten CoW extents in a file's range to real ones. */
355 xfs_reflink_convert_cow(
356 struct xfs_inode
*ip
,
360 struct xfs_bmbt_irec got
;
361 struct xfs_defer_ops dfops
;
362 struct xfs_mount
*mp
= ip
->i_mount
;
363 struct xfs_ifork
*ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
364 xfs_fileoff_t offset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
365 xfs_fileoff_t end_fsb
= XFS_B_TO_FSB(mp
, offset
+ count
);
370 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
372 /* Convert all the extents to real from unwritten. */
373 for (found
= xfs_iext_lookup_extent(ip
, ifp
, offset_fsb
, &idx
, &got
);
374 found
&& got
.br_startoff
< end_fsb
;
375 found
= xfs_iext_get_extent(ifp
, ++idx
, &got
)) {
376 error
= xfs_reflink_convert_cow_extent(ip
, &got
, offset_fsb
,
377 end_fsb
- offset_fsb
, &dfops
);
383 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
387 /* Allocate all CoW reservations covering a range of blocks in a file. */
389 __xfs_reflink_allocate_cow(
390 struct xfs_inode
*ip
,
391 xfs_fileoff_t
*offset_fsb
,
392 xfs_fileoff_t end_fsb
)
394 struct xfs_mount
*mp
= ip
->i_mount
;
395 struct xfs_bmbt_irec imap
;
396 struct xfs_defer_ops dfops
;
397 struct xfs_trans
*tp
;
398 xfs_fsblock_t first_block
;
399 int nimaps
= 1, error
;
402 xfs_defer_init(&dfops
, &first_block
);
404 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, 0, 0,
405 XFS_TRANS_RESERVE
, &tp
);
409 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
411 /* Read extent from the source file. */
413 error
= xfs_bmapi_read(ip
, *offset_fsb
, end_fsb
- *offset_fsb
,
419 /* Make sure there's a CoW reservation for it. */
420 error
= xfs_reflink_reserve_cow(ip
, &imap
, &shared
);
422 goto out_trans_cancel
;
425 *offset_fsb
= imap
.br_startoff
+ imap
.br_blockcount
;
426 goto out_trans_cancel
;
429 /* Allocate the entire reservation as unwritten blocks. */
430 xfs_trans_ijoin(tp
, ip
, 0);
431 error
= xfs_bmapi_write(tp
, ip
, imap
.br_startoff
, imap
.br_blockcount
,
432 XFS_BMAPI_COWFORK
| XFS_BMAPI_PREALLOC
, &first_block
,
433 XFS_EXTENTADD_SPACE_RES(mp
, XFS_DATA_FORK
),
434 &imap
, &nimaps
, &dfops
);
436 goto out_trans_cancel
;
439 error
= xfs_defer_finish(&tp
, &dfops
, NULL
);
441 goto out_trans_cancel
;
443 error
= xfs_trans_commit(tp
);
445 *offset_fsb
= imap
.br_startoff
+ imap
.br_blockcount
;
447 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
450 xfs_defer_cancel(&dfops
);
451 xfs_trans_cancel(tp
);
455 /* Allocate all CoW reservations covering a part of a file. */
457 xfs_reflink_allocate_cow_range(
458 struct xfs_inode
*ip
,
462 struct xfs_mount
*mp
= ip
->i_mount
;
463 xfs_fileoff_t offset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
464 xfs_fileoff_t end_fsb
= XFS_B_TO_FSB(mp
, offset
+ count
);
467 ASSERT(xfs_is_reflink_inode(ip
));
469 trace_xfs_reflink_allocate_cow_range(ip
, offset
, count
);
472 * Make sure that the dquots are there.
474 error
= xfs_qm_dqattach(ip
, 0);
478 while (offset_fsb
< end_fsb
) {
479 error
= __xfs_reflink_allocate_cow(ip
, &offset_fsb
, end_fsb
);
481 trace_xfs_reflink_allocate_cow_range_error(ip
, error
,
487 /* Convert the CoW extents to regular. */
488 return xfs_reflink_convert_cow(ip
, offset
, count
);
492 * Find the CoW reservation (and whether or not it needs block allocation)
493 * for a given byte offset of a file.
496 xfs_reflink_find_cow_mapping(
497 struct xfs_inode
*ip
,
499 struct xfs_bmbt_irec
*imap
,
502 struct xfs_bmbt_irec irec
;
503 struct xfs_ifork
*ifp
;
504 struct xfs_bmbt_rec_host
*gotp
;
508 ASSERT(xfs_isilocked(ip
, XFS_ILOCK_EXCL
| XFS_ILOCK_SHARED
));
509 ASSERT(xfs_is_reflink_inode(ip
));
511 /* Find the extent in the CoW fork. */
512 ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
513 bno
= XFS_B_TO_FSBT(ip
->i_mount
, offset
);
514 gotp
= xfs_iext_bno_to_ext(ifp
, bno
, &idx
);
518 xfs_bmbt_get_all(gotp
, &irec
);
519 if (bno
>= irec
.br_startoff
+ irec
.br_blockcount
||
520 bno
< irec
.br_startoff
)
523 trace_xfs_reflink_find_cow_mapping(ip
, offset
, 1, XFS_IO_OVERWRITE
,
526 /* If it's still delalloc, we must allocate later. */
528 *need_alloc
= !!(isnullstartblock(irec
.br_startblock
));
534 * Trim an extent to end at the next CoW reservation past offset_fsb.
537 xfs_reflink_trim_irec_to_next_cow(
538 struct xfs_inode
*ip
,
539 xfs_fileoff_t offset_fsb
,
540 struct xfs_bmbt_irec
*imap
)
542 struct xfs_bmbt_irec irec
;
543 struct xfs_ifork
*ifp
;
544 struct xfs_bmbt_rec_host
*gotp
;
547 if (!xfs_is_reflink_inode(ip
))
550 /* Find the extent in the CoW fork. */
551 ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
552 gotp
= xfs_iext_bno_to_ext(ifp
, offset_fsb
, &idx
);
555 xfs_bmbt_get_all(gotp
, &irec
);
557 /* This is the extent before; try sliding up one. */
558 if (irec
.br_startoff
< offset_fsb
) {
560 if (idx
>= xfs_iext_count(ifp
))
562 gotp
= xfs_iext_get_ext(ifp
, idx
);
563 xfs_bmbt_get_all(gotp
, &irec
);
566 if (irec
.br_startoff
>= imap
->br_startoff
+ imap
->br_blockcount
)
569 imap
->br_blockcount
= irec
.br_startoff
- imap
->br_startoff
;
570 trace_xfs_reflink_trim_irec(ip
, imap
);
576 * Cancel CoW reservations for some block range of an inode.
578 * If cancel_real is true this function cancels all COW fork extents for the
579 * inode; if cancel_real is false, real extents are not cleared.
582 xfs_reflink_cancel_cow_blocks(
583 struct xfs_inode
*ip
,
584 struct xfs_trans
**tpp
,
585 xfs_fileoff_t offset_fsb
,
586 xfs_fileoff_t end_fsb
,
589 struct xfs_ifork
*ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
590 struct xfs_bmbt_irec got
, prev
, del
;
592 xfs_fsblock_t firstfsb
;
593 struct xfs_defer_ops dfops
;
594 int error
= 0, eof
= 0;
596 if (!xfs_is_reflink_inode(ip
))
599 xfs_bmap_search_extents(ip
, offset_fsb
, XFS_COW_FORK
, &eof
, &idx
,
604 while (got
.br_startoff
< end_fsb
) {
606 xfs_trim_extent(&del
, offset_fsb
, end_fsb
- offset_fsb
);
607 trace_xfs_reflink_cancel_cow(ip
, &del
);
609 if (isnullstartblock(del
.br_startblock
)) {
610 error
= xfs_bmap_del_extent_delay(ip
, XFS_COW_FORK
,
614 } else if (del
.br_state
== XFS_EXT_UNWRITTEN
|| cancel_real
) {
615 xfs_trans_ijoin(*tpp
, ip
, 0);
616 xfs_defer_init(&dfops
, &firstfsb
);
618 /* Free the CoW orphan record. */
619 error
= xfs_refcount_free_cow_extent(ip
->i_mount
,
620 &dfops
, del
.br_startblock
,
625 xfs_bmap_add_free(ip
->i_mount
, &dfops
,
626 del
.br_startblock
, del
.br_blockcount
,
629 /* Update quota accounting */
630 xfs_trans_mod_dquot_byino(*tpp
, ip
, XFS_TRANS_DQ_BCOUNT
,
631 -(long)del
.br_blockcount
);
633 /* Roll the transaction */
634 error
= xfs_defer_finish(tpp
, &dfops
, ip
);
636 xfs_defer_cancel(&dfops
);
640 /* Remove the mapping from the CoW fork. */
641 xfs_bmap_del_extent_cow(ip
, &idx
, &got
, &del
);
644 if (++idx
>= xfs_iext_count(ifp
))
646 xfs_bmbt_get_all(xfs_iext_get_ext(ifp
, idx
), &got
);
649 /* clear tag if cow fork is emptied */
651 xfs_inode_clear_cowblocks_tag(ip
);
657 * Cancel CoW reservations for some byte range of an inode.
659 * If cancel_real is true this function cancels all COW fork extents for the
660 * inode; if cancel_real is false, real extents are not cleared.
663 xfs_reflink_cancel_cow_range(
664 struct xfs_inode
*ip
,
669 struct xfs_trans
*tp
;
670 xfs_fileoff_t offset_fsb
;
671 xfs_fileoff_t end_fsb
;
674 trace_xfs_reflink_cancel_cow_range(ip
, offset
, count
);
675 ASSERT(xfs_is_reflink_inode(ip
));
677 offset_fsb
= XFS_B_TO_FSBT(ip
->i_mount
, offset
);
678 if (count
== NULLFILEOFF
)
679 end_fsb
= NULLFILEOFF
;
681 end_fsb
= XFS_B_TO_FSB(ip
->i_mount
, offset
+ count
);
683 /* Start a rolling transaction to remove the mappings */
684 error
= xfs_trans_alloc(ip
->i_mount
, &M_RES(ip
->i_mount
)->tr_write
,
689 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
690 xfs_trans_ijoin(tp
, ip
, 0);
692 /* Scrape out the old CoW reservations */
693 error
= xfs_reflink_cancel_cow_blocks(ip
, &tp
, offset_fsb
, end_fsb
,
698 error
= xfs_trans_commit(tp
);
700 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
704 xfs_trans_cancel(tp
);
705 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
707 trace_xfs_reflink_cancel_cow_range_error(ip
, error
, _RET_IP_
);
712 * Remap parts of a file's data fork after a successful CoW.
716 struct xfs_inode
*ip
,
720 struct xfs_ifork
*ifp
= XFS_IFORK_PTR(ip
, XFS_COW_FORK
);
721 struct xfs_bmbt_irec got
, prev
, del
;
722 struct xfs_trans
*tp
;
723 xfs_fileoff_t offset_fsb
;
724 xfs_fileoff_t end_fsb
;
725 xfs_fsblock_t firstfsb
;
726 struct xfs_defer_ops dfops
;
728 unsigned int resblks
;
732 trace_xfs_reflink_end_cow(ip
, offset
, count
);
734 /* No COW extents? That's easy! */
735 if (ifp
->if_bytes
== 0)
738 offset_fsb
= XFS_B_TO_FSBT(ip
->i_mount
, offset
);
739 end_fsb
= XFS_B_TO_FSB(ip
->i_mount
, offset
+ count
);
742 * Start a rolling transaction to switch the mappings. We're
743 * unlikely ever to have to remap 16T worth of single-block
744 * extents, so just cap the worst case extent count to 2^32-1.
745 * Stick a warning in just in case, and avoid 64-bit division.
747 BUILD_BUG_ON(MAX_RW_COUNT
> UINT_MAX
);
748 if (end_fsb
- offset_fsb
> UINT_MAX
) {
749 error
= -EFSCORRUPTED
;
750 xfs_force_shutdown(ip
->i_mount
, SHUTDOWN_CORRUPT_INCORE
);
754 resblks
= XFS_NEXTENTADD_SPACE_RES(ip
->i_mount
,
755 (unsigned int)(end_fsb
- offset_fsb
),
757 error
= xfs_trans_alloc(ip
->i_mount
, &M_RES(ip
->i_mount
)->tr_write
,
762 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
763 xfs_trans_ijoin(tp
, ip
, 0);
765 xfs_bmap_search_extents(ip
, end_fsb
- 1, XFS_COW_FORK
, &eof
, &idx
,
768 /* If there is a hole at end_fsb - 1 go to the previous extent */
769 if (eof
|| got
.br_startoff
> end_fsb
) {
771 * If overlapping AIO writes race with each other, no COW extents
772 * might be left by the time I/O completes for the loser of
773 * the race. In that case we are done.
777 xfs_bmbt_get_all(xfs_iext_get_ext(ifp
, --idx
), &got
);
780 /* Walk backwards until we're out of the I/O range... */
781 while (got
.br_startoff
+ got
.br_blockcount
> offset_fsb
) {
783 xfs_trim_extent(&del
, offset_fsb
, end_fsb
- offset_fsb
);
785 /* Extent delete may have bumped idx forward */
786 if (!del
.br_blockcount
) {
791 ASSERT(!isnullstartblock(got
.br_startblock
));
794 * Don't remap unwritten extents; these are
795 * speculatively preallocated CoW extents that have been
796 * allocated but have not yet been involved in a write.
798 if (got
.br_state
== XFS_EXT_UNWRITTEN
) {
803 /* Unmap the old blocks in the data fork. */
804 xfs_defer_init(&dfops
, &firstfsb
);
805 rlen
= del
.br_blockcount
;
806 error
= __xfs_bunmapi(tp
, ip
, del
.br_startoff
, &rlen
, 0, 1,
811 /* Trim the extent to whatever got unmapped. */
813 xfs_trim_extent(&del
, del
.br_startoff
+ rlen
,
814 del
.br_blockcount
- rlen
);
816 trace_xfs_reflink_cow_remap(ip
, &del
);
818 /* Free the CoW orphan record. */
819 error
= xfs_refcount_free_cow_extent(tp
->t_mountp
, &dfops
,
820 del
.br_startblock
, del
.br_blockcount
);
824 /* Map the new blocks into the data fork. */
825 error
= xfs_bmap_map_extent(tp
->t_mountp
, &dfops
, ip
, &del
);
829 /* Remove the mapping from the CoW fork. */
830 xfs_bmap_del_extent_cow(ip
, &idx
, &got
, &del
);
832 error
= xfs_defer_finish(&tp
, &dfops
, ip
);
839 xfs_bmbt_get_all(xfs_iext_get_ext(ifp
, idx
), &got
);
842 error
= xfs_trans_commit(tp
);
843 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
849 xfs_defer_cancel(&dfops
);
851 xfs_trans_cancel(tp
);
852 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
854 trace_xfs_reflink_end_cow_error(ip
, error
, _RET_IP_
);
859 * Free leftover CoW reservations that didn't get cleaned out.
862 xfs_reflink_recover_cow(
863 struct xfs_mount
*mp
)
868 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
871 for (agno
= 0; agno
< mp
->m_sb
.sb_agcount
; agno
++) {
872 error
= xfs_refcount_recover_cow_leftovers(mp
, agno
);
881 * Reflinking (Block) Ranges of Two Files Together
883 * First, ensure that the reflink flag is set on both inodes. The flag is an
884 * optimization to avoid unnecessary refcount btree lookups in the write path.
886 * Now we can iteratively remap the range of extents (and holes) in src to the
887 * corresponding ranges in dest. Let drange and srange denote the ranges of
888 * logical blocks in dest and src touched by the reflink operation.
890 * While the length of drange is greater than zero,
891 * - Read src's bmbt at the start of srange ("imap")
892 * - If imap doesn't exist, make imap appear to start at the end of srange
894 * - If imap starts before srange, advance imap to start at srange.
895 * - If imap goes beyond srange, truncate imap to end at the end of srange.
896 * - Punch (imap start - srange start + imap len) blocks from dest at
897 * offset (drange start).
898 * - If imap points to a real range of pblks,
899 * > Increase the refcount of the imap's pblks
900 * > Map imap's pblks into dest at the offset
901 * (drange start + imap start - srange start)
902 * - Advance drange and srange by (imap start - srange start + imap len)
904 * Finally, if the reflink made dest longer, update both the in-core and
905 * on-disk file sizes.
907 * ASCII Art Demonstration:
909 * Let's say we want to reflink this source file:
911 * ----SSSSSSS-SSSSS----SSSSSS (src file)
912 * <-------------------->
914 * into this destination file:
916 * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
917 * <-------------------->
918 * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
919 * Observe that the range has different logical offsets in either file.
921 * Consider that the first extent in the source file doesn't line up with our
922 * reflink range. Unmapping and remapping are separate operations, so we can
923 * unmap more blocks from the destination file than we remap.
925 * ----SSSSSSS-SSSSS----SSSSSS
927 * --DDDDD---------DDDDD--DDD
930 * Now remap the source extent into the destination file:
932 * ----SSSSSSS-SSSSS----SSSSSS
934 * --DDDDD--SSSSSSSDDDDD--DDD
937 * Do likewise with the second hole and extent in our range. Holes in the
938 * unmap range don't affect our operation.
940 * ----SSSSSSS-SSSSS----SSSSSS
942 * --DDDDD--SSSSSSS-SSSSS-DDD
945 * Finally, unmap and remap part of the third extent. This will increase the
946 * size of the destination file.
948 * ----SSSSSSS-SSSSS----SSSSSS
950 * --DDDDD--SSSSSSS-SSSSS----SSS
953 * Once we update the destination file's i_size, we're done.
957 * Ensure the reflink bit is set in both inodes.
960 xfs_reflink_set_inode_flag(
961 struct xfs_inode
*src
,
962 struct xfs_inode
*dest
)
964 struct xfs_mount
*mp
= src
->i_mount
;
966 struct xfs_trans
*tp
;
968 if (xfs_is_reflink_inode(src
) && xfs_is_reflink_inode(dest
))
971 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ichange
, 0, 0, 0, &tp
);
975 /* Lock both files against IO */
976 if (src
->i_ino
== dest
->i_ino
)
977 xfs_ilock(src
, XFS_ILOCK_EXCL
);
979 xfs_lock_two_inodes(src
, dest
, XFS_ILOCK_EXCL
);
981 if (!xfs_is_reflink_inode(src
)) {
982 trace_xfs_reflink_set_inode_flag(src
);
983 xfs_trans_ijoin(tp
, src
, XFS_ILOCK_EXCL
);
984 src
->i_d
.di_flags2
|= XFS_DIFLAG2_REFLINK
;
985 xfs_trans_log_inode(tp
, src
, XFS_ILOG_CORE
);
986 xfs_ifork_init_cow(src
);
988 xfs_iunlock(src
, XFS_ILOCK_EXCL
);
990 if (src
->i_ino
== dest
->i_ino
)
993 if (!xfs_is_reflink_inode(dest
)) {
994 trace_xfs_reflink_set_inode_flag(dest
);
995 xfs_trans_ijoin(tp
, dest
, XFS_ILOCK_EXCL
);
996 dest
->i_d
.di_flags2
|= XFS_DIFLAG2_REFLINK
;
997 xfs_trans_log_inode(tp
, dest
, XFS_ILOG_CORE
);
998 xfs_ifork_init_cow(dest
);
1000 xfs_iunlock(dest
, XFS_ILOCK_EXCL
);
1003 error
= xfs_trans_commit(tp
);
1009 trace_xfs_reflink_set_inode_flag_error(dest
, error
, _RET_IP_
);
1014 * Update destination inode size & cowextsize hint, if necessary.
1017 xfs_reflink_update_dest(
1018 struct xfs_inode
*dest
,
1020 xfs_extlen_t cowextsize
,
1023 struct xfs_mount
*mp
= dest
->i_mount
;
1024 struct xfs_trans
*tp
;
1027 if (is_dedupe
&& newlen
<= i_size_read(VFS_I(dest
)) && cowextsize
== 0)
1030 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ichange
, 0, 0, 0, &tp
);
1034 xfs_ilock(dest
, XFS_ILOCK_EXCL
);
1035 xfs_trans_ijoin(tp
, dest
, XFS_ILOCK_EXCL
);
1037 if (newlen
> i_size_read(VFS_I(dest
))) {
1038 trace_xfs_reflink_update_inode_size(dest
, newlen
);
1039 i_size_write(VFS_I(dest
), newlen
);
1040 dest
->i_d
.di_size
= newlen
;
1044 dest
->i_d
.di_cowextsize
= cowextsize
;
1045 dest
->i_d
.di_flags2
|= XFS_DIFLAG2_COWEXTSIZE
;
1049 xfs_trans_ichgtime(tp
, dest
,
1050 XFS_ICHGTIME_MOD
| XFS_ICHGTIME_CHG
);
1052 xfs_trans_log_inode(tp
, dest
, XFS_ILOG_CORE
);
1054 error
= xfs_trans_commit(tp
);
1060 trace_xfs_reflink_update_inode_size_error(dest
, error
, _RET_IP_
);
1065 * Do we have enough reserve in this AG to handle a reflink? The refcount
1066 * btree already reserved all the space it needs, but the rmap btree can grow
1067 * infinitely, so we won't allow more reflinks when the AG is down to the
1071 xfs_reflink_ag_has_free_space(
1072 struct xfs_mount
*mp
,
1073 xfs_agnumber_t agno
)
1075 struct xfs_perag
*pag
;
1078 if (!xfs_sb_version_hasrmapbt(&mp
->m_sb
))
1081 pag
= xfs_perag_get(mp
, agno
);
1082 if (xfs_ag_resv_critical(pag
, XFS_AG_RESV_AGFL
) ||
1083 xfs_ag_resv_critical(pag
, XFS_AG_RESV_METADATA
))
1090 * Unmap a range of blocks from a file, then map other blocks into the hole.
1091 * The range to unmap is (destoff : irec->br_startoff + irec->br_blockcount).
1092 * The extent irec is mapped into dest at irec->br_startoff.
1095 xfs_reflink_remap_extent(
1096 struct xfs_inode
*ip
,
1097 struct xfs_bmbt_irec
*irec
,
1098 xfs_fileoff_t destoff
,
1099 xfs_off_t new_isize
)
1101 struct xfs_mount
*mp
= ip
->i_mount
;
1102 struct xfs_trans
*tp
;
1103 xfs_fsblock_t firstfsb
;
1104 unsigned int resblks
;
1105 struct xfs_defer_ops dfops
;
1106 struct xfs_bmbt_irec uirec
;
1109 xfs_filblks_t unmap_len
;
1113 unmap_len
= irec
->br_startoff
+ irec
->br_blockcount
- destoff
;
1114 trace_xfs_reflink_punch_range(ip
, destoff
, unmap_len
);
1116 /* Only remap normal extents. */
1117 real_extent
= (irec
->br_startblock
!= HOLESTARTBLOCK
&&
1118 irec
->br_startblock
!= DELAYSTARTBLOCK
&&
1119 !ISUNWRITTEN(irec
));
1121 /* No reflinking if we're low on space */
1123 error
= xfs_reflink_ag_has_free_space(mp
,
1124 XFS_FSB_TO_AGNO(mp
, irec
->br_startblock
));
1129 /* Start a rolling transaction to switch the mappings */
1130 resblks
= XFS_EXTENTADD_SPACE_RES(ip
->i_mount
, XFS_DATA_FORK
);
1131 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, resblks
, 0, 0, &tp
);
1135 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1136 xfs_trans_ijoin(tp
, ip
, 0);
1138 /* If we're not just clearing space, then do we have enough quota? */
1140 error
= xfs_trans_reserve_quota_nblks(tp
, ip
,
1141 irec
->br_blockcount
, 0, XFS_QMOPT_RES_REGBLKS
);
1146 trace_xfs_reflink_remap(ip
, irec
->br_startoff
,
1147 irec
->br_blockcount
, irec
->br_startblock
);
1149 /* Unmap the old blocks in the data fork. */
1152 xfs_defer_init(&dfops
, &firstfsb
);
1153 error
= __xfs_bunmapi(tp
, ip
, destoff
, &rlen
, 0, 1,
1159 * Trim the extent to whatever got unmapped.
1160 * Remember, bunmapi works backwards.
1162 uirec
.br_startblock
= irec
->br_startblock
+ rlen
;
1163 uirec
.br_startoff
= irec
->br_startoff
+ rlen
;
1164 uirec
.br_blockcount
= unmap_len
- rlen
;
1165 uirec
.br_state
= irec
->br_state
;
1168 /* If this isn't a real mapping, we're done. */
1169 if (!real_extent
|| uirec
.br_blockcount
== 0)
1172 trace_xfs_reflink_remap(ip
, uirec
.br_startoff
,
1173 uirec
.br_blockcount
, uirec
.br_startblock
);
1175 /* Update the refcount tree */
1176 error
= xfs_refcount_increase_extent(mp
, &dfops
, &uirec
);
1180 /* Map the new blocks into the data fork. */
1181 error
= xfs_bmap_map_extent(mp
, &dfops
, ip
, &uirec
);
1185 /* Update quota accounting. */
1186 xfs_trans_mod_dquot_byino(tp
, ip
, XFS_TRANS_DQ_BCOUNT
,
1187 uirec
.br_blockcount
);
1189 /* Update dest isize if needed. */
1190 newlen
= XFS_FSB_TO_B(mp
,
1191 uirec
.br_startoff
+ uirec
.br_blockcount
);
1192 newlen
= min_t(xfs_off_t
, newlen
, new_isize
);
1193 if (newlen
> i_size_read(VFS_I(ip
))) {
1194 trace_xfs_reflink_update_inode_size(ip
, newlen
);
1195 i_size_write(VFS_I(ip
), newlen
);
1196 ip
->i_d
.di_size
= newlen
;
1197 xfs_trans_log_inode(tp
, ip
, XFS_ILOG_CORE
);
1201 /* Process all the deferred stuff. */
1202 error
= xfs_defer_finish(&tp
, &dfops
, ip
);
1207 error
= xfs_trans_commit(tp
);
1208 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1214 xfs_defer_cancel(&dfops
);
1216 xfs_trans_cancel(tp
);
1217 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1219 trace_xfs_reflink_remap_extent_error(ip
, error
, _RET_IP_
);
1224 * Iteratively remap one file's extents (and holes) to another's.
/*
 * Remap file blocks from src (starting at file block srcoff) into dest
 * (starting at file block destoff).
 *
 * Each visible step reads one mapping from the source with xfs_bmapi_read()
 * while holding the source ILOCK, shifts the start offset of that mapping by
 * (destoff - srcoff) so that it describes the destination range, and passes
 * it to xfs_reflink_remap_extent() for dest. Both offsets then advance by
 * range_len.
 *
 * NOTE(review): the embedded line numbers have gaps, so some lines of this
 * function (one parameter, some locals, the loop header, the error checks
 * and the return statements) are not visible in this listing.
 */
1227 xfs_reflink_remap_blocks(
1228 struct xfs_inode
*src
,
1229 xfs_fileoff_t srcoff
,
1230 struct xfs_inode
*dest
,
1231 xfs_fileoff_t destoff
,
1233 xfs_off_t new_isize
)
1235 struct xfs_bmbt_irec imap
;
1238 xfs_filblks_t range_len
;
1240 /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
1242 trace_xfs_reflink_remap_blocks_loop(src
, srcoff
, len
,
1244 /* Read extent from the source file */
/* The source ILOCK is held only for the duration of the mapping lookup. */
1246 xfs_ilock(src
, XFS_ILOCK_EXCL
);
1247 error
= xfs_bmapi_read(src
, srcoff
, len
, &imap
, &nimaps
, 0);
1248 xfs_iunlock(src
, XFS_ILOCK_EXCL
);
1251 ASSERT(nimaps
== 1);
1253 trace_xfs_reflink_remap_imap(src
, srcoff
, len
, XFS_IO_OVERWRITE
,
1256 /* Translate imap into the destination file. */
/*
 * range_len is the distance from srcoff to the end of the mapping that was
 * just read; it is the amount by which both offsets advance further down.
 */
1257 range_len
= imap
.br_startoff
+ imap
.br_blockcount
- srcoff
;
1258 imap
.br_startoff
+= destoff
- srcoff
;
1260 /* Clear dest from destoff to the end of imap and map it in. */
1261 error
= xfs_reflink_remap_extent(dest
, &imap
, destoff
,
/*
 * A pending fatal signal is tested after the extent has been remapped; the
 * statements inside the branch are not visible in this listing.
 */
1266 if (fatal_signal_pending(current
)) {
1271 /* Advance drange/srange */
1272 srcoff
+= range_len
;
1273 destoff
+= range_len
;
/* Error tracepoint, reported against the destination inode. */
1280 trace_xfs_reflink_remap_blocks_error(dest
, error
, _RET_IP_
);
1285 * Read a page's worth of file data into the page cache. Return the page
/*
 * Page cache read helper that returns a struct page pointer.
 *
 * NOTE(review): the line carrying the function name is missing from this
 * listing; presumably this is xfs_get_page(), which xfs_compare_extents()
 * calls with an inode and a byte offset and checks with IS_ERR() - confirm.
 *
 * Visible behaviour: the byte offset is converted to a page index, the page
 * is read through the address_space of the inode with read_mapping_page(),
 * and ERR_PTR(-EIO) is returned on the path guarded by the PageUptodate()
 * test. Other lines (the offset parameter, remaining locals, further error
 * handling and the success return) are not visible here.
 */
1288 static struct page
*
1290 struct inode
*inode
,
1293 struct address_space
*mapping
;
/* Byte offset to page cache index. */
1297 n
= offset
>> PAGE_SHIFT
;
1298 mapping
= inode
->i_mapping
;
1299 page
= read_mapping_page(mapping
, n
, NULL
);
/* A page that is not up to date is reported to the caller as -EIO. */
1302 if (!PageUptodate(page
)) {
1304 return ERR_PTR(-EIO
);
1311 * Compare extents of two files to see if they are the same.
/*
 * Compare the contents of a byte range in src against a byte range in dest.
 *
 * Each visible comparison step works on a piece that fits inside a single
 * page of both files: the two pages are obtained with xfs_get_page(), mapped
 * with kmap_atomic(), and the bytes at the in-page offsets are compared with
 * memcmp().
 *
 * NOTE(review): the embedded line numbers have gaps, so the parameter list,
 * several locals, the loop header, the result handling after memcmp() and
 * the return statements are not visible in this listing.
 */
1314 xfs_compare_extents(
1323 xfs_off_t dest_poff
;
1326 struct page
*src_page
;
1327 struct page
*dest_page
;
/* Offsets of the two file positions within their respective pages. */
1335 src_poff
= srcoff
& (PAGE_SIZE
- 1);
1336 dest_poff
= destoff
& (PAGE_SIZE
- 1);
/*
 * Limit the piece to the bytes remaining in whichever page ends first, and
 * then to the number of bytes still to be compared.
 */
1337 cmp_len
= min(PAGE_SIZE
- src_poff
,
1338 PAGE_SIZE
- dest_poff
);
1339 cmp_len
= min(cmp_len
, len
);
1340 ASSERT(cmp_len
> 0);
1342 trace_xfs_reflink_compare_extents(XFS_I(src
), srcoff
, cmp_len
,
1343 XFS_I(dest
), destoff
);
1345 src_page
= xfs_get_page(src
, srcoff
);
1346 if (IS_ERR(src_page
)) {
1347 error
= PTR_ERR(src_page
);
1350 dest_page
= xfs_get_page(dest
, destoff
);
1351 if (IS_ERR(dest_page
)) {
1352 error
= PTR_ERR(dest_page
);
/* The source page was already obtained, so it is unlocked on this path. */
1353 unlock_page(src_page
);
1357 src_addr
= kmap_atomic(src_page
);
1358 dest_addr
= kmap_atomic(dest_page
);
1360 flush_dcache_page(src_page
);
1361 flush_dcache_page(dest_page
);
1363 if (memcmp(src_addr
+ src_poff
, dest_addr
+ dest_poff
, cmp_len
))
/* Unmap and unlock in the reverse of the order used to map the pages. */
1366 kunmap_atomic(dest_addr
);
1367 kunmap_atomic(src_addr
);
1368 unlock_page(dest_page
);
1369 unlock_page(src_page
);
1370 put_page(dest_page
);
/* Error tracepoint, reported against the destination inode. */
1385 trace_xfs_reflink_compare_extents_error(XFS_I(dest
), error
, _RET_IP_
);
1390 * Link a range of blocks from one file to another.
/*
 * Link a byte range of file_in (starting at pos_in) to file_out (starting at
 * pos_out).
 *
 * Visible steps, in order:
 *   - test for reflink support in the superblock and for a forced shutdown;
 *   - take IOLOCK and MMAPLOCK exclusively, either on src alone or on the
 *     src/dest pair through xfs_lock_two_inodes();
 *   - test for inode kinds that are not handled (immutable destination, swap
 *     files, directories, FIFOs, anything that is not a regular file,
 *     realtime inodes, DAX inodes);
 *   - validate the range against wraparound, source EOF, destination EOF,
 *     block alignment and overlap;
 *   - wait for direct I/O and write back both ranges;
 *   - compare the two ranges with xfs_compare_extents();
 *   - set the reflink inode flag, drop the destination page cache, remap the
 *     blocks with xfs_reflink_remap_blocks() and update the destination with
 *     xfs_reflink_update_dest();
 *   - release the locks, unlocking dest separately only when its inode
 *     number differs from that of src.
 *
 * NOTE(review): the embedded line numbers have gaps. The remaining
 * parameters, several locals, the conditions that select between some of the
 * steps above (for example which locking form is used and when the compare
 * runs), the outcome of each failed test and the return statements are not
 * visible in this listing.
 */
1393 xfs_reflink_remap_range(
1394 struct file
*file_in
,
1396 struct file
*file_out
,
1401 struct inode
*inode_in
= file_inode(file_in
);
1402 struct xfs_inode
*src
= XFS_I(inode_in
);
1403 struct inode
*inode_out
= file_inode(file_out
);
1404 struct xfs_inode
*dest
= XFS_I(inode_out
);
1405 struct xfs_mount
*mp
= src
->i_mount
;
/* bs is the block size of the superblock that owns the destination inode. */
1406 loff_t bs
= inode_out
->i_sb
->s_blocksize
;
1407 bool same_inode
= (inode_in
== inode_out
);
1408 xfs_fileoff_t sfsbno
, dfsbno
;
1409 xfs_filblks_t fsblen
;
1410 xfs_extlen_t cowextsize
;
1415 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
1418 if (XFS_FORCED_SHUTDOWN(mp
))
1421 /* Lock both files against IO */
/*
 * Two locking forms follow: src alone, and the src/dest pair. Presumably the
 * choice depends on same_inode; the branch lines are not visible here.
 */
1423 xfs_ilock(src
, XFS_IOLOCK_EXCL
);
1424 xfs_ilock(src
, XFS_MMAPLOCK_EXCL
);
1426 xfs_lock_two_inodes(src
, dest
, XFS_IOLOCK_EXCL
);
1427 xfs_lock_two_inodes(src
, dest
, XFS_MMAPLOCK_EXCL
);
1430 /* Don't touch certain kinds of inodes */
1432 if (IS_IMMUTABLE(inode_out
))
1436 if (IS_SWAPFILE(inode_in
) || IS_SWAPFILE(inode_out
))
1440 /* Don't reflink dirs, pipes, sockets... */
1442 if (S_ISDIR(inode_in
->i_mode
) || S_ISDIR(inode_out
->i_mode
))
1445 if (S_ISFIFO(inode_in
->i_mode
) || S_ISFIFO(inode_out
->i_mode
))
1447 if (!S_ISREG(inode_in
->i_mode
) || !S_ISREG(inode_out
->i_mode
))
1450 /* Don't reflink realtime inodes */
1451 if (XFS_IS_REALTIME_INODE(src
) || XFS_IS_REALTIME_INODE(dest
))
1454 /* Don't share DAX file data for now. */
1455 if (IS_DAX(inode_in
) || IS_DAX(inode_out
))
1458 /* Are we going all the way to the end? */
1459 isize
= i_size_read(inode_in
);
1465 /* Zero length dedupe exits immediately; reflink goes to EOF. */
1471 len
= isize
- pos_in
;
1474 /* Ensure offsets don't wrap and the input is inside i_size */
1475 if (pos_in
+ len
< pos_in
|| pos_out
+ len
< pos_out
||
1476 pos_in
+ len
> isize
)
1479 /* Don't allow dedupe past EOF in the dest file */
1483 disize
= i_size_read(inode_out
);
1484 if (pos_out
>= disize
|| pos_out
+ len
> disize
)
1488 /* If we're linking to EOF, continue to the block boundary. */
/*
 * blen is the length used by the alignment and overlap tests below. When the
 * request ends exactly at the source EOF it is computed from isize rounded up
 * to a multiple of bs.
 */
1489 if (pos_in
+ len
== isize
)
1490 blen
= ALIGN(isize
, bs
) - pos_in
;
1494 /* Only reflink if we're aligned to block boundaries */
1495 if (!IS_ALIGNED(pos_in
, bs
) || !IS_ALIGNED(pos_in
+ blen
, bs
) ||
1496 !IS_ALIGNED(pos_out
, bs
) || !IS_ALIGNED(pos_out
+ blen
, bs
))
1499 /* Don't allow overlapped reflink within the same file */
1501 if (pos_out
+ blen
> pos_in
&& pos_out
< pos_in
+ blen
)
1505 /* Wait for the completion of any pending IOs on both files */
1506 inode_dio_wait(inode_in
);
1508 inode_dio_wait(inode_out
);
1510 ret
= filemap_write_and_wait_range(inode_in
->i_mapping
,
1511 pos_in
, pos_in
+ len
- 1);
1515 ret
= filemap_write_and_wait_range(inode_out
->i_mapping
,
1516 pos_out
, pos_out
+ len
- 1);
1520 trace_xfs_reflink_remap_range(src
, pos_in
, len
, dest
, pos_out
);
1523 * Check that the extents are the same.
1526 bool is_same
= false;
1528 ret
= xfs_compare_extents(inode_in
, pos_in
, inode_out
, pos_out
,
1538 ret
= xfs_reflink_set_inode_flag(src
, dest
);
1543 * Invalidate the page cache so that we can clear any CoW mappings
1544 * in the destination file.
1546 truncate_inode_pages_range(&inode_out
->i_data
, pos_out
,
1547 PAGE_ALIGN(pos_out
+ len
) - 1);
/*
 * Convert the byte positions and length to filesystem block units: the two
 * start offsets with XFS_B_TO_FSBT() and the length with XFS_B_TO_FSB().
 */
1549 dfsbno
= XFS_B_TO_FSBT(mp
, pos_out
);
1550 sfsbno
= XFS_B_TO_FSBT(mp
, pos_in
);
1551 fsblen
= XFS_B_TO_FSB(mp
, len
);
1552 ret
= xfs_reflink_remap_blocks(src
, sfsbno
, dest
, dfsbno
, fsblen
,
1558 * Carry the cowextsize hint from src to dest if we're sharing the
1559 * entire source file to the entire destination file, the source file
1560 * has a cowextsize hint, and the destination file does not.
1563 if (pos_in
== 0 && len
== i_size_read(inode_in
) &&
1564 (src
->i_d
.di_flags2
& XFS_DIFLAG2_COWEXTSIZE
) &&
1565 pos_out
== 0 && len
>= i_size_read(inode_out
) &&
1566 !(dest
->i_d
.di_flags2
& XFS_DIFLAG2_COWEXTSIZE
))
1567 cowextsize
= src
->i_d
.di_cowextsize
;
1569 ret
= xfs_reflink_update_dest(dest
, pos_out
+ len
, cowextsize
,
1573 xfs_iunlock(src
, XFS_MMAPLOCK_EXCL
);
1574 xfs_iunlock(src
, XFS_IOLOCK_EXCL
);
1575 if (src
->i_ino
!= dest
->i_ino
) {
1576 xfs_iunlock(dest
, XFS_MMAPLOCK_EXCL
);
1577 xfs_iunlock(dest
, XFS_IOLOCK_EXCL
);
1580 trace_xfs_reflink_remap_range_error(dest
, ret
, _RET_IP_
);
1585 * The user wants to preemptively CoW all shared blocks in this file,
1586 * which enables us to turn off the reflink flag. Iterate all
1587 * extents which are not prealloc/delalloc to see which ranges are
1588 * mentioned in the refcount tree, then read those blocks into the
1589 * pagecache, dirty them, fsync them back out, and then we can update
1590 * the inode flag. What happens if we run out of memory? :)
/*
 * Walk the mappings of ip between file blocks fbno and end, find the parts
 * that xfs_reflink_find_shared() reports as shared, and dirty the matching
 * byte ranges with iomap_file_dirty().
 *
 * The ILOCK is released around each iomap_file_dirty() call and taken again
 * afterwards, so the visible code expects to be entered with XFS_ILOCK_EXCL
 * held on ip.
 *
 * NOTE(review): the embedded line numbers have gaps. The remaining
 * parameters, several locals, the error checks, the statements that skip a
 * mapping or leave the inner loop, and the return are not visible in this
 * listing.
 */
1593 xfs_reflink_dirty_extents(
1594 struct xfs_inode
*ip
,
1599 struct xfs_mount
*mp
= ip
->i_mount
;
1600 xfs_agnumber_t agno
;
1601 xfs_agblock_t agbno
;
1607 struct xfs_bmbt_irec map
[2];
1611 while (end
- fbno
> 0) {
1614 * Look for extents in the file. Skip holes, delalloc, or
1615 * unwritten extents; they can't be reflinked.
1617 error
= xfs_bmapi_read(ip
, fbno
, end
- fbno
, map
, &nmaps
, 0);
1622 if (map
[0].br_startblock
== HOLESTARTBLOCK
||
1623 map
[0].br_startblock
== DELAYSTARTBLOCK
||
1624 ISUNWRITTEN(&map
[0]))
/*
 * map[1] is consumed piece by piece by this inner loop while map[0] keeps
 * the full mapping for the fbno update at the bottom of the outer loop.
 * NOTE(review): the initialisation of map[1] is not visible in this listing;
 * presumably it is a copy of map[0] - confirm.
 */
1628 while (map
[1].br_blockcount
) {
1629 agno
= XFS_FSB_TO_AGNO(mp
, map
[1].br_startblock
);
1630 agbno
= XFS_FSB_TO_AGBNO(mp
, map
[1].br_startblock
);
1631 aglen
= map
[1].br_blockcount
;
1633 error
= xfs_reflink_find_shared(mp
, agno
, agbno
, aglen
,
1634 &rbno
, &rlen
, true);
1637 if (rbno
== NULLAGBLOCK
)
1640 /* Dirty the pages */
/* The ILOCK is dropped here and retaken after iomap_file_dirty(). */
1641 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1642 fpos
= XFS_FSB_TO_B(mp
, map
[1].br_startoff
+
1644 flen
= XFS_FSB_TO_B(mp
, rlen
);
/* Clamp the byte range so that it does not extend beyond isize. */
1645 if (fpos
+ flen
> isize
)
1646 flen
= isize
- fpos
;
1647 error
= iomap_file_dirty(VFS_I(ip
), fpos
, flen
,
1649 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
/*
 * Advance map[1] past everything examined in this pass: the blocks in front
 * of rbno plus the rlen blocks starting at rbno.
 */
1653 map
[1].br_blockcount
-= (rbno
- agbno
+ rlen
);
1654 map
[1].br_startoff
+= (rbno
- agbno
+ rlen
);
1655 map
[1].br_startblock
+= (rbno
- agbno
+ rlen
);
/* Continue the outer walk at the first file block after map[0]. */
1659 fbno
= map
[0].br_startoff
+ map
[0].br_blockcount
;
1665 /* Clear the inode reflink flag if there are no shared extents. */
/*
 * Scan the mappings of ip up to the block count derived from its i_size and
 * ask xfs_reflink_find_shared() about each one. The tail of the function
 * cancels the CoW blocks of the inode, clears XFS_DIFLAG2_REFLINK in
 * di_flags2, clears the cowblocks tag and logs the inode core in the
 * transaction referenced by tpp.
 *
 * ip must already carry the reflink flag (see the ASSERT). tpp is a pointer
 * to the transaction pointer; it is handed to
 * xfs_reflink_cancel_cow_blocks() and dereferenced for the join and log
 * calls.
 *
 * NOTE(review): the embedded line numbers have gaps. Several locals, the
 * error checks, the statements taken when a mapping is skipped or when a
 * shared block is found, and the return are not visible in this listing.
 */
1667 xfs_reflink_clear_inode_flag(
1668 struct xfs_inode
*ip
,
1669 struct xfs_trans
**tpp
)
1671 struct xfs_mount
*mp
= ip
->i_mount
;
1674 xfs_agnumber_t agno
;
1675 xfs_agblock_t agbno
;
1679 struct xfs_bmbt_irec map
;
1683 ASSERT(xfs_is_reflink_inode(ip
));
1686 end
= XFS_B_TO_FSB(mp
, i_size_read(VFS_I(ip
)));
1687 while (end
- fbno
> 0) {
1690 * Look for extents in the file. Skip holes, delalloc, or
1691 * unwritten extents; they can't be reflinked.
1693 error
= xfs_bmapi_read(ip
, fbno
, end
- fbno
, &map
, &nmaps
, 0);
1698 if (map
.br_startblock
== HOLESTARTBLOCK
||
1699 map
.br_startblock
== DELAYSTARTBLOCK
||
1703 agno
= XFS_FSB_TO_AGNO(mp
, map
.br_startblock
);
1704 agbno
= XFS_FSB_TO_AGBNO(mp
, map
.br_startblock
);
1705 aglen
= map
.br_blockcount
;
1707 error
= xfs_reflink_find_shared(mp
, agno
, agbno
, aglen
,
1708 &rbno
, &rlen
, false);
1711 /* Is there still a shared block here? */
1712 if (rbno
!= NULLAGBLOCK
)
1715 fbno
= map
.br_startoff
+ map
.br_blockcount
;
1719 * We didn't find any shared blocks so turn off the reflink flag.
1720 * First, get rid of any leftover CoW mappings.
1722 error
= xfs_reflink_cancel_cow_blocks(ip
, tpp
, 0, NULLFILEOFF
, true);
1726 /* Clear the inode flag. */
1727 trace_xfs_reflink_unset_inode_flag(ip
);
1728 ip
->i_d
.di_flags2
&= ~XFS_DIFLAG2_REFLINK
;
1729 xfs_inode_clear_cowblocks_tag(ip
);
/* Join ip to the current transaction and log its core after the update. */
1730 xfs_trans_ijoin(*tpp
, ip
, 0);
1731 xfs_trans_log_inode(*tpp
, ip
, XFS_ILOG_CORE
);
1737 * Clear the inode reflink flag if there are no shared extents and the size
/*
 * Transaction wrapper around xfs_reflink_clear_inode_flag(): allocate a
 * transaction with the tr_write reservation, take XFS_ILOCK_EXCL on ip, join
 * ip to the transaction, call xfs_reflink_clear_inode_flag(), commit, and
 * release the ILOCK.
 *
 * NOTE(review): the embedded line numbers have gaps. The error checks,
 * labels and return statements are not visible in this listing, so which
 * failures reach the cancel sequence at the bottom cannot be confirmed here.
 */
1741 xfs_reflink_try_clear_inode_flag(
1742 struct xfs_inode
*ip
)
1744 struct xfs_mount
*mp
= ip
->i_mount
;
1745 struct xfs_trans
*tp
;
1748 /* Start a rolling transaction to remove the mappings */
1749 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, 0, 0, 0, &tp
);
1753 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1754 xfs_trans_ijoin(tp
, ip
, 0);
/* tp is passed by address, so the callee is able to replace it. */
1756 error
= xfs_reflink_clear_inode_flag(ip
, &tp
);
1760 error
= xfs_trans_commit(tp
);
1764 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
/* Presumably the error path: cancel the transaction, then drop the ILOCK. */
1767 xfs_trans_cancel(tp
);
1769 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1774 * Pre-COW all shared blocks within a given byte range of a file and turn off
1775 * the reflink flag if we unshare all of the file's blocks.
/*
 * Unshare the blocks of ip that back the byte range described by offset and
 * len.
 *
 * Visible steps, in order: test that ip is a reflink inode; wait for direct
 * I/O; with XFS_ILOCK_EXCL held, convert the byte range to file block
 * numbers and call xfs_reflink_dirty_extents(); drop the ILOCK; write back
 * and wait on the whole mapping with filemap_write_and_wait(); finally call
 * xfs_reflink_try_clear_inode_flag().
 *
 * NOTE(review): the embedded line numbers have gaps and the listing stops
 * before the function visibly ends. The remaining parameters, the locals,
 * the error checks, the labels and the return statements are not visible.
 */
1778 xfs_reflink_unshare(
1779 struct xfs_inode
*ip
,
1783 struct xfs_mount
*mp
= ip
->i_mount
;
1789 if (!xfs_is_reflink_inode(ip
))
1792 trace_xfs_reflink_unshare(ip
, offset
, len
);
1794 inode_dio_wait(VFS_I(ip
));
1796 /* Try to CoW the selected ranges */
1797 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
/*
 * fbno and end are the byte range expressed in file blocks (start through
 * XFS_B_TO_FSBT(), end through XFS_B_TO_FSB()); isize is sampled here and
 * passed down to xfs_reflink_dirty_extents().
 */
1798 fbno
= XFS_B_TO_FSBT(mp
, offset
);
1799 isize
= i_size_read(VFS_I(ip
));
1800 end
= XFS_B_TO_FSB(mp
, offset
+ len
);
1801 error
= xfs_reflink_dirty_extents(ip
, fbno
, end
, isize
);
1804 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1806 /* Wait for the IO to finish */
1807 error
= filemap_write_and_wait(VFS_I(ip
)->i_mapping
);
1811 /* Turn off the reflink flag if possible. */
1812 error
= xfs_reflink_try_clear_inode_flag(ip
);
/* Presumably the error path for failures that occur with the ILOCK held. */
1819 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1821 trace_xfs_reflink_unshare_error(ip
, error
, _RET_IP_
);