// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
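
/*
 * Writeback context: wraps the generic iomap writepage context and caches
 * the data and COW fork extent list sequence numbers sampled when the
 * current mapping was built, so xfs_imap_valid() can cheaply detect whether
 * concurrent extent list changes have invalidated a cached mapping.
 */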
struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
        unsigned int            data_seq;
        unsigned int            cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
        return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
        return ioend->io_offset + ioend->io_size >
                XFS_I(ioend->io_inode)->i_d.di_size;
}
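
/*
 * Preallocate a transaction for an on-disk inode size update at I/O
 * submission time, so the completion side does not have to allocate one.
 * The transaction is stashed in ioend->io_private and handed over to the
 * completion thread, which is why the freeze protection and task flag
 * bookkeeping below is needed.
 */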
STATIC int
xfs_setfilesize_trans_alloc(
        struct iomap_ioend      *ioend)
{
        struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        ioend->io_private = tp;

        /*
         * We may pass freeze protection with a transaction, so tell lockdep
         * we released it.
         */
        __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
        /*
         * We hand off the transaction to the completion thread now, so
         * clear the flag here.
         */
        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
        struct xfs_inode        *ip,
        struct xfs_trans        *tp,
        xfs_off_t               offset,
        size_t                  size)
{
        xfs_fsize_t             isize;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp);
                return 0;
        }

        trace_xfs_setfilesize(ip, offset, size);

        ip->i_d.di_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        return xfs_trans_commit(tp);
}
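
/*
 * As __xfs_setfilesize() above, but allocates the fsync transaction here
 * rather than taking a preallocated one.
 */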
int
xfs_setfilesize(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        size_t                  size)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        return __xfs_setfilesize(ip, tp, offset, size);
}

STATIC int
xfs_setfilesize_ioend(
        struct iomap_ioend      *ioend,
        int                     error)
{
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        struct xfs_trans        *tp = ioend->io_private;

        /*
         * The transaction may have been allocated in the I/O submission thread,
         * thus we need to mark ourselves as being in a transaction manually.
         * Similarly for freeze protection.
         */
        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

        /* we abort the update if there was an IO error */
        if (error) {
                xfs_trans_cancel(tp);
                return error;
        }

        return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
        struct iomap_ioend      *ioend)
{
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        xfs_off_t               offset = ioend->io_offset;
        size_t                  size = ioend->io_size;
        unsigned int            nofs_flag;
        int                     error;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /*
         * Just clean up the in-memory structures if the fs has been shut down.
         */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                error = -EIO;
                goto done;
        }

        /*
         * Clean up any COW blocks on an I/O error.
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_F_SHARED)
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                goto done;
        }

        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
        if (ioend->io_flags & IOMAP_F_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_type == IOMAP_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
        else
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);

done:
        if (ioend->io_private)
                error = xfs_setfilesize_ioend(ioend, error);
        iomap_finish_ioends(ioend, error);
        memalloc_nofs_restore(nofs_flag);
}

/*
 * If the ioend being merged has a preallocated transaction for file size
 * updates, we need to ensure the ioend it is merged into also has one.  If
 * it already has one we can simply cancel the transaction as it is
 * guaranteed to be clean.
 */
static void
xfs_ioend_merge_private(
        struct iomap_ioend      *ioend,
        struct iomap_ioend      *next)
{
        if (!ioend->io_private) {
                ioend->io_private = next->io_private;
                next->io_private = NULL;
        } else {
                xfs_setfilesize_ioend(next, -ECANCELED);
        }
}

/* Finish all pending io completions. */
void
xfs_end_io(
        struct work_struct      *work)
{
        struct xfs_inode        *ip =
                container_of(work, struct xfs_inode, i_ioend_work);
        struct iomap_ioend      *ioend;
        struct list_head        tmp;
        unsigned long           flags;

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        list_replace_init(&ip->i_ioend_list, &tmp);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

        iomap_sort_ioends(&tmp);
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
                iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
                xfs_end_ioend(ioend);
        }
}
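
/*
 * An ioend needs deferral to the completion workqueue if it requires
 * transactional work at completion time: an on-disk size update (a
 * preallocated transaction in io_private), unwritten extent conversion,
 * or COW remapping.  Anything else can complete directly from bio
 * completion context.
 */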
static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
{
        return ioend->io_private ||
                ioend->io_type == IOMAP_UNWRITTEN ||
                (ioend->io_flags & IOMAP_F_SHARED);
}
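
/*
 * Bio completion handler, installed by xfs_prepare_ioend() for ioends that
 * need transactional completion work.  This may run in interrupt context
 * (hence the irqsave locking), so it only queues the ioend on the inode's
 * completion list and kicks the per-mount workqueue that runs xfs_end_io().
 */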
STATIC void
xfs_end_bio(
        struct bio              *bio)
{
        struct iomap_ioend      *ioend = bio->bi_private;
        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
        unsigned long           flags;

        ASSERT(xfs_ioend_needs_workqueue(ioend));

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
                WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
                                         &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping.  Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
        struct iomap_writepage_ctx      *wpc,
        struct xfs_inode                *ip,
        loff_t                          offset)
{
        if (offset < wpc->iomap.offset ||
            offset >= wpc->iomap.offset + wpc->iomap.length)
                return false;
        /*
         * If this is a COW mapping, it is sufficient to check that the mapping
         * covers the offset.  Be careful to check this first because the caller
         * can revalidate a COW mapping without updating the data seqno.
         */
        if (wpc->iomap.flags & IOMAP_F_SHARED)
                return true;

        /*
         * This is not a COW mapping.  Check the sequence number of the data
         * fork because concurrent changes could have invalidated the extent.
         * Check the COW fork because concurrent changes since the last time
         * we checked (and found nothing at this offset) could have added
         * overlapping blocks.
         */
        if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
                return false;
        if (xfs_inode_has_cow_data(ip) &&
            XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
                return false;
        return true;
}

/*
 * Pass in a delalloc extent and convert it to real extents, return the real
 * extent that maps offset_fsb in wpc->iomap.
 *
 * The current page is held locked so nothing could have removed the block
 * backing offset_fsb, although it could have moved from the COW to the data
 * fork by another thread.
 */
static int
xfs_convert_blocks(
        struct iomap_writepage_ctx *wpc,
        struct xfs_inode        *ip,
        int                     whichfork,
        loff_t                  offset)
{
        int                     error;
        unsigned                *seq;

        if (whichfork == XFS_COW_FORK)
                seq = &XFS_WPC(wpc)->cow_seq;
        else
                seq = &XFS_WPC(wpc)->data_seq;

        /*
         * Attempt to allocate whatever delalloc extent currently backs offset
         * and put the result into wpc->iomap.  Allocate in a loop because it
         * may take several attempts to allocate real blocks for a contiguous
         * delalloc extent if free space is sufficiently fragmented.
         */
        do {
                error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
                                &wpc->iomap, seq);
                if (error)
                        return error;
        } while (wpc->iomap.offset + wpc->iomap.length <= offset);

        return 0;
}
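
/*
 * ->map_blocks callback for iomap writeback: find the extent backing the
 * given file offset, converting delalloc extents to real ones if necessary,
 * and cache it in wpc->iomap so subsequent blocks covered by the same
 * extent can skip the lookup.
 */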
static int
xfs_map_blocks(
        struct iomap_writepage_ctx *wpc,
        struct inode            *inode,
        loff_t                  offset)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t           cow_fsb = NULLFILEOFF;
        int                     whichfork = XFS_DATA_FORK;
        struct xfs_bmbt_irec    imap;
        struct xfs_iext_cursor  icur;
        int                     retries = 0;
        int                     error = 0;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared.  COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one, or the COW fork hasn't changed from the last time we looked
         * at it.
         *
         * It's safe to check the COW fork if_seq here without the ILOCK because
         * we've indirectly protected against concurrent updates: writeback has
         * the page locked, which prevents concurrent invalidations by reflink
         * and directio and prevents concurrent buffered writes to the same
         * page.  Changes to if_seq always happen under i_lock, which protects
         * against concurrent updates and provides a memory barrier on the way
         * out that ensures that we always see the current value.
         */
        if (xfs_imap_valid(wpc, ip, offset))
                return 0;

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
retry:
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));

        /*
         * Check if this offset is covered by a COW extent, and if so use it
         * directly instead of looking up anything in the data fork.
         */
        if (xfs_inode_has_cow_data(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                cow_fsb = imap.br_startoff;
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);

                whichfork = XFS_COW_FORK;
                goto allocate_blocks;
        }

        /*
         * No COW extent overlap.  Revalidate now that we may have updated
         * ->cow_seq.  If the data mapping is still valid, we're done.
         */
        if (xfs_imap_valid(wpc, ip, offset)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                imap.br_state = XFS_EXT_NORM;
        }

        /*
         * Truncate to the next COW extent if there is one.  This is the only
         * opportunity to do this because we can skip COW fork lookups for the
         * subsequent blocks in the mapping; however, the requirement to treat
         * the COW range separately remains.
         */
        if (cow_fsb != NULLFILEOFF &&
            cow_fsb < imap.br_startoff + imap.br_blockcount)
                imap.br_blockcount = cow_fsb - imap.br_startoff;

        /* got a delalloc extent? */
        if (imap.br_startblock != HOLESTARTBLOCK &&
            isnullstartblock(imap.br_startblock))
                goto allocate_blocks;

        xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
        trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
        return 0;
allocate_blocks:
        error = xfs_convert_blocks(wpc, ip, whichfork, offset);
        if (error) {
                /*
                 * If we failed to find the extent in the COW fork we might have
                 * raced with a COW to data fork conversion or truncate.
                 * Restart the lookup to catch the extent in the data fork for
                 * the former case, but prevent additional retries to avoid
                 * looping forever for the latter case.
                 */
                if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
                        goto retry;
                ASSERT(error != -EAGAIN);
                return error;
        }

        /*
         * Due to merging, the real extent returned might be larger than the
         * original delalloc one.  Trim the returned extent to the next COW
         * boundary again to force a re-lookup.
         */
        if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
                loff_t          cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

                if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
                        wpc->iomap.length = cow_offset - wpc->iomap.offset;
        }

        ASSERT(wpc->iomap.offset <= offset);
        ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
        trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
        return 0;
}
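
/*
 * ->prepare_ioend callback, run once per ioend at submission time: convert
 * CoW extents up front, reserve log space if the ioend might extend the
 * on-disk file size, and override the bio completion handler if completion
 * has to be deferred to the workqueue.
 */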
static int
xfs_prepare_ioend(
        struct iomap_ioend      *ioend,
        int                     status)
{
        unsigned int            nofs_flag;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /* Convert CoW extents to regular */
        if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                ioend->io_offset, ioend->io_size);
        }

        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
            ((ioend->io_flags & IOMAP_F_SHARED) ||
             ioend->io_type != IOMAP_UNWRITTEN) &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_private)
                status = xfs_setfilesize_trans_alloc(ioend);

        memalloc_nofs_restore(nofs_flag);

        if (xfs_ioend_needs_workqueue(ioend))
                ioend->io_bio->bi_end_io = xfs_end_bio;
        return status;
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction.  Indeed, if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why
 * we see an ENOSPC in writeback).
 */
static void
xfs_discard_page(
        struct page             *page)
{
        struct inode            *inode = page->mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        loff_t                  offset = page_offset(page);
        xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, offset);
        int                     error;

        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_invalidate;

        xfs_alert(mp,
                "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
                        page, ip->i_ino, offset);

        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                        PAGE_SIZE / i_blocksize(inode));
        if (error && !XFS_FORCED_SHUTDOWN(mp))
                xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
        iomap_invalidatepage(page, 0, PAGE_SIZE);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
        .map_blocks             = xfs_map_blocks,
        .prepare_ioend          = xfs_prepare_ioend,
        .discard_page           = xfs_discard_page,
};
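
/*
 * The ->writepage and ->writepages entry points below are thin wrappers
 * that set up a writeback context on the stack and defer to the generic
 * iomap writeback code.
 */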
STATIC int
xfs_vm_writepage(
        struct page             *page,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = { };

        return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}

STATIC int
xfs_vm_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = { };

        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
        return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
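
/*
 * DAX writeback does not go through the page-based iomap path; flushing
 * dirty CPU cache lines for the mapped device range is handled by the
 * generic dax_writeback_mapping_range() helper.
 */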
STATIC int
xfs_dax_writepages(
        struct address_space    *mapping,
        struct writeback_control *wbc)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        xfs_iflags_clear(ip, XFS_ITRUNCATED);
        return dax_writeback_mapping_range(mapping,
                        xfs_inode_buftarg(ip)->bt_bdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
        struct address_space    *mapping,
        sector_t                block)
{
        struct xfs_inode        *ip = XFS_I(mapping->host);

        trace_xfs_vm_bmap(ip);

        /*
         * The swap code (ab-)uses ->bmap to get a block mapping and then
         * bypasses the file system for actual I/O.  We really can't allow
         * that on reflink inodes, so we have to skip out here.  And yes,
         * 0 is the magic code for a bmap error.
         *
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
        if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}
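
/*
 * The read side has no XFS-specific state to track; defer directly to the
 * iomap helpers using the read-only iomap ops.
 */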
STATIC int
xfs_vm_readpage(
        struct file             *unused,
        struct page             *page)
{
        return iomap_readpage(page, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_readpages(
        struct file             *unused,
        struct address_space    *mapping,
        struct list_head        *pages,
        unsigned                nr_pages)
{
        return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops);
}

static int
xfs_iomap_swapfile_activate(
        struct swap_info_struct         *sis,
        struct file                     *swap_file,
        sector_t                        *span)
{
        sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
        return iomap_swapfile_activate(sis, swap_file, span,
                        &xfs_read_iomap_ops);
}
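
/* Address space operations for regular, non-DAX XFS files. */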
const struct address_space_operations xfs_address_space_operations = {
        .readpage               = xfs_vm_readpage,
        .readpages              = xfs_vm_readpages,
        .writepage              = xfs_vm_writepage,
        .writepages             = xfs_vm_writepages,
        .set_page_dirty         = iomap_set_page_dirty,
        .releasepage            = iomap_releasepage,
        .invalidatepage         = iomap_invalidatepage,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = noop_direct_IO,
        .migratepage            = iomap_migrate_page,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
        .swap_activate          = xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
        .writepages             = xfs_dax_writepages,
        .direct_IO              = noop_direct_IO,
        .set_page_dirty         = noop_set_page_dirty,
        .invalidatepage         = noop_invalidatepage,
        .swap_activate          = xfs_iomap_swapfile_activate,
};