// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE_FUA	(1 << 28)
#define IOMAP_DIO_NEED_SYNC	(1 << 29)
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)
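/*
 * Per-request direct I/O state, allocated in __iomap_dio_rw() and freed in
 * iomap_dio_complete().  The submit and aio members are never live at the
 * same time, so they share storage in a union.
 */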
struct iomap_dio {
	struct kiocb		*iocb;
	const struct iomap_dio_ops *dops;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};
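/*
 * Poll the request queue stashed in kiocb->private for completion of the
 * cookie recorded at submission time.  Used for IOCB_HIPRI direct I/O.
 */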
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
	struct request_queue *q = READ_ONCE(kiocb->private);

	if (!q)
		return 0;
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
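/*
 * Take a reference on the dio for the bio we are about to submit, record the
 * polling cookie and last queue, and hand the bio to the filesystem's
 * ->submit_io hook if one is provided, otherwise submit it directly.
 */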
static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
		struct bio *bio, loff_t pos)
{
	atomic_inc(&dio->ref);

	if (dio->iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(bio, dio->iocb);

	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
	if (dio->dops && dio->dops->submit_io)
		dio->submit.cookie = dio->dops->submit_io(
				file_inode(dio->iocb->ki_filp),
				iomap, bio, pos);
	else
		dio->submit.cookie = submit_bio(bio);
}
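/*
 * Finish a direct I/O request: run the filesystem ->end_io hook, trim short
 * reads, invalidate stale page cache over the written range, and issue the
 * O_(D)SYNC flush if one is still required.  Frees the dio and returns the
 * final byte count or error.
 */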
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error && dio->size &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;

		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(iocb->ki_filp);
	}

	inode_dio_end(file_inode(iocb->ki_filp));

	/*
	 * If this is a DSYNC write, make sure we push it to stable storage now
	 * that we've written data.
	 */
	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
		ret = generic_write_sync(iocb, ret);

	kfree(dio);

	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
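/*
 * Deferred completion work for asynchronous writes, run from the superblock's
 * s_dio_done_wq so that completion happens in process context.
 */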
static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}
/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}
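/*
 * Bio completion handler: record any error, drop the bio's reference on the
 * dio, and on the final put either wake the synchronous waiter, queue the
 * aio completion work for writes, or complete inline.
 */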
static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (dio->wait_for_completion) {
			struct task_struct *waiter = dio->submit.waiter;
			WRITE_ONCE(dio->submit.waiter, NULL);
			blk_wake_io_task(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
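/*
 * Submit a write of zeroes (backed by ZERO_PAGE) to pad a sub-block head or
 * tail of a direct write so no stale data is exposed.
 */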
static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	int flags = REQ_SYNC | REQ_IDLE;
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, iomap->bdev);
	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	__bio_add_page(bio, page, len, 0);
	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
	iomap_dio_submit_bio(dio, iomap, bio, pos);
}
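/*
 * Issue the bios for a single mapped or unwritten extent: validate alignment,
 * decide on FUA and sub-block zeroing, then build and submit bios for as much
 * of the iter as fits in this extent.  Returns the number of bytes submitted
 * or a negative error.
 */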
static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device supports FUA. This allows us to avoid
		 * cache flushes on IO completion.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
			use_fua = true;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
	if (nr_pages <= 0) {
		ret = nr_pages;
		goto out;
	}

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		size_t n;

		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio_set_dev(bio, iomap->bdev);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
			if (use_fua)
				bio->bi_opf |= REQ_FUA;
			else
				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
			task_io_account_write(n);
		} else {
			bio->bi_opf = REQ_OP_READ;
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		pos += n;
		copied += n;

		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
		iomap_dio_submit_bio(dio, iomap, bio, pos);
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}
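/*
 * Reads from a hole (or from an unwritten extent) simply return zeroes into
 * the user iter; nothing is submitted to the device.
 */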
static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
	length = iov_iter_zero(length, dio->submit.iter);
	dio->size += length;
	return length;
}
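/*
 * Direct I/O to an inline extent copies directly between the user iter and
 * the in-inode data, zero-filling any gap between i_size and the write
 * offset and updating i_size for extending writes.
 */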
static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
	size_t copied;

	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
			memset(iomap->inline_data + size, 0, pos - size);
		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
	}
	dio->size += copied;
	return copied;
}
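/*
 * Dispatch a single mapping returned by iomap_apply() to the appropriate
 * actor based on the extent type.
 */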
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	struct iomap_dio *dio = data;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_actor(length, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_actor(length, dio);
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_MAPPED:
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_INLINE:
		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_apply() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}
/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.
 * The caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		bool wait_for_completion)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	if (!count)
		return NULL;

	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
		return ERR_PTR(-EIO);

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.cookie = BLK_QC_T_NONE;
	dio->submit.last_queue = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter_is_iovec(iter))
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO.  Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, pos, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, pos, end);
	if (ret)
		goto out_free_dio;

	if (iov_iter_rw(iter) == WRITE) {
		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
				end >> PAGE_SHIFT)) {
			trace_iomap_dio_invalidate_fail(inode, pos, count);
			ret = -ENOTBLK;
			goto out_free_dio;
		}

		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
			ret = sb_init_dio_done_wq(inode->i_sb);
			if (ret < 0)
				goto out_free_dio;
		}
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK) {
				wait_for_completion = true;
				ret = 0;
			}
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
			/*
			 * We only report that we've read data up to i_size.
			 * Revert iter to a state corresponding to that as
			 * some callers (such as splice code) rely on it.
			 */
			iov_iter_revert(iter, pos - dio->i_size);
			break;
		}
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
	WRITE_ONCE(iocb->private, dio->submit.last_queue);

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion)
			return ERR_PTR(-EIOCBQUEUED);

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					 dio->submit.cookie, true))
				blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);
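/*
 * Common wrapper around __iomap_dio_rw() that also runs the completion.
 * Returns the byte count or a negative error; an asynchronous request that
 * is still in flight returns -EIOCBQUEUED.
 */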
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		bool wait_for_completion)
{
	struct iomap_dio *dio;

	dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
	if (IS_ERR_OR_NULL(dio))
		return PTR_ERR_OR_ZERO(dio);
	return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);