// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE_FUA	(1 << 28)
#define IOMAP_DIO_NEED_SYNC	(1 << 29)
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

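/*
 * Quick reference (descriptive only): IOMAP_DIO_WRITE_FUA means the write may
 * still complete using pure REQ_FUA bios, IOMAP_DIO_NEED_SYNC means
 * generic_write_sync() must run at completion, IOMAP_DIO_WRITE marks the dio
 * as a write, and IOMAP_DIO_DIRTY means the pages of a read must be dirtied
 * once the bio completes.
 */
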
struct iomap_dio {
	struct kiocb		*iocb;
	const struct iomap_dio_ops *dops;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

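/*
 * Poll for completion of an IOCB_HIPRI direct I/O.  iocb->private and
 * iocb->ki_cookie are set up by iomap_dio_rw() before the iocb is returned
 * to the caller.
 */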
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
	struct request_queue *q = READ_ONCE(kiocb->private);

	if (!q)
		return 0;
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);

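/*
 * Each submitted bio takes its own reference on the dio, dropped in
 * iomap_dio_bio_end_io().  The queue and cookie of the last submitted bio
 * are recorded so that a synchronous IOCB_HIPRI caller can poll for
 * completion.
 */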
static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
		struct bio *bio)
{
	atomic_inc(&dio->ref);

	if (dio->iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(bio, dio->iocb);

	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
	dio->submit.cookie = submit_bio(bio);
}

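/*
 * Finish a dio: run the filesystem's ->end_io hook, trim short reads to
 * i_size, re-invalidate the page cache for writes, and issue the deferred
 * fsync for O_(D)SYNC writes that could not be completed with FUA alone.
 */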
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing. Either
	 * one is a pretty crazy thing to do, so we don't support it 100%. If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;
		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(iocb->ki_filp);
	}

	/*
	 * If this is a DSYNC write, make sure we push it to stable storage now
	 * that we've written data.
	 */
	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
		ret = generic_write_sync(iocb, ret);

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}

/*
 * Set an error in the dio if none is set yet. We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

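/*
 * Per-bio completion.  When the final reference on the dio is dropped here,
 * a synchronous caller is woken, an asynchronous write is punted to the
 * superblock's dio completion workqueue (so ->end_io may block), and an
 * asynchronous read is completed directly.
 */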
static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (dio->wait_for_completion) {
			struct task_struct *waiter = dio->submit.waiter;
			WRITE_ONCE(dio->submit.waiter, NULL);
			blk_wake_io_task(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

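/*
 * Zero a sub-block range by writing from ZERO_PAGE(0), used to avoid exposing
 * stale data around unaligned writes into newly allocated or unwritten
 * extents.
 */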
static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	int flags = REQ_SYNC | REQ_IDLE;
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, iomap->bdev);
	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	__bio_add_page(bio, page, len, 0);
	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
	iomap_dio_submit_bio(dio, iomap, bio);
}

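/*
 * Issue the bios for one mapped (or unwritten) extent: trim the iter to the
 * extent, build and submit as many bios as needed, and zero any sub-block
 * head or tail so that a short or unaligned write never exposes stale data.
 */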
static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device supports FUA. This allows us to avoid
		 * cache flushes on IO completion.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
			use_fua = true;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
	if (nr_pages <= 0) {
		ret = nr_pages;
		goto out;
	}

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio_set_dev(bio, iomap->bdev);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
			if (use_fua)
				bio->bi_opf |= REQ_FUA;
			else
				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
			task_io_account_write(n);
		} else {
			bio->bi_opf = REQ_OP_READ;
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		pos += n;
		copied += n;

		nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
		iomap_dio_submit_bio(dio, iomap, bio);
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}

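/*
 * Reads from a hole just zero the user buffer; nothing is submitted to the
 * block layer.
 */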
static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
	length = iov_iter_zero(length, dio->submit.iter);
	dio->size += length;
	return length;
}

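/*
 * Copy directly to or from an inline extent, where the data lives inside the
 * filesystem's metadata rather than in data blocks.
 */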
static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
	size_t copied;

	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
			memset(iomap->inline_data + size, 0, pos - size);
		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
	}
	dio->size += copied;
	return copied;
}

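/*
 * Dispatch one mapping returned by ->iomap_begin to the handler for its
 * extent type.  Writes into holes are a filesystem bug, and reads from
 * unwritten extents see zeroes without touching the device.
 */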
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	struct iomap_dio *dio = data;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_actor(length, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_actor(length, dio);
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_MAPPED:
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_INLINE:
		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not. This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		bool wait_for_completion)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
		return -EIO;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.cookie = BLK_QC_T_NONE;
	dio->submit.last_queue = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter_is_iovec(iter))
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO. Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, pos, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, pos, end);
	if (ret)
		goto out_free_dio;

	/*
	 * Try to invalidate cache pages for the range we're direct
	 * writing. If this invalidation fails, tough, the write will
	 * still work, but racing two incompatible write paths is a
	 * pretty crazy thing to do, so we don't support it 100%.
	 */
	ret = invalidate_inode_pages2_range(mapping,
			pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
	if (ret)
		dio_warn_stale_pagecache(iocb->ki_filp);
	ret = 0;

	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK) {
				wait_for_completion = true;
				ret = 0;
			}
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
			/*
			 * We only report that we've read data up to i_size.
			 * Revert iter to a state corresponding to that as
			 * some callers (such as splice code) rely on it.
			 */
			iov_iter_revert(iter, pos - dio->i_size);
			break;
		}
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
	WRITE_ONCE(iocb->private, dio->submit.last_queue);

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio. There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion)
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					 dio->submit.cookie, true))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);

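/*
 * Usage sketch (illustrative only, not part of this file): a filesystem's
 * ->read_iter method would typically wire this up roughly as follows, with
 * "my_iomap_ops" standing in for that filesystem's struct iomap_ops:
 *
 *	static ssize_t my_file_dio_read(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = iomap_dio_rw(iocb, to, &my_iomap_ops, NULL,
 *				   is_sync_kiocb(iocb));
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 *
 * Passing a NULL iomap_dio_ops means no ->end_io processing is wanted, and
 * wait_for_completion must be true for synchronous iocbs (see the WARN_ON
 * at the top of iomap_dio_rw()).
 */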