// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE_FUA	(1 << 28)
#define IOMAP_DIO_NEED_SYNC	(1 << 29)
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)
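
/*
 * Overview of the private flags (all manipulated through dio->flags):
 *
 *  IOMAP_DIO_WRITE_FUA - set optimistically for pure datasync writes and
 *	cleared as soon as any write bio is issued without REQ_FUA, so
 *	completion knows whether a cache flush is still required.
 *  IOMAP_DIO_NEED_SYNC - generic_write_sync() must run at completion time.
 *  IOMAP_DIO_WRITE     - this dio is a write.
 *  IOMAP_DIO_DIRTY     - user pages must be dirtied when a read completes.
 */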

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};
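
/*
 * dio->ref counts the submission path itself plus every bio in flight:
 * it starts at 1 in iomap_dio_rw(), is bumped in iomap_dio_submit_bio()
 * and dropped in iomap_dio_bio_end_io(); whoever drops the last reference
 * completes the dio.
 */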

int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
	struct request_queue *q = READ_ONCE(kiocb->private);

	if (!q)
		return 0;
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
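
/*
 * Illustrative only (not part of this file): a filesystem that uses
 * iomap_dio_rw() can export polled I/O support by pointing its
 * file_operations at this helper, e.g.:
 *
 *	const struct file_operations example_file_operations = {
 *		...
 *		.read_iter	= example_file_read_iter,
 *		.write_iter	= example_file_write_iter,
 *		.iopoll		= iomap_dio_iopoll,
 *	};
 */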

static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
		struct bio *bio)
{
	atomic_inc(&dio->ref);

	if (dio->iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(bio, dio->iocb);

	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
	dio->submit.cookie = submit_bio(bio);
}
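
/*
 * Note that last_queue and the submission cookie are remembered so that
 * IOCB_HIPRI callers (and iomap_dio_iopoll() above) can later poll the
 * queue that received the most recently submitted bio.
 */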

static ssize_t
iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after dio->end_io(), as
	 * some filesystems convert unwritten extents to real allocations in
	 * end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;

		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(iocb->ki_filp);
	}

	/*
	 * If this is a DSYNC write, make sure we push it to stable storage now
	 * that we've written data.
	 */
	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
		ret = generic_write_sync(iocb, ret);

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}
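
/*
 * The "check for short read" clamp above covers reads that end up
 * transferring past EOF (extent mappings are block granular, so dio->size
 * can exceed what the file actually contains): e.g. with i_size at 61440,
 * a 65536-byte read at offset 0 is reported as i_size - offset = 61440.
 */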

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (dio->wait_for_completion) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			blk_wake_io_task(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
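
/*
 * Note that asynchronous write completion is punted to the superblock's
 * dio_done_wq workqueue: dio->end_io() may need to block (for example to
 * convert unwritten extents), which is not allowed in bio completion
 * context.  Reads and synchronous waiters can be finished directly.
 */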

static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	int flags = REQ_SYNC | REQ_IDLE;
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, iomap->bdev);
	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	__bio_add_page(bio, page, len, 0);
	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
	iomap_dio_submit_bio(dio, iomap, bio);
}
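
/*
 * iomap_dio_zero() pads a sub-block direct write by writing from the
 * shared zero page; the bio it builds goes through iomap_dio_submit_bio()
 * and therefore takes its own reference on the dio like any data bio.
 */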

static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device supports FUA. This allows us to avoid
		 * cache flushes on IO completion.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
			use_fua = true;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		size_t n;

		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			return 0;
		}

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio_set_dev(bio, iomap->bdev);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = dio->iocb->ki_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
			if (use_fua)
				bio->bi_opf |= REQ_FUA;
			else
				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
			task_io_account_write(n);
		} else {
			bio->bi_opf = REQ_OP_READ;
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		iov_iter_advance(dio->submit.iter, n);

		dio->size += n;
		pos += n;
		copied += n;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
		iomap_dio_submit_bio(dio, iomap, bio);
	} while (nr_pages);

	/*
	 * We need to zero out the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}
	return copied ? copied : ret;
}
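
/*
 * Worked example of the sub-block zeroing above, assuming a 4096-byte
 * fs block and a 512-byte write at offset 512 into a freshly allocated
 * (IOMAP_F_NEW) block: the head pad is pos & 4095 = 512, so bytes 0-511
 * are zeroed before the data bio; after the write pos is 1024, so the
 * tail zeroing covers bytes 1024-4095 (fs_block_size - pad = 3072).
 */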

static loff_t
iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio)
{
	length = iov_iter_zero(length, dio->submit.iter);
	dio->size += length;
	return length;
}

static loff_t
iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
		struct iomap_dio *dio, struct iomap *iomap)
{
	struct iov_iter *iter = dio->submit.iter;
	size_t copied;

	BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = inode->i_size;

		if (pos > size)
			memset(iomap->inline_data + size, 0, pos - size);
		copied = copy_from_iter(iomap->inline_data + pos, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(inode, pos + copied);
			mark_inode_dirty(inode);
		}
	} else {
		copied = copy_to_iter(iomap->inline_data + pos, length, iter);
	}
	dio->size += copied;
	return copied;
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_actor(length, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_actor(length, dio);
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_MAPPED:
		return iomap_dio_bio_actor(inode, pos, length, dio, iomap);
	case IOMAP_INLINE:
		return iomap_dio_inline_actor(inode, pos, length, dio, iomap);
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, start = pos;
	loff_t end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	bool wait_for_completion = is_sync_kiocb(iocb);
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.cookie = BLK_QC_T_NONE;
	dio->submit.last_queue = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO.  Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_has_page(mapping, start, end)) {
			ret = -EAGAIN;
			goto out_free_dio;
		}
		flags |= IOMAP_NOWAIT;
	}

	ret = filemap_write_and_wait_range(mapping, start, end);
	if (ret)
		goto out_free_dio;

	/*
	 * Try to invalidate cache pages for the range we're direct
	 * writing.  If this invalidation fails, tough, the write will
	 * still work, but racing two incompatible write paths is a
	 * pretty crazy thing to do, so we don't support it 100%.
	 */
	ret = invalidate_inode_pages2_range(mapping,
			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
	if (ret)
		dio_warn_stale_pagecache(iocb->ki_filp);
	ret = 0;

	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK) {
				wait_for_completion = true;
				ret = 0;
			}
			break;
		}
		pos += ret;

		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
			break;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
	WRITE_ONCE(iocb->private, dio->submit.last_queue);

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion)
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_poll(dio->submit.last_queue,
					 dio->submit.cookie, true))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
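
/*
 * Illustrative caller sketch (not part of this file): a filesystem's
 * ->read_iter() would typically take the inode lock shared and hand the
 * request to iomap_dio_rw() together with its own iomap_ops, roughly:
 *
 *	inode_lock_shared(inode);
 *	ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
 *	inode_unlock_shared(inode);
 *
 * passing a non-NULL iomap_dio_end_io_t when the filesystem needs
 * completion-time work such as unwritten extent conversion.
 */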