// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"
/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_CALLER_COMP	(1U << 26)
#define IOMAP_DIO_INLINE_COMP	(1U << 27)
#define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
#define IOMAP_DIO_NEED_SYNC	(1U << 29)
#define IOMAP_DIO_WRITE		(1U << 30)
#define IOMAP_DIO_DIRTY		(1U << 31)
/*
 * Used for sub block zeroing in iomap_dio_zero()
 */
#define IOMAP_ZERO_PAGE_SIZE	(SZ_64K)
#define IOMAP_ZERO_PAGE_ORDER	(get_order(IOMAP_ZERO_PAGE_SIZE))
static struct page *zero_page;
struct iomap_dio {
	struct kiocb		*iocb;
	const struct iomap_dio_ops *dops;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
	size_t			done_before;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};
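
/*
 * Allocate a bio for this dio, using the filesystem's bio_set if the
 * iomap_dio_ops provide one and falling back to the global bio pool
 * otherwise.
 */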
static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
{
	if (dio->dops && dio->dops->bio_set)
		return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
					GFP_KERNEL, dio->dops->bio_set);
	return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
}
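
/*
 * Take a submission reference on the dio, set up polling for async
 * IOCB_HIPRI requests, and submit the bio either through the optional
 * ->submit_io hook or directly to the block layer.
 */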
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
	struct kiocb *iocb = dio->iocb;

	atomic_inc(&dio->ref);

	/* Sync dio can't be polled reliably */
	if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
		bio_set_polled(bio, iocb);
		WRITE_ONCE(iocb->private, bio);
	}

	if (dio->dops && dio->dops->submit_io)
		dio->dops->submit_io(iter, bio, pos);
	else
		submit_bio(bio);
}
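
/*
 * Finish a direct I/O request: call the filesystem's ->end_io hook, trim
 * short reads at i_size, invalidate the page cache over the written
 * range, honour any O_(D)SYNC semantics, and return the final byte count
 * (or error) to the caller.
 */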
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
		kiocb_invalidate_post_direct_write(iocb, dio->size);

	inode_dio_end(file_inode(iocb->ki_filp));

	if (ret > 0) {
		iocb->ki_pos += ret;

		/*
		 * If this is a DSYNC write, make sure we push it to stable
		 * storage now that we've written data.
		 */
		if (dio->flags & IOMAP_DIO_NEED_SYNC)
			ret = generic_write_sync(iocb, ret);
		if (ret > 0)
			ret += dio->done_before;
	}
	trace_iomap_dio_complete(iocb, dio->error, ret);
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
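
/*
 * ->dio_complete handler used for IOCB_DIO_CALLER_COMP completions: the
 * issuer invokes it from a safe task context to run iomap_dio_complete().
 */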
static ssize_t iomap_dio_deferred_complete(void *data)
{
	return iomap_dio_complete(data);
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio));
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	struct kiocb *iocb = dio->iocb;

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
	if (!atomic_dec_and_test(&dio->ref))
		goto release_bio;

	/*
	 * Synchronous dio, task itself will handle any completion work
	 * that needs after IO. All we need to do is wake the task.
	 */
	if (dio->wait_for_completion) {
		struct task_struct *waiter = dio->submit.waiter;

		WRITE_ONCE(dio->submit.waiter, NULL);
		blk_wake_io_task(waiter);
		goto release_bio;
	}

	/*
	 * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline.
	 */
	if (dio->flags & IOMAP_DIO_INLINE_COMP) {
		WRITE_ONCE(iocb->private, NULL);
		iomap_dio_complete_work(&dio->aio.work);
		goto release_bio;
	}

	/*
	 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
	 * our completion that way to avoid an async punt to a workqueue.
	 */
	if (dio->flags & IOMAP_DIO_CALLER_COMP) {
		/* only polled IO cares about private cleared */
		WRITE_ONCE(iocb->private, NULL);
		iocb->dio_complete = iomap_dio_deferred_complete;

		/*
		 * Invoke ->ki_complete() directly. We've assigned our
		 * dio_complete callback handler, and since the issuer set
		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
		 * notice ->dio_complete being set and will defer calling that
		 * handler until it can be done from a safe task context.
		 *
		 * Note that the 'res' being passed in here is not important
		 * for this case. The actual completion value of the request
		 * will be gotten from dio_complete when that is run by the
		 * issuer.
		 */
		iocb->ki_complete(iocb, 0);
		goto release_bio;
	}

	/*
	 * Async DIO completion that requires filesystem level completion work
	 * gets punted to a work queue to complete as the operation may require
	 * more IO to be issued to finalise filesystem metadata changes or
	 * guarantee data integrity.
	 */
	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
	queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
			&dio->aio.work);
release_bio:
	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
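
/*
 * Zero the sub-block head or tail of a write by submitting a write of
 * zeroes from the preallocated zero_page.
 */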
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
		loff_t pos, unsigned len)
{
	struct inode *inode = file_inode(dio->iocb->ki_filp);
	struct bio *bio;

	if (!len)
		return 0;
	/*
	 * Max block size supported is 64k
	 */
	if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
		return -EINVAL;

	bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	__bio_add_page(bio, zero_page, len, 0);
	iomap_dio_submit_bio(iter, dio, bio, pos);
	return 0;
}

/*
 * Figure out the bio's operation flags from the dio request, the
 * mapping, and whether or not we want FUA.  Note that we can end up
 * clearing the WRITE_THROUGH flag in the dio request.
 */
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
		const struct iomap *iomap, bool use_fua, bool atomic)
{
	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;

	if (!(dio->flags & IOMAP_DIO_WRITE))
		return REQ_OP_READ;

	opflags |= REQ_OP_WRITE;
	if (use_fua)
		opflags |= REQ_FUA;
	else
		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
	if (atomic)
		opflags |= REQ_ATOMIC;

	return opflags;
}
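
/*
 * Map one extent worth of the request onto bios and submit them, zeroing
 * sub-block head and tail ranges as needed.
 */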
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
	struct inode *inode = iter->inode;
	unsigned int fs_block_size = i_blocksize(inode), pad;
	const loff_t length = iomap_length(iter);
	bool atomic = iter->flags & IOMAP_ATOMIC;
	loff_t pos = iter->pos;
	blk_opf_t bio_opf;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

	if (atomic && length != fs_block_size)
		return -EINVAL;

	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
	    !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device either supports FUA or doesn't have
		 * a volatile write cache. This allows us to avoid cache flushes
		 * on IO completion. If we can't use writethrough and need to
		 * sync, disable in-task completions as dio completion will
		 * need to call generic_write_sync() which will do a blocking
		 * fsync / cache flush call.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
		    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
			use_fua = true;
		else if (dio->flags & IOMAP_DIO_NEED_SYNC)
			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	if (!iov_iter_count(dio->submit.iter))
		goto out;

	/*
	 * We can only do deferred completion for pure overwrites that
	 * don't require additional IO at completion. This rules out
	 * writes that need zeroing or extent conversion, extend
	 * the file size, or issue journal IO or cache flushes
	 * during completion processing.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->flags &= ~IOMAP_DIO_CALLER_COMP;

	/*
	 * The rules for polled IO completions follow the same guidelines as
	 * the ones we set for inline and deferred completions. If none of
	 * those are available for this IO, clear the polled flag.
	 */
	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad) {
			ret = iomap_dio_zero(iter, dio, pos - pad, pad);
			if (ret)
				goto out;
		}
	}

	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;

		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_write_hint = inode->i_write_hint;
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (WARN_ON_ONCE(atomic && n != length)) {
			/*
			 * This bio should have covered the complete length,
			 * which it doesn't, so error. We may need to zero out
			 * the tail (complete FS block), similar to when
			 * bio_iov_iter_get_pages() returns an error, above.
			 */
			ret = -EINVAL;
			bio_put(bio);
			goto zero_tail;
		}
		if (dio->flags & IOMAP_DIO_WRITE) {
			task_io_account_write(n);
		} else {
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			ret = iomap_dio_zero(iter, dio, pos,
					fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}
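
/*
 * Reads from a hole just zero-fill the destination buffer; there is no
 * bio to build or submit.
 */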
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);

	dio->size += length;
	if (!length)
		return -EFAULT;
	return length;
}
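
/*
 * Copy directly between the user buffer and the inode's inline data,
 * extending i_size and dirtying the inode for writes.
 */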
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iomi->iomap;
	struct iov_iter *iter = dio->submit.iter;
	void *inline_data = iomap_inline_data(iomap, iomi->pos);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	u64 copied;

	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = iomi->inode->i_size;

		if (pos > size)
			memset(iomap_inline_data(iomap, size), 0, pos - size);
		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(iomi->inode, pos + copied);
			mark_inode_dirty(iomi->inode);
		}
	} else {
		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	if (!copied)
		return -EFAULT;
	return copied;
}
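
/*
 * Dispatch a single mapped extent to the hole, bio, or inline handler
 * based on the iomap type returned by the filesystem.
 */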
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_iter(iter, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_iter(iter, dio);
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_MAPPED:
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_INLINE:
		return iomap_dio_inline_iter(iter, dio);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_iter() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to the
 * number of bytes previously transferred.  The request will then complete with
 * the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.  The
 * caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct iomap_iter iomi = {
		.inode		= inode,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DIRECT,
		.private	= private,
	};
	bool wait_for_completion =
		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
	struct blk_plug plug;
	struct iomap_dio *dio;
	loff_t ret = 0;

	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

	if (!iomi.len)
		return NULL;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->done_before = done_before;

	dio->submit.iter = iter;
	dio->submit.waiter = current;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;
	if (iocb->ki_flags & IOCB_ATOMIC)
		iomi.flags |= IOMAP_ATOMIC;

	if (iov_iter_rw(iter) == READ) {
		/* reads can always complete inline */
		dio->flags |= IOMAP_DIO_INLINE_COMP;

		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (user_backed_iter(iter))
			dio->flags |= IOMAP_DIO_DIRTY;

		ret = kiocb_write_and_wait(iocb, iomi.len);
		if (ret)
			goto out_free_dio;
	} else {
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		/*
		 * Flag as supporting deferred completions, if the issuer
		 * groks it. This can avoid a workqueue punt for writes.
		 * We may later clear this flag if we need to do other IO
		 * as part of this IO completion.
		 */
		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
			dio->flags |= IOMAP_DIO_CALLER_COMP;

		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
			ret = -EAGAIN;
			if (iomi.pos >= dio->i_size ||
			    iomi.pos + iomi.len > dio->i_size)
				goto out_free_dio;
			iomi.flags |= IOMAP_OVERWRITE_ONLY;
		}

		/* for data sync or sync, we need sync completion processing */
		if (iocb_is_dsync(iocb)) {
			dio->flags |= IOMAP_DIO_NEED_SYNC;

			/*
			 * For datasync only writes, we optimistically try using
			 * WRITE_THROUGH for this IO. This flag requires either
			 * FUA writes through the device's write cache, or a
			 * normal write to a device without a volatile write
			 * cache. For the former, any non-FUA write that occurs
			 * will clear this flag, hence we know before completion
			 * whether a cache flush is necessary.
			 */
			if (!(iocb->ki_flags & IOCB_SYNC))
				dio->flags |= IOMAP_DIO_WRITE_THROUGH;
		}

		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		ret = kiocb_invalidate_pages(iocb, iomi.len);
		if (ret) {
			if (ret != -EAGAIN) {
				trace_iomap_dio_invalidate_fail(inode, iomi.pos,
								iomi.len);
				if (iocb->ki_flags & IOCB_ATOMIC) {
					/*
					 * folio invalidation failed, maybe
					 * this is transient, unlock and see if
					 * the caller tries again.
					 */
					ret = -EAGAIN;
				} else {
					/* fall back to buffered write */
					ret = -ENOTBLK;
				}
			}
			goto out_free_dio;
		}
	}

	if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	while ((ret = iomap_iter(&iomi, ops)) > 0) {
		iomi.processed = iomap_dio_iter(&iomi, dio);

		/*
		 * We can only poll for single bio I/Os.
		 */
		iocb->ki_flags &= ~IOCB_HIPRI;
	}
	blk_finish_plug(&plug);

	/*
	 * We only report that we've read data up to i_size.
	 * Revert iter to a state corresponding to that as some callers (such
	 * as the splice code) rely on it.
	 */
	if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
		iov_iter_revert(iter, iomi.pos - dio->i_size);

	if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
		if (!(iocb->ki_flags & IOCB_NOWAIT))
			wait_for_completion = true;
		ret = 0;
	}

	/* magic error code to fall back to buffered I/O */
	if (ret == -ENOTBLK) {
		wait_for_completion = true;
		ret = 0;
	}
	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were already written through to the
	 * media, we don't need to flush the cache on IO completion. Clear the
	 * sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion) {
			trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
			return ERR_PTR(-EIOCBQUEUED);
		}

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before)
{
	struct iomap_dio *dio;

	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
			     done_before);
	if (IS_ERR_OR_NULL(dio))
		return PTR_ERR_OR_ZERO(dio);
	return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
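
/*
 * Allocate the shared zero page used for sub-block zeroing at boot.
 */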
static int __init iomap_dio_init(void)
{
	zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
				IOMAP_ZERO_PAGE_ORDER);

	if (!zero_page)
		return -ENOMEM;

	return 0;
}
fs_initcall(iomap_dio_init);