// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "io_read.h"
#include "io_write.h"

#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/prefetch.h>
#include <linux/task_io_accounting_ops.h>

/* O_DIRECT reads */

struct dio_read {
	struct closure			cl;
	struct kiocb			*req;
	long				ret;
	bool				should_dirty;
	struct bch_read_bio		rbio;
};
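
/*
 * Completion either checks and releases the pages via
 * bio_check_pages_dirty() (which also puts the bio), or releases them and
 * drops the bio reference directly, depending on whether we dirtied them:
 */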
static void bio_check_or_release(struct bio *bio, bool check_dirty)
{
	if (check_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

static CLOSURE_CALLBACK(bch2_dio_read_complete)
{
	closure_type(dio, struct dio_read, cl);

	dio->req->ki_complete(dio->req, dio->ret);
	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}

static void bch2_direct_IO_read_endio(struct bio *bio)
{
	struct dio_read *dio = bio->bi_private;

	if (bio->bi_status)
		dio->ret = blk_status_to_errno(bio->bi_status);

	closure_put(&dio->cl);
}

static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
	struct dio_read *dio = bio->bi_private;
	bool should_dirty = dio->should_dirty;

	bch2_direct_IO_read_endio(bio);
	bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
{
	struct file *file = req->ki_filp;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts;
	struct dio_read *dio;
	struct bio *bio;
	loff_t offset = req->ki_pos;
	bool sync = is_sync_kiocb(req);
	size_t shorten;
	ssize_t ret;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	/* bios must be 512 byte aligned: */
	if ((offset|iter->count) & (SECTOR_SIZE - 1))
		return -EINVAL;

	ret = min_t(loff_t, iter->count,
		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
	if (!ret)
		return ret;

	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
	if (shorten >= iter->count)
		shorten = 0;
	iter->count -= shorten;

	bio = bio_alloc_bioset(NULL,
			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
			       REQ_OP_READ,
			       GFP_KERNEL,
			       &c->dio_read_bioset);

	bio->bi_end_io = bch2_direct_IO_read_endio;

	dio = container_of(bio, struct dio_read, rbio.bio);
	closure_init(&dio->cl, NULL);

	/*
	 * this is a _really_ horrible hack just to avoid an atomic sub at the
	 * end:
	 */
	if (!sync) {
		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
		atomic_set(&dio->cl.remaining,
			   CLOSURE_REMAINING_INITIALIZER -
			   CLOSURE_RUNNING +
			   CLOSURE_DESTRUCTOR);
	} else {
		atomic_set(&dio->cl.remaining,
			   CLOSURE_REMAINING_INITIALIZER + 1);
		dio->cl.closure_get_happened = true;
	}
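	/*
	 * Net effect: in the async case the final closure_put() runs
	 * bch2_dio_read_complete() as the closure's destructor; in the sync
	 * case the extra ref keeps the closure alive until we explicitly
	 * wait on it below - either way, no atomic sub at the end.
	 */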

	dio->req	= req;
	dio->ret	= ret;
	/*
	 * This is one of the sketchier things I've encountered: we have to skip
	 * the dirtying of requests that are internal from the kernel (i.e. from
	 * loopback), because we'll deadlock on page_lock.
	 */
	dio->should_dirty = iter_is_iovec(iter);

	goto start;
	while (iter->count) {
		bio = bio_alloc_bioset(NULL,
				       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
				       REQ_OP_READ,
				       GFP_KERNEL,
				       &c->bio_read);
		bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
		bio->bi_opf		= REQ_OP_READ|REQ_SYNC;
		bio->bi_iter.bi_sector	= offset >> 9;
		bio->bi_private		= dio;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (ret < 0) {
			/* XXX: fault inject this path */
			bio->bi_status = BLK_STS_RESOURCE;
			bio_endio(bio);
			break;
		}

		offset += bio->bi_iter.bi_size;

		if (dio->should_dirty)
			bio_set_pages_dirty(bio);

		if (iter->count)
			closure_get(&dio->cl);

		bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
	}
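
	/* restore the bytes we shortened off the iter above: */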
	iter->count += shorten;

	if (sync) {
		closure_sync(&dio->cl);
		closure_debug_destroy(&dio->cl);
		ret = dio->ret;
		bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
		return ret;
	} else {
		return -EIOCBQUEUED;
	}
}

ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct address_space *mapping = file->f_mapping;
	size_t count = iov_iter_count(iter);
	ssize_t ret = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct blk_plug plug;

		if (unlikely(mapping->nrpages)) {
			ret = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + count - 1);
			if (ret < 0)
				goto out;
		}

		file_accessed(file);

		blk_start_plug(&plug);
		ret = bch2_direct_IO_read(iocb, iter);
		blk_finish_plug(&plug);

		if (ret >= 0)
			iocb->ki_pos += ret;
	} else {
		bch2_pagecache_add_get(inode);
		ret = filemap_read(iocb, iter, ret);
		bch2_pagecache_add_put(inode);
	}
out:
	return bch2_err_class(ret);
}

/* O_DIRECT writes */

struct dio_write {
	struct kiocb			*req;
	struct address_space		*mapping;
	struct bch_inode_info		*inode;
	struct mm_struct		*mm;
	const struct iovec		*iov;
	unsigned			loop:1,
					extending:1,
					sync:1,
					flush:1;

	struct quota_res		quota_res;
	u64				written;

	struct iov_iter			iter;
	struct iovec			inline_vecs[2];

	/* must be last: */
	struct bch_write_op		op;
};
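
/*
 * Used on the ENOSPC path: if a disk reservation can't be had, the write
 * can still proceed as long as it's purely overwriting extents that are
 * already allocated with enough replicas and aren't compressed
 * (overwriting compressed data could grow on-disk usage):
 */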
static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
				       u64 offset, u64 size,
				       unsigned nr_replicas, bool compressed)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 end = offset + size;
	u32 snapshot;
	bool ret = true;
	int err;
retry:
	bch2_trans_begin(trans);

	err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (err)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot),
			   BTREE_ITER_slots, k, err) {
		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
			break;

		if (k.k->p.snapshot != snapshot ||
		    nr_replicas > bch2_bkey_replicas(c, k) ||
		    (!compressed && bch2_bkey_sectors_compressed(k))) {
			ret = false;
			break;
		}
	}

	offset = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
		goto retry;
	bch2_trans_put(trans);

	return err ? false : ret;
}

static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct bch_inode_info *inode = dio->inode;
	struct bio *bio = &dio->op.wbio.bio;

	return bch2_check_range_allocated(c, inode_inum(inode),
				dio->op.pos.offset, bio_sectors(bio),
				dio->op.opts.data_replicas,
				dio->op.opts.compression != 0);
}

static void bch2_dio_write_loop_async(struct bch_write_op *);
static __always_inline long bch2_dio_write_done(struct dio_write *dio);

/*
 * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
 * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
 * caller's stack, we're not guaranteed that it will live for the duration of
 * the IO:
 */
static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
{
	struct iovec *iov = dio->inline_vecs;

	/*
	 * iov_iter has a single embedded iovec - nothing to do:
	 */
	if (iter_is_ubuf(&dio->iter))
		return 0;

	/*
	 * We don't currently handle non-iovec iov_iters here - return an error,
	 * and we'll fall back to doing the IO synchronously:
	 */
	if (!iter_is_iovec(&dio->iter))
		return -1;

	if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
		dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
					       GFP_KERNEL);
		if (unlikely(!iov))
			return -ENOMEM;
	}

	memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
	dio->iter.__iov = iov;
	return 0;
}

static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
{
	closure_type(dio, struct dio_write, op.cl);
	struct bch_fs *c = dio->op.c;

	closure_debug_destroy(cl);

	dio->op.error = bch2_journal_error(&c->journal);

	bch2_dio_write_done(dio);
}
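
/*
 * For O_DSYNC writes: after the data write completes, flush the journal up
 * to the sequence number that covers the inode update, plus any pending
 * nocow writes, before signalling completion:
 */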
static noinline void bch2_dio_write_flush(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct bch_inode_unpacked inode;
	int ret;

	dio->flush = 0;

	closure_init(&dio->op.cl, NULL);

	if (!dio->op.error) {
		ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
		if (ret) {
			dio->op.error = ret;
		} else {
			bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
						     &dio->op.cl);
			bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
		}
	}

	if (dio->sync) {
		closure_sync(&dio->op.cl);
		closure_debug_destroy(&dio->op.cl);
	} else {
		continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
	}
}

static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct bch_inode_info *inode = dio->inode;
	bool sync = dio->sync;
	long ret;

	if (unlikely(dio->flush)) {
		bch2_dio_write_flush(dio);
		if (!sync)
			return -EIOCBQUEUED;
	}

	bch2_pagecache_block_put(inode);

	kfree(dio->iov);

	ret = dio->op.error ?: ((long) dio->written << 9);
	bio_put(&dio->op.wbio.bio);

	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);

	/* inode->i_dio_count is our ref on inode and thus bch_fs */
	inode_dio_end(&inode->v);

	if (ret < 0)
		ret = bch2_err_class(ret);

	if (!sync) {
		req->ki_complete(req, ret);
		ret = -EIOCBQUEUED;
	}
	return ret;
}

static __always_inline void bch2_dio_write_end(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct bch_inode_info *inode = dio->inode;
	struct bio *bio = &dio->op.wbio.bio;

	req->ki_pos	+= (u64) dio->op.written << 9;
	dio->written	+= dio->op.written;

	if (dio->extending) {
		spin_lock(&inode->v.i_lock);
		if (req->ki_pos > inode->v.i_size)
			i_size_write(&inode->v, req->ki_pos);
		spin_unlock(&inode->v.i_lock);
	}

	if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
		__bch2_quota_reservation_put(c, inode, &dio->quota_res);
		mutex_unlock(&inode->ei_quota_lock);
	}

	bio_release_pages(bio, false);

	if (unlikely(dio->op.error))
		set_bit(EI_INODE_ERROR, &inode->ei_flags);
}
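
/*
 * Main write path: each pass pulls as many pages as possible from the
 * iov_iter into the bio and submits it via bch2_write(). Synchronous
 * writes loop here until the iter is drained; asynchronous writes
 * continue from the write op's end_io callback instead:
 */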
static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct address_space *mapping = dio->mapping;
	struct bch_inode_info *inode = dio->inode;
	struct bch_io_opts opts;
	struct bio *bio = &dio->op.wbio.bio;
	unsigned unaligned, iter_count;
	bool sync = dio->sync, dropped_locks;
	long ret;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);
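
	/*
	 * While pulling user pages into the bio we may fault on the very
	 * mapping we're writing to; faults_disabled_mapping lets the fault
	 * path detect the recursion and drop the pagecache lock instead of
	 * deadlocking - see fdm_dropped_locks():
	 */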
	while (1) {
		iter_count = dio->iter.count;

		EBUG_ON(current->faults_disabled_mapping);
		current->faults_disabled_mapping = mapping;

		ret = bio_iov_iter_get_pages(bio, &dio->iter);

		dropped_locks = fdm_dropped_locks();

		current->faults_disabled_mapping = NULL;

		/*
		 * If the fault handler returned an error but also signalled
		 * that it dropped & retook ei_pagecache_lock, we just need to
		 * re-shoot down the page cache and retry:
		 */
		if (dropped_locks && ret)
			ret = 0;

		if (unlikely(ret < 0))
			goto err;

		if (unlikely(dropped_locks)) {
			ret = bch2_write_invalidate_inode_pages_range(mapping,
					req->ki_pos,
					req->ki_pos + iter_count - 1);
			if (unlikely(ret))
				goto err;

			if (!bio->bi_iter.bi_size)
				continue;
		}

		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
		bio->bi_iter.bi_size -= unaligned;
		iov_iter_revert(&dio->iter, unaligned);

		if (!bio->bi_iter.bi_size) {
			/*
			 * bio_iov_iter_get_pages was only able to get <
			 * blocksize worth of pages:
			 */
			ret = -EFAULT;
			goto err;
		}

		bch2_write_op_init(&dio->op, c, opts);
		dio->op.end_io		= sync
			? NULL
			: bch2_dio_write_loop_async;
		dio->op.target		= dio->op.opts.foreground_target;
		dio->op.write_point	= writepoint_hashed((unsigned long) current);
		dio->op.nr_replicas	= dio->op.opts.data_replicas;
		dio->op.subvol		= inode->ei_inum.subvol;
		dio->op.pos		= POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
		dio->op.devs_need_flush	= &inode->ei_devs_need_flush;

		if (sync)
			dio->op.flags |= BCH_WRITE_SYNC;
		dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;

		ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
						 bio_sectors(bio), true);
		if (unlikely(ret))
			goto err;

		ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
						dio->op.opts.data_replicas, 0);
		if (unlikely(ret) &&
		    !bch2_dio_write_check_allocated(dio))
			goto err;

		task_io_account_write(bio->bi_iter.bi_size);

		if (unlikely(dio->iter.count) &&
		    !dio->sync &&
		    !dio->loop &&
		    bch2_dio_write_copy_iov(dio))
			dio->sync = sync = true;

		dio->loop = true;
		closure_call(&dio->op.cl, bch2_write, NULL, NULL);

		if (!sync)
			return -EIOCBQUEUED;

		bch2_dio_write_end(dio);

		if (likely(!dio->iter.count) || dio->op.error)
			break;

		bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	}
out:
	return bch2_dio_write_done(dio);
err:
	dio->op.error = ret;

	bio_release_pages(bio, false);

	bch2_quota_reservation_put(c, inode, &dio->quota_res);
	goto out;
}

static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
{
	struct mm_struct *mm = dio->mm;

	bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);

	if (mm)
		kthread_use_mm(mm);
	bch2_dio_write_loop(dio);
	if (mm)
		kthread_unuse_mm(mm);
}

static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
	struct dio_write *dio = container_of(op, struct dio_write, op);

	bch2_dio_write_end(dio);

	if (likely(!dio->iter.count) || dio->op.error)
		bch2_dio_write_done(dio);
	else
		bch2_dio_write_continue(dio);
}
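
/*
 * Entry point for O_DIRECT writes: the inode lock is only held across the
 * initial checks and, for size-extending writes, for the duration of the
 * IO; non-extending writes drop it before doing any actual IO:
 */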
ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
{
	struct file *file = req->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct dio_write *dio;
	struct bio *bio;
	bool locked = true, extending;
	ssize_t ret;

	prefetch(&c->opts);
	prefetch((void *) &c->opts + 64);
	prefetch(&inode->ei_inode);
	prefetch((void *) &inode->ei_inode + 64);

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
		return -EROFS;

	inode_lock(&inode->v);

	ret = generic_write_checks(req, iter);
	if (unlikely(ret <= 0))
		goto err_put_write_ref;

	ret = file_remove_privs(file);
	if (unlikely(ret))
		goto err_put_write_ref;

	ret = file_update_time(file);
	if (unlikely(ret))
		goto err_put_write_ref;

	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
		ret = -EINVAL;
		goto err_put_write_ref;
	}

	inode_dio_begin(&inode->v);
	bch2_pagecache_block_get(inode);

	extending = req->ki_pos + iter->count > inode->v.i_size;
	if (!extending) {
		inode_unlock(&inode->v);
		locked = false;
	}

	bio = bio_alloc_bioset(NULL,
			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
			       GFP_KERNEL,
			       &c->dio_write_bioset);
	dio = container_of(bio, struct dio_write, op.wbio.bio);
	dio->req		= req;
	dio->mapping		= mapping;
	dio->inode		= inode;
	dio->mm			= current->mm;
	dio->iov		= NULL;
	dio->loop		= false;
	dio->extending		= extending;
	dio->sync		= is_sync_kiocb(req) || extending;
	dio->flush		= iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
	dio->quota_res.sectors	= 0;
	dio->written		= 0;
	dio->iter		= *iter;
	dio->op.c		= c;

	if (unlikely(mapping->nrpages)) {
		ret = bch2_write_invalidate_inode_pages_range(mapping,
						req->ki_pos,
						req->ki_pos + iter->count - 1);
		if (unlikely(ret))
			goto err_put_bio;
	}

	ret = bch2_dio_write_loop(dio);
out:
	if (locked)
		inode_unlock(&inode->v);
	return ret;
err_put_bio:
	bch2_pagecache_block_put(inode);
	bio_put(bio);
	inode_dio_end(&inode->v);
err_put_write_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
	goto out;
}

void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
{
	bioset_exit(&c->dio_write_bioset);
	bioset_exit(&c->dio_read_bioset);
}

int bch2_fs_fs_io_direct_init(struct bch_fs *c)
{
	if (bioset_init(&c->dio_read_bioset,
			4, offsetof(struct dio_read, rbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_dio_read_bioset_init;

	if (bioset_init(&c->dio_write_bioset,
			4, offsetof(struct dio_write, op.wbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_dio_write_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */