// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "io_read.h"
#include "io_write.h"

#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/prefetch.h>
#include <linux/task_io_accounting_ops.h>

/* O_DIRECT reads */

struct dio_read {
	struct closure		cl;
	struct kiocb		*req;
	long			ret;
	bool			should_dirty;
	struct bch_read_bio	rbio;
};
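
/*
 * Release the pages pinned for a direct read: with check_dirty set (we
 * dirtied them for a read into userspace buffers), bio_check_pages_dirty()
 * re-dirties any pages that raced with writeback and drops the page and bio
 * references for us; otherwise we drop them directly.
 */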
static void bio_check_or_release(struct bio *bio, bool check_dirty)
{
	if (check_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

static CLOSURE_CALLBACK(bch2_dio_read_complete)
{
	closure_type(dio, struct dio_read, cl);

	dio->req->ki_complete(dio->req, dio->ret);
	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}

static void bch2_direct_IO_read_endio(struct bio *bio)
{
	struct dio_read *dio = bio->bi_private;

	if (bio->bi_status)
		dio->ret = blk_status_to_errno(bio->bi_status);

	closure_put(&dio->cl);
}
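
/*
 * Completion path for the bios a larger direct read was split into. Note that
 * should_dirty is sampled before bch2_direct_IO_read_endio(): the closure_put
 * in there may complete the request and free the dio.
 */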
static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
	struct dio_read *dio = bio->bi_private;
	bool should_dirty = dio->should_dirty;

	bch2_direct_IO_read_endio(bio);
	bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
{
	struct file *file = req->ki_filp;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts;
	struct dio_read *dio;
	struct bio *bio;
	struct blk_plug plug;
	loff_t offset = req->ki_pos;
	bool sync = is_sync_kiocb(req);
	size_t shorten;
	ssize_t ret;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	/* bios must be 512 byte aligned: */
	if ((offset|iter->count) & (SECTOR_SIZE - 1))
		return -EINVAL;

	ret = min_t(loff_t, iter->count,
		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));

	if (!ret)
		return ret;

	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
	if (shorten >= iter->count)
		shorten = 0;
	iter->count -= shorten;

	bio = bio_alloc_bioset(NULL,
			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
			       REQ_OP_READ,
			       GFP_KERNEL,
			       &c->dio_read_bioset);

	bio->bi_end_io = bch2_direct_IO_read_endio;

	dio = container_of(bio, struct dio_read, rbio.bio);
	closure_init(&dio->cl, NULL);

	/*
	 * this is a _really_ horrible hack just to avoid an atomic sub at the
	 * end:
	 */
	if (!sync) {
		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
		atomic_set(&dio->cl.remaining,
			   CLOSURE_REMAINING_INITIALIZER -
			   CLOSURE_RUNNING +
			   CLOSURE_DESTRUCTOR);
	} else {
		atomic_set(&dio->cl.remaining,
			   CLOSURE_REMAINING_INITIALIZER + 1);
		dio->cl.closure_get_happened = true;
	}

	dio->req	= req;
	dio->ret	= ret;
	/*
	 * This is one of the sketchier things I've encountered: we have to skip
	 * the dirtying of requests that are internal from the kernel (i.e. from
	 * loopback), because we'll deadlock on page_lock.
	 */
	dio->should_dirty = iter_is_iovec(iter);

	blk_start_plug(&plug);

	goto start;
	while (iter->count) {
		bio = bio_alloc_bioset(NULL,
				       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
				       REQ_OP_READ,
				       GFP_KERNEL,
				       &c->bio_read);
		bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
		bio->bi_opf		= REQ_OP_READ|REQ_SYNC;
		bio->bi_iter.bi_sector	= offset >> 9;
		bio->bi_private		= dio;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (ret < 0) {
			/* XXX: fault inject this path */
			bio->bi_status = BLK_STS_RESOURCE;
			bio_endio(bio);
			break;
		}

		offset += bio->bi_iter.bi_size;

		if (dio->should_dirty)
			bio_set_pages_dirty(bio);

		if (iter->count)
			closure_get(&dio->cl);

		bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
	}

	blk_finish_plug(&plug);

	iter->count += shorten;

	if (sync) {
		closure_sync(&dio->cl);
		closure_debug_destroy(&dio->cl);
		ret = dio->ret;
		bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
		return ret;
	} else {
		return -EIOCBQUEUED;
	}
}
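
/*
 * Read entry point: O_DIRECT requests flush the page cache and bypass it;
 * buffered reads go through filemap_read() under the pagecache add lock,
 * staying out of the way of operations that block page cache additions, like
 * the O_DIRECT write path below.
 */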
ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct address_space *mapping = file->f_mapping;
	size_t count = iov_iter_count(iter);
	ssize_t ret = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct blk_plug plug;

		if (unlikely(mapping->nrpages)) {
			ret = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + count - 1);
			if (ret < 0)
				goto out;
		}

		file_accessed(file);

		blk_start_plug(&plug);
		ret = bch2_direct_IO_read(iocb, iter);
		blk_finish_plug(&plug);

		if (ret >= 0)
			iocb->ki_pos += ret;
	} else {
		bch2_pagecache_add_get(inode);
		ret = filemap_read(iocb, iter, ret);
		bch2_pagecache_add_put(inode);
	}
out:
	return bch2_err_class(ret);
}

/* O_DIRECT writes */

struct dio_write {
	struct kiocb		*req;
	struct address_space	*mapping;
	struct bch_inode_info	*inode;
	struct mm_struct	*mm;
	const struct iovec	*iov;
	unsigned		loop:1,
				extending:1,
				sync:1,
				flush:1;
	struct quota_res	quota_res;
	u64			written;

	struct iov_iter		iter;
	struct iovec		inline_vecs[2];

	/* must be last: */
	struct bch_write_op	op;
};
static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
				       u64 offset, u64 size,
				       unsigned nr_replicas, bool compressed)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 end = offset + size;
	u32 snapshot;
	bool ret = true;
	int err;
retry:
	bch2_trans_begin(trans);

	err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (err)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot),
			   BTREE_ITER_slots, k, err) {
		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
			break;

		if (k.k->p.snapshot != snapshot ||
		    nr_replicas > bch2_bkey_replicas(c, k) ||
		    (!compressed && bch2_bkey_sectors_compressed(k))) {
			ret = false;
			break;
		}
	}

	offset = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(err, BCH_ERR_transaction_restart))
		goto retry;
	bch2_trans_put(trans);

	return err ? false : ret;
}

static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct bch_inode_info *inode = dio->inode;
	struct bio *bio = &dio->op.wbio.bio;

	return bch2_check_range_allocated(c, inode_inum(inode),
				dio->op.pos.offset, bio_sectors(bio),
				dio->op.opts.data_replicas,
				dio->op.opts.compression != 0);
}

static void bch2_dio_write_loop_async(struct bch_write_op *);
static __always_inline long bch2_dio_write_done(struct dio_write *dio);

/*
 * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
 * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
 * caller's stack, we're not guaranteed that it will live for the duration of
 * the IO:
 */
static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
{
	struct iovec *iov = dio->inline_vecs;

	/*
	 * iov_iter has a single embedded iovec - nothing to do:
	 */
	if (iter_is_ubuf(&dio->iter))
		return 0;

	/*
	 * We don't currently handle non-iovec iov_iters here - return an error,
	 * and we'll fall back to doing the IO synchronously:
	 */
	if (!iter_is_iovec(&dio->iter))
		return -1;

	if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
		dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
					       GFP_KERNEL);
		if (unlikely(!iov))
			return -ENOMEM;
	}

	memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
	dio->iter.__iov = iov;
	return 0;
}
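
/*
 * O_DSYNC handling: after the data write completes, flush the journal up to
 * the inode's journal sequence number (plus, for nocow writes, any device
 * write caches) before declaring the write done.
 */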
static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
{
	closure_type(dio, struct dio_write, op.cl);
	struct bch_fs *c = dio->op.c;

	closure_debug_destroy(cl);

	dio->op.error = bch2_journal_error(&c->journal);

	bch2_dio_write_done(dio);
}

static noinline void bch2_dio_write_flush(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct bch_inode_unpacked inode;
	int ret;

	dio->flush = 0;

	closure_init(&dio->op.cl, NULL);

	if (!dio->op.error) {
		ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
		if (ret) {
			dio->op.error = ret;
		} else {
			bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
						     &dio->op.cl);
			bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
		}
	}

	if (dio->sync) {
		closure_sync(&dio->op.cl);
		closure_debug_destroy(&dio->op.cl);
	} else {
		continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
	}
}
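
/*
 * Common completion path for sync and async writes: if an O_DSYNC flush is
 * still pending, kick it off first (async kiocbs then finish later via
 * bch2_dio_write_flush_done()). Returns bytes written or an error for sync
 * requests, -EIOCBQUEUED after calling ->ki_complete() otherwise.
 */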
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct bch_inode_info *inode = dio->inode;
	bool sync = dio->sync;
	long ret;

	if (unlikely(dio->flush)) {
		bch2_dio_write_flush(dio);
		if (!sync)
			return -EIOCBQUEUED;
	}

	bch2_pagecache_block_put(inode);

	kfree(dio->iov);

	ret = dio->op.error ?: ((long) dio->written << 9);
	bio_put(&dio->op.wbio.bio);

	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);

	/* inode->i_dio_count is our ref on inode and thus bch_fs */
	inode_dio_end(&inode->v);

	if (ret < 0)
		ret = bch2_err_class(ret);

	if (!sync) {
		req->ki_complete(req, ret);
		ret = -EIOCBQUEUED;
	}
	return ret;
}
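
/*
 * Bookkeeping after bch2_write() finishes one chunk of the request: advance
 * ki_pos and the running byte count, update i_size for extending writes, and
 * settle the quota reservation against what the write actually consumed.
 */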
static __always_inline void bch2_dio_write_end(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct bch_inode_info *inode = dio->inode;
	struct bio *bio = &dio->op.wbio.bio;

	req->ki_pos	+= (u64) dio->op.written << 9;
	dio->written	+= dio->op.written;

	if (dio->extending) {
		spin_lock(&inode->v.i_lock);
		if (req->ki_pos > inode->v.i_size)
			i_size_write(&inode->v, req->ki_pos);
		spin_unlock(&inode->v.i_lock);
	}

	if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
		__bch2_quota_reservation_put(c, inode, &dio->quota_res);
		mutex_unlock(&inode->ei_quota_lock);
	}

	bio_release_pages(bio, false);

	if (unlikely(dio->op.error))
		set_bit(EI_INODE_ERROR, &inode->ei_flags);
}
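
/*
 * Main write loop. The faults_disabled_mapping dance around
 * bio_iov_iter_get_pages() avoids a deadlock: if userspace hands us a buffer
 * that's mmapped from the file being written, faulting it in could recurse
 * onto the pagecache lock we already hold, so our fault handler is instead
 * told to drop locks and signal us via fdm_dropped_locks().
 */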
static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
{
	struct bch_fs *c = dio->op.c;
	struct kiocb *req = dio->req;
	struct address_space *mapping = dio->mapping;
	struct bch_inode_info *inode = dio->inode;
	struct bch_io_opts opts;
	struct bio *bio = &dio->op.wbio.bio;
	unsigned unaligned, iter_count;
	bool sync = dio->sync, dropped_locks;
	long ret;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	while (1) {
		iter_count = dio->iter.count;

		EBUG_ON(current->faults_disabled_mapping);
		current->faults_disabled_mapping = mapping;

		ret = bio_iov_iter_get_pages(bio, &dio->iter);

		dropped_locks = fdm_dropped_locks();

		current->faults_disabled_mapping = NULL;

		/*
		 * If the fault handler returned an error but also signalled
		 * that it dropped & retook ei_pagecache_lock, we just need to
		 * re-shoot down the page cache and retry:
		 */
		if (dropped_locks && ret)
			ret = 0;

		if (unlikely(ret < 0))
			goto err;

		if (unlikely(dropped_locks)) {
			ret = bch2_write_invalidate_inode_pages_range(mapping,
					req->ki_pos,
					req->ki_pos + iter_count - 1);
			if (unlikely(ret))
				goto err;

			if (!bio->bi_iter.bi_size)
				continue;
		}

		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
		bio->bi_iter.bi_size -= unaligned;
		iov_iter_revert(&dio->iter, unaligned);

		if (!bio->bi_iter.bi_size) {
			/*
			 * bio_iov_iter_get_pages was only able to get <
			 * blocksize worth of pages:
			 */
			ret = -EFAULT;
			goto err;
		}

		bch2_write_op_init(&dio->op, c, opts);
		dio->op.end_io		= sync
			? NULL
			: bch2_dio_write_loop_async;
		dio->op.target		= dio->op.opts.foreground_target;
		dio->op.write_point	= writepoint_hashed((unsigned long) current);
		dio->op.nr_replicas	= dio->op.opts.data_replicas;
		dio->op.subvol		= inode->ei_inum.subvol;
		dio->op.pos		= POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
		dio->op.devs_need_flush	= &inode->ei_devs_need_flush;

		if (sync)
			dio->op.flags |= BCH_WRITE_SYNC;
		dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;

		ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
						 bio_sectors(bio), true);
		if (unlikely(ret))
			goto err;

		ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
						dio->op.opts.data_replicas, 0);
		if (unlikely(ret) &&
		    !bch2_dio_write_check_allocated(dio))
			goto err;

		task_io_account_write(bio->bi_iter.bi_size);

		if (unlikely(dio->iter.count) &&
		    !dio->sync &&
		    !dio->loop &&
		    bch2_dio_write_copy_iov(dio))
			dio->sync = sync = true;

		dio->loop = true;
		closure_call(&dio->op.cl, bch2_write, NULL, NULL);

		if (!sync)
			return -EIOCBQUEUED;

		bch2_dio_write_end(dio);

		if (likely(!dio->iter.count) || dio->op.error)
			break;

		bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
	}
out:
	return bch2_dio_write_done(dio);
err:
	dio->op.error = ret;

	bio_release_pages(bio, false);

	bch2_quota_reservation_put(c, inode, &dio->quota_res);
	goto out;
}
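
/*
 * Resume an async write from bch2_write()'s completion context, possibly a
 * kworker: if the submitter had a userspace mm, borrow it with
 * kthread_use_mm() so the next bio_iov_iter_get_pages() call can pin the
 * caller's pages.
 */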
static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
{
	struct mm_struct *mm = dio->mm;

	bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);

	if (mm)
		kthread_use_mm(mm);
	bch2_dio_write_loop(dio);
	if (mm)
		kthread_unuse_mm(mm);
}

static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
	struct dio_write *dio = container_of(op, struct dio_write, op);

	bch2_dio_write_end(dio);

	if (likely(!dio->iter.count) || dio->op.error)
		bch2_dio_write_done(dio);
	else
		bch2_dio_write_continue(dio);
}
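
/*
 * O_DIRECT write entry point. Note the locking: i_rwsem is only held across
 * submission for size-extending writes; for writes entirely within i_size it
 * is dropped early, with inode_dio_begin() and bch2_pagecache_block_get()
 * keeping the inode pinned and page cache additions blocked for the duration.
 */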
ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
{
	struct file *file = req->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct dio_write *dio;
	struct bio *bio;
	bool locked = true, extending;
	ssize_t ret;

	prefetch(&c->opts);
	prefetch((void *) &c->opts + 64);
	prefetch(&inode->ei_inode);
	prefetch((void *) &inode->ei_inode + 64);

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
		return -EROFS;

	inode_lock(&inode->v);

	ret = generic_write_checks(req, iter);
	if (unlikely(ret <= 0))
		goto err_put_write_ref;

	ret = file_remove_privs(file);
	if (unlikely(ret))
		goto err_put_write_ref;

	ret = file_update_time(file);
	if (unlikely(ret))
		goto err_put_write_ref;

	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
		ret = -EINVAL;
		goto err_put_write_ref;
	}

	inode_dio_begin(&inode->v);
	bch2_pagecache_block_get(inode);

	extending = req->ki_pos + iter->count > inode->v.i_size;
	if (!extending) {
		inode_unlock(&inode->v);
		locked = false;
	}

	bio = bio_alloc_bioset(NULL,
			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
			       GFP_KERNEL,
			       &c->dio_write_bioset);
	dio = container_of(bio, struct dio_write, op.wbio.bio);
	dio->req		= req;
	dio->mapping		= mapping;
	dio->inode		= inode;
	dio->mm			= current->mm;
	dio->iov		= NULL;
	dio->loop		= false;
	dio->extending		= extending;
	dio->sync		= is_sync_kiocb(req) || extending;
	dio->flush		= iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
	dio->quota_res.sectors	= 0;
	dio->written		= 0;
	dio->iter		= *iter;
	dio->op.c		= c;

	if (unlikely(mapping->nrpages)) {
		ret = bch2_write_invalidate_inode_pages_range(mapping,
					req->ki_pos,
					req->ki_pos + iter->count - 1);
		if (unlikely(ret))
			goto err_put_bio;
	}

	ret = bch2_dio_write_loop(dio);
out:
	if (locked)
		inode_unlock(&inode->v);
	return ret;
err_put_bio:
	bch2_pagecache_block_put(inode);
	bio_put(bio);
	inode_dio_end(&inode->v);
err_put_write_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
	goto out;
}
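
/*
 * The biosets allocate struct dio_read/dio_write as front padding of their
 * embedded bios (offsetof(..., rbio.bio) and offsetof(..., op.wbio.bio) are
 * the front_pad arguments), which is why those members must come last in
 * their structs.
 */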
void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
{
	bioset_exit(&c->dio_write_bioset);
	bioset_exit(&c->dio_read_bioset);
}

int bch2_fs_fs_io_direct_init(struct bch_fs *c)
{
	if (bioset_init(&c->dio_read_bioset,
			4, offsetof(struct dio_read, rbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_dio_read_bioset_init;

	if (bioset_init(&c->dio_write_bioset,
			4, offsetof(struct dio_write, op.wbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_dio_write_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */