// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "fs.h"
#include "io_read.h"
#include "io_write.h"

#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
		return true;
	return false;
}
static void bch2_readpages_end_io(struct bio *bio)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio)
		folio_end_read(fi.folio, bio->bi_status == BLK_STS_OK);

	bio_put(bio);
}
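
/*
 * readpage(s): a readpages_iter walks the folios in the readahead window,
 * pulled out of the readahead_control up front by readpages_iter_init() so
 * that btree locks never have to be held while waiting on folio allocation.
 */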
struct readpages_iter {
	struct address_space	*mapping;
	folios			folios;
	unsigned		idx;
};
static int readpages_iter_init(struct readpages_iter *iter,
			       struct readahead_control *ractl)
{
	struct folio *folio;

	*iter = (struct readpages_iter) { ractl->mapping };

	while ((folio = __readahead_folio(ractl))) {
		if (!bch2_folio_create(folio, GFP_KERNEL) ||
		    darray_push(&iter->folios, folio)) {
			bch2_folio_release(folio);
			ractl->_nr_pages += folio_nr_pages(folio);
			ractl->_index -= folio_nr_pages(folio);
			return iter->folios.nr ? 0 : -ENOMEM;
		}

		folio_put(folio);
	}

	return 0;
}
static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
{
	if (iter->idx >= iter->folios.nr)
		return NULL;
	return iter->folios.data[iter->idx];
}
static inline void readpage_iter_advance(struct readpages_iter *iter)
{
	iter->idx++;
}
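
/*
 * Reading part of a checksummed or compressed extent still means reading (and
 * checksumming/decompressing) the whole extent, so such extents are worth
 * reading in full even when the readahead window doesn't cover them.
 */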
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc.csum_type || crc.compression_type)
			return true;
	return false;
}
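
/*
 * Extend the readahead bio to cover the current extent: first with folios
 * already pinned by the readpages_iter, then - when @get_more says partial
 * reads of this extent would be expensive - by allocating and inserting new
 * folios past the end of the readahead window.
 */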
static int readpage_bio_extend(struct btree_trans *trans,
			       struct readpages_iter *iter,
			       struct bio *bio,
			       unsigned sectors_this_extent,
			       bool get_more)
{
	/* Don't hold btree locks while allocating memory: */
	bch2_trans_unlock(trans);

	while (bio_sectors(bio) < sectors_this_extent &&
	       bio->bi_vcnt < bio->bi_max_vecs) {
		struct folio *folio = readpage_iter_peek(iter);
		int ret;

		if (folio) {
			readpage_iter_advance(iter);
		} else {
			pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;

			if (!get_more)
				break;

			folio = xa_load(&iter->mapping->i_pages, folio_offset);
			if (folio && !xa_is_value(folio))
				break;

			folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
			if (!folio)
				break;

			if (!__bch2_folio_create(folio, GFP_KERNEL)) {
				folio_put(folio);
				break;
			}

			ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
			if (ret) {
				__bch2_folio_release(folio);
				folio_put(folio);
				break;
			}

			folio_put(folio);
		}

		BUG_ON(folio_sector(folio) != bio_end_sector(bio));

		BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
	}

	return bch2_trans_relock(trans);
}
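
/*
 * Core buffered read loop: walk the extents btree from the bio's start
 * sector, clamp the bio to each extent boundary, and issue the read via
 * bch2_read_extent(), restarting from the top on transaction restart.
 */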
static void bchfs_read(struct btree_trans *trans,
		       struct bch_read_bio *rbio,
		       subvol_inum inum,
		       struct readpages_iter *readpages_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	int flags = BCH_READ_RETRY_IF_STALE|
		BCH_READ_MAY_PROMOTE;
	int ret = 0;

	rbio->c = c;
	rbio->start_time = local_clock();
	rbio->subvol = inum.subvol;

	bch2_bkey_buf_init(&sk);
	bch2_trans_begin(trans);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, rbio->bio.bi_iter.bi_sector),
			     BTREE_ITER_slots);
	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;
		u32 snapshot;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, rbio->bio.bi_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		sectors = min(sectors, k.k->size - offset_into_extent);

		if (readpages_iter) {
			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
						  extent_partial_reads_expensive(k));
			if (ret)
				goto err;
		}

		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
		swap(rbio->bio.bi_iter.bi_size, bytes);

		if (rbio->bio.bi_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		bch2_bio_page_state_set(&rbio->bio, k);

		bch2_read_extent(trans, rbio, iter.pos,
				 data_btree, k, offset_into_extent, flags);

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(rbio->bio.bi_iter.bi_size, bytes);
		bio_advance(&rbio->bio, bytes);
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
				iter.pos.inode,
				iter.pos.offset << 9,
				"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bio_endio(&rbio->bio);
	}

	bch2_bkey_buf_exit(&sk, c);
}
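
/*
 * ->readahead: each iteration starts a new read bio at the next unprocessed
 * folio; bchfs_read() then pulls further folios from the readpages_iter as it
 * walks extents, so one bio may end up covering many folios.
 */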
void bch2_readahead(struct readahead_control *ractl)
{
	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts;
	struct folio *folio;
	struct readpages_iter readpages_iter;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	int ret = readpages_iter_init(&readpages_iter, ractl);
	if (ret)
		return;

	bch2_pagecache_add_get(inode);

	struct btree_trans *trans = bch2_trans_get(c);
	while ((folio = readpage_iter_peek(&readpages_iter))) {
		unsigned n = min_t(unsigned,
				   readpages_iter.folios.nr -
				   readpages_iter.idx,
				   BIO_MAX_VECS);
		struct bch_read_bio *rbio =
			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
						   GFP_KERNEL, &c->bio_read),
				  opts);

		readpage_iter_advance(&readpages_iter);

		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
		rbio->bio.bi_end_io = bch2_readpages_end_io;
		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

		bchfs_read(trans, rbio, inode_inum(inode),
			   &readpages_iter);
		bch2_trans_unlock(trans);
	}
	bch2_trans_put(trans);

	bch2_pagecache_add_put(inode);

	darray_exit(&readpages_iter.folios);
}
static void bch2_read_single_folio_end_io(struct bio *bio)
{
	complete(bio->bi_private);
}
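
/*
 * Synchronously read a single folio, e.g. for a buffered write that doesn't
 * overwrite the whole folio: issues one read bio and waits for it with an
 * on-stack completion.
 */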
int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_read_bio *rbio;
	struct bch_io_opts opts;
	int ret;
	DECLARE_COMPLETION_ONSTACK(done);

	if (!bch2_folio_create(folio, GFP_KERNEL))
		return -ENOMEM;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
			 opts);
	rbio->bio.bi_private = &done;
	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
	wait_for_completion(&done);

	ret = blk_status_to_errno(rbio->bio.bi_status);
	bio_put(&rbio->bio);

	if (ret < 0)
		return ret;

	folio_mark_uptodate(folio);
	return 0;
}
int bch2_read_folio(struct file *file, struct folio *folio)
{
	int ret;

	ret = bch2_read_single_folio(folio, folio->mapping);
	folio_unlock(folio);
	return bch2_err_class(ret);
}
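
/* writepages: */

/*
 * Writeback state: runs of dirty sectors are packed into bch_writepage_io
 * units (a bch_write_op plus its bio); w->tmp holds a snapshot of per-sector
 * state, copied before the folio is unlocked.
 */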
struct bch_writepage_io {
	struct bch_inode_info		*inode;

	/* must be last: */
	struct bch_write_op		op;
};
struct bch_writepage_state {
	struct bch_writepage_io	*io;
	struct bch_io_opts	opts;
	struct bch_folio_sector	*tmp;
	unsigned		tmp_sectors;
};
static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
								  struct bch_inode_info *inode)
{
	struct bch_writepage_state ret = { 0 };

	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
	return ret;
}
/*
 * Determine when a writepage io is full. We have to limit writepage bios to a
 * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
 * what the bounce path in bch2_write_extent() can handle. In theory we could
 * loosen this restriction for non-bounce I/O, but we don't have that context
 * here. Ideally, we can up this limit and make it configurable in the future
 * when the bounce path can be enhanced to accommodate larger source bios.
 */
static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
{
	struct bio *bio = &io->op.wbio.bio;

	return bio_full(bio, len) ||
		(bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
}
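
/*
 * Write completion: on error, clear per-sector replica counts and flag the
 * inode; then account i_sectors_delta and end folio writeback once the last
 * outstanding write to each folio completes.
 */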
static void bch2_writepage_io_done(struct bch_write_op *op)
{
	struct bch_writepage_io *io =
		container_of(op, struct bch_writepage_io, op);
	struct bch_fs *c = io->op.c;
	struct bio *bio = &io->op.wbio.bio;
	struct folio_iter fi;
	unsigned i;

	if (io->op.error) {
		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);

		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			mapping_set_error(fi.folio->mapping, -EIO);

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	/*
	 * racing with fallocate can cause us to add fewer sectors than
	 * expected - but we shouldn't add more sectors than expected:
	 */
	WARN_ON_ONCE(io->op.i_sectors_delta > 0);

	/*
	 * (error (due to going RO) halfway through a page can screw that up
	 * slightly)
	 * XXX wtf?
	   BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
	 */

	/*
	 * The writeback flag is effectively our ref on the inode -
	 * fixup i_blocks before calling folio_end_writeback:
	 */
	bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);

	bio_for_each_folio_all(fi, bio) {
		struct bch_folio *s = __bch2_folio(fi.folio);

		if (atomic_dec_and_test(&s->write_count))
			folio_end_writeback(fi.folio);
	}

	bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
	struct bch_writepage_io *io = w->io;

	w->io = NULL;
	closure_call(&io->op.cl, bch2_write, NULL, NULL);
}
/*
 * Get a bch_writepage_io and add @page to it - appending to an existing one if
 * possible, else allocating a new one:
 */
static void bch2_writepage_io_alloc(struct bch_fs *c,
				    struct writeback_control *wbc,
				    struct bch_writepage_state *w,
				    struct bch_inode_info *inode,
				    u64 sector,
				    unsigned nr_replicas)
{
	struct bch_write_op *op;

	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
					      REQ_OP_WRITE,
					      GFP_KERNEL,
					      &c->writepage_bioset),
			     struct bch_writepage_io, op.wbio.bio);

	w->io->inode		= inode;
	op			= &w->io->op;
	bch2_write_op_init(op, c, w->opts);
	op->target		= w->opts.foreground_target;
	op->nr_replicas		= nr_replicas;
	op->res.nr_replicas	= nr_replicas;
	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
	op->subvol		= inode->ei_inum.subvol;
	op->pos			= POS(inode->v.i_ino, sector);
	op->end_io		= bch2_writepage_io_done;
	op->devs_need_flush	= &inode->ei_devs_need_flush;
	op->wbio.bio.bi_iter.bi_sector = sector;
	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
}
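
/*
 * Write out a single folio: snapshot per-sector reservation state into
 * w->tmp, mark the dirty sectors allocated, then carve the folio into runs of
 * contiguous dirty sectors and append each run to the current (or a freshly
 * allocated) bch_writepage_io.
 */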
static int __bch2_writepage(struct folio *folio,
			    struct writeback_control *wbc,
			    void *data)
{
	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_writepage_state *w = data;
	struct bch_folio *s;
	unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
	loff_t i_size = i_size_read(&inode->v);
	int ret;

	EBUG_ON(!folio_test_uptodate(folio));

	/* Is the folio fully inside i_size? */
	if (folio_end_pos(folio) <= i_size)
		goto do_io;

	/* Is the folio fully outside i_size? (truncate in progress) */
	if (folio_pos(folio) >= i_size) {
		folio_unlock(folio);
		return 0;
	}

	/*
	 * The folio straddles i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the folio size. For a file that is not a multiple of
	 * the folio size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	folio_zero_segment(folio,
			   i_size - folio_pos(folio),
			   folio_size(folio));
do_io:
	f_sectors = folio_sectors(folio);
	s = bch2_folio(folio);

	if (f_sectors > w->tmp_sectors) {
		kfree(w->tmp);
		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
		w->tmp_sectors = f_sectors;
	}

	/*
	 * Things get really hairy with errors during writeback:
	 */
	ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
	BUG_ON(ret);

	/* Before unlocking the page, get copy of reservations: */
	spin_lock(&s->lock);
	memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);

	for (i = 0; i < f_sectors; i++) {
		if (s->s[i].state < SECTOR_dirty)
			continue;

		nr_replicas_this_write =
			min_t(unsigned, nr_replicas_this_write,
			      s->s[i].nr_replicas +
			      s->s[i].replicas_reserved);
	}

	for (i = 0; i < f_sectors; i++) {
		if (s->s[i].state < SECTOR_dirty)
			continue;

		s->s[i].nr_replicas = w->opts.compression
			? 0 : nr_replicas_this_write;

		s->s[i].replicas_reserved = 0;
		bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
	}
	spin_unlock(&s->lock);

	BUG_ON(atomic_read(&s->write_count));
	atomic_set(&s->write_count, 1);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	folio_unlock(folio);

	offset = 0;
	while (1) {
		unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
		u64 sector;

		while (offset < f_sectors &&
		       w->tmp[offset].state < SECTOR_dirty)
			offset++;

		if (offset == f_sectors)
			break;

		while (offset + sectors < f_sectors &&
		       w->tmp[offset + sectors].state >= SECTOR_dirty) {
			reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
			dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
			sectors++;
		}
		BUG_ON(!sectors);

		sector = folio_sector(folio) + offset;

		if (w->io &&
		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
		     bch_io_full(w->io, sectors << 9) ||
		     bio_end_sector(&w->io->op.wbio.bio) != sector))
			bch2_writepage_do_io(w);

		if (!w->io)
			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
						nr_replicas_this_write);

		atomic_inc(&s->write_count);

		BUG_ON(inode != w->io->inode);
		BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
				     sectors << 9, offset << 9));

		/* Check for writing past i_size: */
		WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
			  round_up(i_size, block_bytes(c)) &&
			  !test_bit(BCH_FS_emergency_ro, &c->flags),
			  "writing past i_size: %llu > %llu (unrounded %llu)\n",
			  bio_end_sector(&w->io->op.wbio.bio) << 9,
			  round_up(i_size, block_bytes(c)),
			  i_size);

		w->io->op.res.sectors += reserved_sectors;
		w->io->op.i_sectors_delta -= dirty_sectors;
		w->io->op.new_i_size = i_size;

		offset += sectors;
	}

	if (atomic_dec_and_test(&s->write_count))
		folio_end_writeback(folio);

	return 0;
}
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
	struct bch_writepage_state w =
		bch_writepage_state_init(c, to_bch_ei(mapping->host));
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
	if (w.io)
		bch2_writepage_do_io(&w);
	blk_finish_plug(&plug);
	kfree(w.tmp);
	return bch2_err_class(ret);
}
/* buffered writes: */
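
/*
 * ->write_begin/->write_end: the disk reservation taken here is handed to
 * ->write_end via *fsdata; a folio that is only partially written and not
 * uptodate must be read in first (or zeroed, when it lies entirely past
 * i_size).
 */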
int bch2_write_begin(struct file *file, struct address_space *mapping,
		     loff_t pos, unsigned len,
		     struct folio **foliop, void **fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation *res;
	struct folio *folio;
	unsigned offset;
	int ret = -ENOMEM;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return -ENOMEM;

	bch2_folio_reservation_init(c, inode, res);
	*fsdata = res;

	bch2_pagecache_add_get(inode);

	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				    FGP_WRITEBEGIN | fgf_set_order(len),
				    mapping_gfp_mask(mapping));
	if (IS_ERR_OR_NULL(folio))
		goto err_unlock;

	offset = pos - folio_pos(folio);
	len = min_t(size_t, len, folio_end_pos(folio) - pos);

	if (folio_test_uptodate(folio))
		goto out;

	/* If we're writing entire folio, don't need to read it in first: */
	if (!offset && len == folio_size(folio))
		goto out;

	if (!offset && pos + len >= inode->v.i_size) {
		folio_zero_segment(folio, len, folio_size(folio));
		flush_dcache_folio(folio);
		goto out;
	}

	if (folio_pos(folio) >= inode->v.i_size) {
		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
		flush_dcache_folio(folio);
		goto out;
	}
readpage:
	ret = bch2_read_single_folio(folio, mapping);
	if (ret)
		goto err;
out:
	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
	if (ret)
		goto err;

	ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
	if (ret) {
		if (!folio_test_uptodate(folio)) {
			/*
			 * If the folio hasn't been read in, we won't know if we
			 * actually need a reservation - we don't actually need
			 * to read here, we just need to check if the folio is
			 * fully backed by uncompressed data:
			 */
			goto readpage;
		}

		goto err;
	}

	*foliop = folio;
	return 0;
err:
	folio_unlock(folio);
	folio_put(folio);
err_unlock:
	bch2_pagecache_add_put(inode);
	kfree(res);
	*fsdata = NULL;
	return bch2_err_class(ret);
}
int bch2_write_end(struct file *file, struct address_space *mapping,
		   loff_t pos, unsigned len, unsigned copied,
		   struct folio *folio, void *fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation *res = fsdata;
	unsigned offset = pos - folio_pos(folio);

	lockdep_assert_held(&inode->v.i_rwsem);
	BUG_ON(offset + copied > folio_size(folio));

	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
		/*
		 * The folio needs to be read in, but that would destroy
		 * our partial write - simplest thing is to just force
		 * userspace to redo the write:
		 */
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		copied = 0;
	}

	spin_lock(&inode->v.i_lock);
	if (pos + copied > inode->v.i_size)
		i_size_write(&inode->v, pos + copied);
	spin_unlock(&inode->v.i_lock);

	if (copied) {
		if (!folio_test_uptodate(folio))
			folio_mark_uptodate(folio);

		bch2_set_folio_dirty(c, inode, folio, res, offset, copied);

		inode->ei_last_dirtied = (unsigned long) current;
	}

	folio_unlock(folio);
	folio_put(folio);
	bch2_pagecache_add_put(inode);

	bch2_folio_reservation_put(c, inode, res);
	kfree(res);

	return copied;
}
static noinline void folios_trunc(folios *fs, struct folio **fi)
{
	while (fs->data + fs->nr > fi) {
		struct folio *f = darray_pop(fs);

		folio_unlock(f);
		folio_put(f);
	}
}
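
/*
 * The multi-folio buffered write path: grab a contiguous range of folios,
 * read in or zero the partial folios at either end, reserve space, copy from
 * the iov_iter, then mark everything dirty - trimming the folio array with
 * folios_trunc() whenever reservation or the usercopy comes up short.
 */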
static int __bch2_buffered_write(struct bch_inode_info *inode,
				 struct address_space *mapping,
				 struct iov_iter *iter,
				 loff_t pos, unsigned len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation res;
	folios fs;
	struct folio *f;
	unsigned copied = 0, f_offset, f_copied;
	u64 end = pos + len, f_pos, f_len;
	loff_t last_folio_pos = inode->v.i_size;
	int ret = 0;

	bch2_folio_reservation_init(c, inode, &res);
	darray_init(&fs);

	ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
				   FGP_WRITEBEGIN | fgf_set_order(len),
				   mapping_gfp_mask(mapping), &fs);
	if (ret)
		goto out;

	BUG_ON(!fs.nr);

	f = darray_first(fs);
	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
		ret = bch2_read_single_folio(f, mapping);
		if (ret)
			goto out;
	}

	f = darray_last(fs);
	end = min(end, folio_end_pos(f));
	last_folio_pos = folio_pos(f);
	if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
		if (end >= inode->v.i_size) {
			folio_zero_range(f, 0, folio_size(f));
		} else {
			ret = bch2_read_single_folio(f, mapping);
			if (ret)
				goto out;
		}
	}

	ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
	if (ret)
		goto out;

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		ssize_t f_reserved;

		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;
		f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);

		if (unlikely(f_reserved != f_len)) {
			if (f_reserved < 0) {
				if (f == darray_first(fs)) {
					ret = f_reserved;
					goto out;
				}

				folios_trunc(&fs, fi);
				end = min(end, folio_end_pos(darray_last(fs)));
			} else {
				if (!folio_test_uptodate(f)) {
					ret = bch2_read_single_folio(f, mapping);
					if (ret)
						goto out;
				}

				folios_trunc(&fs, fi + 1);
				end = f_pos + f_reserved;
			}

			break;
		}

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	if (mapping_writably_mapped(mapping))
		darray_for_each(fs, fi)
			flush_dcache_folio(*fi);

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;
		f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
		if (!f_copied) {
			folios_trunc(&fs, fi);
			break;
		}

		if (!folio_test_uptodate(f) &&
		    f_copied != folio_size(f) &&
		    pos + copied + f_copied < inode->v.i_size) {
			iov_iter_revert(iter, f_copied);
			folio_zero_range(f, 0, folio_size(f));
			folios_trunc(&fs, fi);
			break;
		}

		flush_dcache_folio(f);
		copied += f_copied;

		if (f_copied != f_len) {
			folios_trunc(&fs, fi + 1);
			break;
		}

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	if (!copied)
		goto out;

	end = pos + copied;

	spin_lock(&inode->v.i_lock);
	if (end > inode->v.i_size)
		i_size_write(&inode->v, end);
	spin_unlock(&inode->v.i_lock);

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;

		if (!folio_test_uptodate(f))
			folio_mark_uptodate(f);

		bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	inode->ei_last_dirtied = (unsigned long) current;
out:
	darray_for_each(fs, fi) {
		folio_unlock(*fi);
		folio_put(*fi);
	}

	/*
	 * If the last folio added to the mapping starts beyond current EOF, we
	 * performed a short write but left around at least one post-EOF folio.
	 * Clean up the mapping before we return.
	 */
	if (last_folio_pos >= inode->v.i_size)
		truncate_pagecache(&inode->v, inode->v.i_size);

	darray_exit(&fs);
	bch2_folio_reservation_put(c, inode, &res);

	return copied ?: ret;
}
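
/*
 * Outer buffered write loop: fault in the source pages before each chunk and
 * shrink the copy size when that fails, so the atomic usercopy inside
 * __bch2_buffered_write() cannot livelock.
 */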
static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(file);
	loff_t pos = iocb->ki_pos;
	ssize_t written = 0;
	int ret = 0;

	bch2_pagecache_add_get(inode);

	do {
		unsigned offset = pos & (PAGE_SIZE - 1);
		unsigned bytes = iov_iter_count(iter);
again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
			bytes = min_t(unsigned long, iov_iter_count(iter),
				      PAGE_SIZE - offset);

			if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
				ret = -EFAULT;
				break;
			}
		}

		if (unlikely(fatal_signal_pending(current))) {
			ret = -EINTR;
			break;
		}

		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
		if (unlikely(ret < 0))
			break;

		cond_resched();

		if (unlikely(ret == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
				      iov_iter_single_seg_count(iter));
			goto again;
		}
		pos += ret;
		written += ret;
		ret = 0;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(iter));

	bch2_pagecache_add_put(inode);

	return written ? written : ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct bch_inode_info *inode = file_bch_inode(file);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = bch2_direct_write(iocb, from);
		goto out;
	}

	inode_lock(&inode->v);

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto unlock;

	ret = file_remove_privs(file);
	if (ret)
		goto unlock;

	ret = file_update_time(file);
	if (ret)
		goto unlock;

	ret = bch2_buffered_write(iocb, from);
	if (likely(ret > 0))
		iocb->ki_pos += ret;
unlock:
	inode_unlock(&inode->v);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
out:
	return bch2_err_class(ret);
}
void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
{
	bioset_exit(&c->writepage_bioset);
}

int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
{
	if (bioset_init(&c->writepage_bioset,
			4, offsetof(struct bch_writepage_io, op.wbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_writepage_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */