// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "btree_iter.h"
#include "fs-io-pagecache.h"

#include <linux/pagevec.h>
#include <linux/writeback.h>

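/*
 * Grab a contiguous run of folios from the pagecache, starting at @start.
 * Folio creation (FGP_CREAT) is only attempted for roughly the first megabyte
 * of the range; the walk stops at the first position where no folio could be
 * obtained. Returns 0 if at least one folio was grabbed, otherwise an error.
 */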
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                                     loff_t start, u64 end,
                                     fgf_t fgp_flags, gfp_t gfp,
                                     folios *fs)
{
        struct folio *f;
        u64 pos = start;
        int ret = 0;

        while (pos < end) {
                if ((u64) pos >= (u64) start + (1ULL << 20))
                        fgp_flags &= ~FGP_CREAT;

                ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
                if (ret)
                        break;

                f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
                if (IS_ERR_OR_NULL(f))
                        break;

                BUG_ON(fs->nr && folio_pos(f) != pos);

                pos = folio_end_pos(f);
                darray_push(fs, f);
        }

        if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
                ret = -ENOMEM;

        return fs->nr ? 0 : ret;
}

/* pagecache_block must be held */
int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
                                            loff_t start, loff_t end)
{
        int ret;

        /*
         * XXX: the way this is currently implemented, we can spin if a process
         * is continually redirtying a specific page
         */
        do {
                if (!mapping->nrpages)
                        return 0;

                ret = filemap_write_and_wait_range(mapping, start, end);
                if (ret)
                        break;

                if (!mapping->nrpages)
                        return 0;

                ret = invalidate_inode_pages2_range(mapping,
                                start >> PAGE_SHIFT,
                                end >> PAGE_SHIFT);
        } while (ret == -EBUSY);

        return ret;
}

/* Useful for debug tracing: */
static const char * const bch2_folio_sector_states[] = {
#define x(n)    #n,
        BCH_FOLIO_SECTOR_STATE()
#undef x
        NULL
};

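/*
 * Each pagecache folio carries a struct bch_folio with per-sector state
 * (unallocated, reserved, dirty, dirty_reserved, allocated) and replica
 * counts; the helpers below implement the dirty/undirty/reserve transitions
 * on that state.
 */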
static inline enum bch_folio_sector_state
folio_sector_dirty(enum bch_folio_sector_state state)
{
        switch (state) {
        case SECTOR_unallocated:
                return SECTOR_dirty;
        case SECTOR_reserved:
                return SECTOR_dirty_reserved;
        default:
                return state;
        }
}

static inline enum bch_folio_sector_state
folio_sector_undirty(enum bch_folio_sector_state state)
{
        switch (state) {
        case SECTOR_dirty:
                return SECTOR_unallocated;
        case SECTOR_dirty_reserved:
                return SECTOR_reserved;
        default:
                return state;
        }
}

static inline enum bch_folio_sector_state
folio_sector_reserve(enum bch_folio_sector_state state)
{
        switch (state) {
        case SECTOR_unallocated:
                return SECTOR_reserved;
        case SECTOR_dirty:
                return SECTOR_dirty_reserved;
        default:
                return state;
        }
}

/* for newly allocated folios: */
struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
{
        struct bch_folio *s;

        s = kzalloc(sizeof(*s) +
                    sizeof(struct bch_folio_sector) *
                    folio_sectors(folio), gfp);
        if (!s)
                return NULL;

        spin_lock_init(&s->lock);
        folio_attach_private(folio, s);
        return s;
}

struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
{
        return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
}

static unsigned bkey_to_sector_state(struct bkey_s_c k)
{
        if (bkey_extent_is_reservation(k))
                return SECTOR_reserved;
        if (bkey_extent_is_allocation(k.k))
                return SECTOR_allocated;
        return SECTOR_unallocated;
}

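/*
 * Set the allocation state and replica count for a range of sectors within a
 * single folio, under the bch_folio lock; once the last sector of the folio
 * has been initialized, the folio's state is considered uptodate.
 */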
static void __bch2_folio_set(struct folio *folio,
                             unsigned pg_offset, unsigned pg_len,
                             unsigned nr_ptrs, unsigned state)
{
        struct bch_folio *s = bch2_folio(folio);
        unsigned i, sectors = folio_sectors(folio);

        BUG_ON(pg_offset >= sectors);
        BUG_ON(pg_offset + pg_len > sectors);

        spin_lock(&s->lock);

        for (i = pg_offset; i < pg_offset + pg_len; i++) {
                s->s[i].nr_replicas = nr_ptrs;
                bch2_folio_sector_set(folio, s, i, state);
        }

        if (i == sectors)
                s->uptodate = true;

        spin_unlock(&s->lock);
}

/*
 * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
 * extents btree:
 */
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
                   struct folio **fs, unsigned nr_folios)
{
        u64 offset = folio_sector(fs[0]);
        bool need_set = false;

        for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
                struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
                if (!s)
                        return -ENOMEM;

                need_set |= !s->uptodate;
        }

        if (!need_set)
                return 0;

        unsigned folio_idx = 0;

        return bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
                                POS(inum.inum, offset),
                                POS(inum.inum, U64_MAX),
                                inum.subvol, BTREE_ITER_slots, k, ({
                        unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
                        unsigned state = bkey_to_sector_state(k);

                        while (folio_idx < nr_folios) {
                                struct folio *folio = fs[folio_idx];
                                u64 folio_start = folio_sector(folio);
                                u64 folio_end = folio_end_sector(folio);
                                unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
                                        folio_start;
                                unsigned folio_len = min(k.k->p.offset, folio_end) -
                                        folio_offset - folio_start;

                                BUG_ON(k.k->p.offset < folio_start);
                                BUG_ON(bkey_start_offset(k.k) > folio_end);

                                if (!bch2_folio(folio)->uptodate)
                                        __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);

                                if (k.k->p.offset < folio_end)
                                        break;

                                folio_idx++;
                        }

                        if (folio_idx == nr_folios)
                                break;
                        0;
                })));
}

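/*
 * Initialize per-sector state for every folio covered by @bio from the extent
 * key @k it maps to; KEY_TYPE_reflink_v extents are counted as having zero
 * fully allocated pointers here (presumably so that writes to shared extents
 * still reserve space).
 */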
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
{
        struct bvec_iter iter;
        struct folio_vec fv;
        unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
                ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
        unsigned state = bkey_to_sector_state(k);

        bio_for_each_folio(fv, bio, iter)
                __bch2_folio_set(fv.fv_folio,
                                 fv.fv_offset >> 9,
                                 fv.fv_len >> 9,
                                 nr_ptrs, state);
}

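/*
 * Walk the pagecache for sectors [start, end) and clear the cached replica
 * counts, so the affected sectors are treated as unallocated again (e.g.
 * after the backing extents have been dropped).
 */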
void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
                                     u64 start, u64 end)
{
        pgoff_t index = start >> PAGE_SECTORS_SHIFT;
        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
        struct folio_batch fbatch;
        unsigned i, j;

        if (end <= start)
                return;

        folio_batch_init(&fbatch);

        while (filemap_get_folios(inode->v.i_mapping,
                                  &index, end_index, &fbatch)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        u64 folio_start = folio_sector(folio);
                        u64 folio_end = folio_end_sector(folio);
                        unsigned folio_offset = max(start, folio_start) - folio_start;
                        unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
                        struct bch_folio *s;

                        BUG_ON(end <= folio_start);

                        folio_lock(folio);
                        s = bch2_folio(folio);

                        if (s) {
                                spin_lock(&s->lock);
                                for (j = folio_offset; j < folio_offset + folio_len; j++)
                                        s->s[j].nr_replicas = 0;
                                spin_unlock(&s->lock);
                        }

                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

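/*
 * Walk the pagecache for sectors [*start, end), switching cached sectors in
 * the range to their reserved equivalents and advancing *start past the
 * folios processed; sectors that were dirty stop being counted in i_sectors,
 * which is accounted for at the end. In nonblocking mode a folio that can't
 * be locked causes an early return with -EAGAIN.
 */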
int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
                                 u64 *start, u64 end,
                                 bool nonblocking)
{
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
        struct folio_batch fbatch;
        s64 i_sectors_delta = 0;
        int ret = 0;

        if (end <= *start)
                return 0;

        folio_batch_init(&fbatch);

        while (filemap_get_folios(inode->v.i_mapping,
                                  &index, end_index, &fbatch)) {
                for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        if (!nonblocking)
                                folio_lock(folio);
                        else if (!folio_trylock(folio)) {
                                folio_batch_release(&fbatch);
                                ret = -EAGAIN;
                                goto out;
                        }

                        u64 folio_start = folio_sector(folio);
                        u64 folio_end = folio_end_sector(folio);

                        BUG_ON(end <= folio_start);

                        *start = min(end, folio_end);

                        struct bch_folio *s = bch2_folio(folio);
                        if (s) {
                                unsigned folio_offset = max(*start, folio_start) - folio_start;
                                unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;

                                spin_lock(&s->lock);
                                for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
                                        i_sectors_delta -= s->s[j].state == SECTOR_dirty;
                                        bch2_folio_sector_set(folio, s, j,
                                                        folio_sector_reserve(s->s[j].state));
                                }
                                spin_unlock(&s->lock);
                        }

                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
out:
        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
        return ret;
}

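/*
 * How many more replicas still need disk space reserved for this sector to
 * reach @nr_replicas: replicas already written and replicas already reserved
 * both count towards the target.
 */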
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
                                          unsigned nr_replicas)
{
        return max(0, (int) nr_replicas -
                   s->nr_replicas -
                   s->replicas_reserved);
}

int bch2_get_folio_disk_reservation(struct bch_fs *c,
                                    struct bch_inode_info *inode,
                                    struct folio *folio, bool check_enospc)
{
        struct bch_folio *s = bch2_folio_create(folio, 0);
        unsigned nr_replicas = inode_nr_replicas(c, inode);
        struct disk_reservation disk_res = { 0 };
        unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
        int ret;

        if (!s)
                return -ENOMEM;

        for (i = 0; i < sectors; i++)
                disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);

        if (!disk_res_sectors)
                return 0;

        ret = bch2_disk_reservation_get(c, &disk_res,
                                        disk_res_sectors, 1,
                                        !check_enospc
                                        ? BCH_DISK_RESERVATION_NOFAIL
                                        : 0);
        if (unlikely(ret))
                return ret;

        for (i = 0; i < sectors; i++)
                s->s[i].replicas_reserved +=
                        sectors_to_reserve(&s->s[i], nr_replicas);

        return 0;
}

void bch2_folio_reservation_put(struct bch_fs *c,
                                struct bch_inode_info *inode,
                                struct bch2_folio_reservation *res)
{
        bch2_disk_reservation_put(c, &res->disk);
        bch2_quota_reservation_put(c, inode, &res->quota);
}

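/*
 * Reserve disk space and quota for a write to [offset, offset + len) within
 * @folio. With @partial, a short reservation rounded down to the filesystem
 * block size is accepted and the number of bytes actually covered is
 * returned; otherwise the reservation must cover the full range and 0 is
 * returned on success.
 */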
static int __bch2_folio_reservation_get(struct bch_fs *c,
                        struct bch_inode_info *inode,
                        struct folio *folio,
                        struct bch2_folio_reservation *res,
                        size_t offset, size_t len,
                        bool partial)
{
        struct bch_folio *s = bch2_folio_create(folio, 0);
        unsigned i, disk_sectors = 0, quota_sectors = 0;
        struct disk_reservation disk_res = {};
        size_t reserved = len;
        int ret;

        if (!s)
                return -ENOMEM;

        BUG_ON(!s->uptodate);

        for (i = round_down(offset, block_bytes(c)) >> 9;
             i < round_up(offset + len, block_bytes(c)) >> 9;
             i++) {
                disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
                quota_sectors += s->s[i].state == SECTOR_unallocated;
        }

        if (disk_sectors) {
                ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
                                partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
                if (unlikely(ret))
                        return ret;

                if (unlikely(disk_res.sectors != disk_sectors)) {
                        disk_sectors = quota_sectors = 0;

                        for (i = round_down(offset, block_bytes(c)) >> 9;
                             i < round_up(offset + len, block_bytes(c)) >> 9;
                             i++) {
                                disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
                                if (disk_sectors > disk_res.sectors) {
                                        /*
                                         * Make sure to get a reservation that's
                                         * aligned to the filesystem blocksize:
                                         */
                                        unsigned reserved_offset = round_down(i << 9, block_bytes(c));
                                        reserved = clamp(reserved_offset, offset, offset + len) - offset;

                                        if (!reserved) {
                                                bch2_disk_reservation_put(c, &disk_res);
                                                return -BCH_ERR_ENOSPC_disk_reservation;
                                        }
                                        break;
                                }
                                quota_sectors += s->s[i].state == SECTOR_unallocated;
                        }
                }
        }

        if (quota_sectors) {
                ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
                if (unlikely(ret)) {
                        bch2_disk_reservation_put(c, &disk_res);
                        return ret;
                }
        }

        res->disk.sectors += disk_res.sectors;
        return partial ? reserved : 0;
}

int bch2_folio_reservation_get(struct bch_fs *c,
                        struct bch_inode_info *inode,
                        struct folio *folio,
                        struct bch2_folio_reservation *res,
                        size_t offset, size_t len)
{
        return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
}

ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
                        struct bch_inode_info *inode,
                        struct folio *folio,
                        struct bch2_folio_reservation *res,
                        size_t offset, size_t len)
{
        return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
}

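/*
 * Tear down per-folio state when a folio leaves the pagecache: give back any
 * unused replica reservations, undo the dirty sector accounting, and release
 * the attached bch_folio.
 */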
static void bch2_clear_folio_bits(struct folio *folio)
{
        struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_folio *s = bch2_folio(folio);
        struct disk_reservation disk_res = { 0 };
        int i, sectors = folio_sectors(folio), dirty_sectors = 0;

        if (!s)
                return;

        EBUG_ON(!folio_test_locked(folio));
        EBUG_ON(folio_test_writeback(folio));

        for (i = 0; i < sectors; i++) {
                disk_res.sectors += s->s[i].replicas_reserved;
                s->s[i].replicas_reserved = 0;

                dirty_sectors -= s->s[i].state == SECTOR_dirty;
                bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
        }

        bch2_disk_reservation_put(c, &disk_res);

        bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);

        bch2_folio_release(folio);
}

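/*
 * Transfer reserved space from @res to the folio's per-sector reservations
 * for [offset, offset + len), mark those sectors dirty, account newly dirty
 * sectors against i_sectors/quota, and finally dirty the folio itself in the
 * pagecache if it wasn't already.
 */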
void bch2_set_folio_dirty(struct bch_fs *c,
                          struct bch_inode_info *inode,
                          struct folio *folio,
                          struct bch2_folio_reservation *res,
                          unsigned offset, unsigned len)
{
        struct bch_folio *s = bch2_folio(folio);
        unsigned i, dirty_sectors = 0;

        WARN_ON((u64) folio_pos(folio) + offset + len >
                round_up((u64) i_size_read(&inode->v), block_bytes(c)));

        BUG_ON(!s->uptodate);

        spin_lock(&s->lock);

        for (i = round_down(offset, block_bytes(c)) >> 9;
             i < round_up(offset + len, block_bytes(c)) >> 9;
             i++) {
                unsigned sectors = sectors_to_reserve(&s->s[i],
                                                res->disk.nr_replicas);

                /*
                 * This can happen if we race with the error path in
                 * bch2_writepage_io_done():
                 */
                sectors = min_t(unsigned, sectors, res->disk.sectors);

                s->s[i].replicas_reserved += sectors;
                res->disk.sectors -= sectors;

                dirty_sectors += s->s[i].state == SECTOR_unallocated;

                bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
        }

        spin_unlock(&s->lock);

        bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);

        if (!folio_test_dirty(folio))
                filemap_dirty_folio(inode->v.i_mapping, folio);
}

vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct address_space *fdm = faults_disabled_mapping();
        struct bch_inode_info *inode = file_bch_inode(file);
        vm_fault_t ret;

        if (fdm == mapping)
                return VM_FAULT_SIGBUS;

        /* Lock ordering: */
        if (fdm > mapping) {
                struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);

                if (bch2_pagecache_add_tryget(inode))
                        goto got_lock;

                bch2_pagecache_block_put(fdm_host);

                bch2_pagecache_add_get(inode);
                bch2_pagecache_add_put(inode);

                bch2_pagecache_block_get(fdm_host);

                /* Signal that lock has been dropped: */
                set_fdm_dropped_locks();
                return VM_FAULT_SIGBUS;
        }

        bch2_pagecache_add_get(inode);
got_lock:
        ret = filemap_fault(vmf);
        bch2_pagecache_add_put(inode);

        return ret;
}

vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
        struct folio *folio = page_folio(vmf->page);
        struct file *file = vmf->vma->vm_file;
        struct bch_inode_info *inode = file_bch_inode(file);
        struct address_space *mapping = file->f_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch2_folio_reservation res;
        unsigned len;
        loff_t isize;
        vm_fault_t ret;

        bch2_folio_reservation_init(c, inode, &res);

        sb_start_pagefault(inode->v.i_sb);
        file_update_time(file);

        /*
         * Not strictly necessary, but helps avoid dio writes livelocking in
         * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
         * a bch2_write_invalidate_inode_pages_range() that works without dropping
         * page lock before invalidating page
         */
        bch2_pagecache_add_get(inode);

        folio_lock(folio);
        isize = i_size_read(&inode->v);

        if (folio->mapping != mapping || folio_pos(folio) >= isize) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));

        if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
            bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
                folio_unlock(folio);
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
        bch2_folio_reservation_put(c, inode, &res);

        folio_wait_stable(folio);
        ret = VM_FAULT_LOCKED;
out:
        bch2_pagecache_add_put(inode);
        sb_end_pagefault(inode->v.i_sb);

        return ret;
}

void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
        if (offset || length < folio_size(folio))
                return;

        bch2_clear_folio_bits(folio);
}

bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
        if (folio_test_dirty(folio) || folio_test_writeback(folio))
                return false;

        bch2_clear_folio_bits(folio);
        return true;
}

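/* fseek: */

/*
 * Return the byte offset within @folio of the first sector at or after @pos
 * that holds data (dirty or allocated) with at least @min_replicas replicas,
 * or a negative value if there is none.
 */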
static int folio_data_offset(struct folio *folio, loff_t pos,
                             unsigned min_replicas)
{
        struct bch_folio *s = bch2_folio(folio);
        unsigned i, sectors = folio_sectors(folio);

        if (s)
                for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
                        if (s->s[i].state >= SECTOR_dirty &&
                            s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
                                return i << SECTOR_SHIFT;

        return -1;
}

loff_t bch2_seek_pagecache_data(struct inode *vinode,
                                loff_t start_offset,
                                loff_t end_offset,
                                unsigned min_replicas,
                                bool nonblock)
{
        struct folio_batch fbatch;
        pgoff_t start_index = start_offset >> PAGE_SHIFT;
        pgoff_t end_index = end_offset >> PAGE_SHIFT;
        pgoff_t index = start_index;
        unsigned i;
        loff_t ret;
        int offset;

        folio_batch_init(&fbatch);

        while (filemap_get_folios(vinode->i_mapping,
                                  &index, end_index, &fbatch)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        if (!nonblock) {
                                folio_lock(folio);
                        } else if (!folio_trylock(folio)) {
                                folio_batch_release(&fbatch);
                                return -EAGAIN;
                        }

                        offset = folio_data_offset(folio,
                                        max(folio_pos(folio), start_offset),
                                        min_replicas);
                        if (offset >= 0) {
                                ret = clamp(folio_pos(folio) + offset,
                                            start_offset, end_offset);
                                folio_unlock(folio);
                                folio_batch_release(&fbatch);
                                return ret;
                        }
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }

        return end_offset;
}

/*
 * Search for a hole in a folio.
 *
 * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
 * code to indicate a pagecache hole exists at the returned offset. Otherwise
 * return 0 if the folio is filled with data, or an error code. This function
 * can return -EAGAIN if nonblock is specified.
 */
static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
                             unsigned min_replicas, bool nonblock)
{
        struct folio *folio;
        struct bch_folio *s;
        unsigned i, sectors;
        int ret = -ENOENT;

        folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
                                    FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        s = bch2_folio(folio);
        if (!s)
                goto unlock;

        sectors = folio_sectors(folio);
        for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
                if (s->s[i].state < SECTOR_dirty ||
                    s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
                        *offset = max(*offset,
                                      folio_pos(folio) + (i << SECTOR_SHIFT));
                        goto unlock;
                }

        *offset = folio_end_pos(folio);
        ret = 0;
unlock:
        folio_unlock(folio);
        folio_put(folio);
        return ret;
}

bch2_seek_pagecache_hole(struct inode
*vinode
,
781 unsigned min_replicas
,
784 struct address_space
*mapping
= vinode
->i_mapping
;
785 loff_t offset
= start_offset
;
788 while (!ret
&& offset
< end_offset
)
789 ret
= folio_hole_offset(mapping
, &offset
, min_replicas
, nonblock
);
791 if (ret
&& ret
!= -ENOENT
)
793 return min(offset
, end_offset
);
int bch2_clamp_data_hole(struct inode *inode,
                         u64 *hole_start,
                         u64 *hole_end,
                         unsigned min_replicas,
                         bool nonblock)
{
        loff_t ret;

        ret = bch2_seek_pagecache_hole(inode,
                *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
        if (ret < 0)
                return ret;

        *hole_start = ret;

        if (*hole_start == *hole_end)
                return 0;

        ret = bch2_seek_pagecache_data(inode,
                *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
        if (ret < 0)
                return ret;

        *hole_end = ret;
        return 0;
}

#endif /* NO_BCACHEFS_FS */