// SPDX-License-Identifier: GPL-2.0
/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "errcode.h"
#include "error.h"
#include "lru.h"
#include "move.h"
#include "movinggc.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
#include <linux/wait.h>

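/*
 * Buckets currently being evacuated are tracked in an rhashtable keyed by
 * move_bucket_key, plus a singly linked list in insertion order so completed
 * buckets can be retired FIFO:
 */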
struct buckets_in_flight {
	struct rhashtable		table;
	struct move_bucket_in_flight	*first;
	struct move_bucket_in_flight	*last;
	size_t				nr;
	size_t				sectors;
};

static const struct rhashtable_params bch_move_bucket_params = {
	.head_offset		= offsetof(struct move_bucket_in_flight, hash),
	.key_offset		= offsetof(struct move_bucket_in_flight, bucket.k),
	.key_len		= sizeof(struct move_bucket_key),
	.automatic_shrinking	= true,
};

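/*
 * Allocate a tracking entry for @b and add it to both the hash table and the
 * tail of the in-flight list; returns -EEXIST if the bucket is already being
 * moved, -ENOMEM if allocation fails.
 */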
static struct move_bucket_in_flight *
move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
{
	struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
	int ret;

	if (!new)
		return ERR_PTR(-ENOMEM);

	new->bucket = b;

	ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
					    bch_move_bucket_params);
	if (ret) {
		kfree(new);
		return ERR_PTR(ret);
	}

	if (!list->first)
		list->first = new;
	else
		list->last->next = new;

	list->last = new;
	list->nr++;
	list->sectors += b.sectors;
	return new;
}

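/*
 * Decide whether a bucket is worth evacuating: skip buckets that are open for
 * writes, read the bucket's alloc key, and compare its fragmentation LRU index
 * against the LRU position we found it at.
 */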
static int bch2_bucket_is_movable(struct btree_trans *trans,
				  struct move_bucket *b, u64 time)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a;
	int ret;

	if (bch2_bucket_is_open(trans->c,
				b->k.bucket.inode,
				b->k.bucket.offset))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
			       b->k.bucket, BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		return ret;

	struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
	if (!ca)
		goto out;

	a = bch2_alloc_to_v4(k, &_a);
	b->k.gen	= a->gen;
	b->sectors	= bch2_bucket_sectors_dirty(*a);
	u64 lru_idx	= alloc_lru_idx_fragmentation(*a, ca);

	ret = lru_idx && lru_idx <= time;

	bch2_dev_put(ca);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

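/*
 * Retire completed buckets from the head of the in-flight list; if @flush is
 * set, wait for outstanding moves to finish first.
 */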
static void move_buckets_wait(struct moving_context *ctxt,
			      struct buckets_in_flight *list,
			      bool flush)
{
	struct move_bucket_in_flight *i;
	int ret;

	while ((i = list->first)) {
		if (flush)
			move_ctxt_wait_event(ctxt, !atomic_read(&i->count));

		if (atomic_read(&i->count))
			break;

		list->first = i->next;
		if (!list->first)
			list->last = NULL;

		list->nr--;
		list->sectors -= i->bucket.sectors;

		ret = rhashtable_remove_fast(&list->table, &i->hash,
					     bch_move_bucket_params);
		BUG_ON(ret);
		kfree(i);
	}

	bch2_trans_unlock_long(ctxt->trans);
}

static bool bucket_in_flight(struct buckets_in_flight *list,
			     struct move_bucket_key k)
{
	return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
}

typedef DARRAY(struct move_bucket) move_buckets;

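/*
 * Walk the fragmentation LRU and collect a batch of movable buckets that
 * aren't already in flight, up to roughly a quarter of the current in-flight
 * count (minimum 16).
 */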
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
			struct buckets_in_flight *buckets_in_flight,
			move_buckets *buckets)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
	size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
	int ret;

	move_buckets_wait(ctxt, buckets_in_flight, false);

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (bch2_err_matches(ret, EROFS))
		return ret;

	if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
		return ret;

	bch2_trans_begin(trans);

	ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
				  lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
				  lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
				  0, k, ({
		struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
		int ret2 = 0;

		saw++;

		ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
		if (ret2 < 0)
			goto err;

		if (!ret2)
			not_movable++;
		else if (bucket_in_flight(buckets_in_flight, b.k))
			in_flight++;
		else {
			ret2 = darray_push(buckets, b);
			if (ret2)
				goto err;
			sectors += b.sectors;
		}

		ret2 = buckets->nr >= nr_to_get;
err:
		ret2;
	}));

	pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
		 buckets_in_flight->nr, buckets_in_flight->sectors,
		 saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);

	return ret < 0 ? ret : 0;
}

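/*
 * One pass of copygc: grab a batch of fragmented buckets and evacuate their
 * live data to the copygc write point.
 */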
static int bch2_copygc(struct moving_context *ctxt,
		       struct buckets_in_flight *buckets_in_flight,
		       bool *did_work)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct data_update_opts data_opts = {
		.btree_insert_flags = BCH_WATERMARK_copygc,
	};
	move_buckets buckets = { 0 };
	struct move_bucket_in_flight *f;
	u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
	int ret = 0;

	ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
	if (ret)
		goto err;

	darray_for_each(buckets, i) {
		if (kthread_should_stop() || freezing(current))
			break;

		f = move_bucket_in_flight_add(buckets_in_flight, *i);
		ret = PTR_ERR_OR_ZERO(f);
		if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
			ret = 0;
			continue;
		}
		if (ret == -ENOMEM) { /* flush IO, continue later */
			ret = 0;
			break;
		}

		ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
					   f->bucket.k.gen, data_opts);
		if (ret)
			goto err;

		*did_work = true;
	}
err:
	darray_exit(&buckets);

	/* no entries in LRU btree found, or got to end: */
	if (bch2_err_matches(ret, ENOENT))
		ret = 0;

	if (ret < 0 && !bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "from bch2_move_data()");

	moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
	trace_and_count(c, copygc, c, moved, 0, 0, 0);
	return ret;
}

/*
 * Copygc runs when the amount of fragmented data is above some arbitrary
 * threshold:
 *
 * The threshold at the limit - when the device is full - is the amount of space
 * we reserved in bch2_recalc_capacity; we can't have more than that amount of
 * disk space stranded due to fragmentation and store everything we have
 * allocated space for.
 *
 * But we don't want to be running copygc unnecessarily when the device still
 * has plenty of free space - rather, we want copygc to smoothly run every so
 * often and continually reduce the amount of fragmented space as the device
 * fills up. So, we increase the threshold by half the current free space.
 */
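/*
 * Illustrative example (numbers not from the code): if a member device has
 * 100GB available at the stripe watermark, fragmented_allowed works out to
 * ~50GB; with 20GB of movable fragmented data, the computed wait is ~30GB of
 * writes on the io clock before copygc runs again.
 */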
unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
	s64 wait = S64_MAX, fragmented_allowed, fragmented;

	for_each_rw_member(c, ca) {
		struct bch_dev_usage usage = bch2_dev_usage_read(ca);

		fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
				       ca->mi.bucket_size) >> 1);
		fragmented = 0;

		for (unsigned i = 0; i < BCH_DATA_NR; i++)
			if (data_type_movable(i))
				fragmented += usage.d[i].fragmented;

		wait = min(wait, max(0LL, fragmented_allowed - fragmented));
	}

	return wait;
}

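/*
 * Print copygc state, including how long we're currently waiting, to a
 * printbuf; the human-readable values are converted from 512-byte sectors to
 * bytes (<< 9).
 */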
void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);
	prt_printf(out, "running:\t%u\n", c->copygc_running);
	prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
	prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);

	prt_printf(out, "Currently waiting for:\t");
	prt_human_readable_u64(out, max(0LL, c->copygc_wait -
					atomic64_read(&c->io_clock[WRITE].now)) << 9);
	prt_newline(out);

	prt_printf(out, "Currently waiting since:\t");
	prt_human_readable_u64(out, max(0LL,
					atomic64_read(&c->io_clock[WRITE].now) -
					c->copygc_wait_at) << 9);
	prt_newline(out);

	prt_printf(out, "Currently calculated wait:\t");
	prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
	prt_newline(out);
}

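/*
 * Main copygc thread: sleep on the write I/O clock until enough fragmented
 * data has accumulated, then run bch2_copygc() in a loop.
 */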
static int bch2_copygc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct moving_context ctxt;
	struct bch_move_stats move_stats;
	struct io_clock *clock = &c->io_clock[WRITE];
	struct buckets_in_flight *buckets;
	u64 last, wait;
	int ret = 0;

	buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
	if (!buckets)
		return -ENOMEM;

	ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
	bch_err_msg(c, ret, "allocating copygc buckets in flight");
	if (ret) {
		kfree(buckets);
		return ret;
	}

	set_freezable();

	bch2_move_stats_init(&move_stats, "copygc");
	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
			      writepoint_ptr(&c->copygc_write_point),
			      false);

	while (!ret && !kthread_should_stop()) {
		bool did_work = false;

		bch2_trans_unlock_long(ctxt.trans);
		cond_resched();

		if (!c->copy_gc_enabled) {
			move_buckets_wait(&ctxt, buckets, true);
			kthread_wait_freezable(c->copy_gc_enabled ||
					       kthread_should_stop());
		}

		if (unlikely(freezing(current))) {
			move_buckets_wait(&ctxt, buckets, true);
			__refrigerator(false);
			continue;
		}

		last = atomic64_read(&clock->now);
		wait = bch2_copygc_wait_amount(c);

		if (wait > clock->max_slop) {
			c->copygc_wait_at = last;
			c->copygc_wait = last + wait;
			move_buckets_wait(&ctxt, buckets, true);
			trace_and_count(c, copygc_wait, c, wait, last + wait);
			bch2_kthread_io_clock_wait(clock, last + wait,
						   MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		c->copygc_wait = 0;

		c->copygc_running = true;
		ret = bch2_copygc(&ctxt, buckets, &did_work);
		c->copygc_running = false;

		wake_up(&c->copygc_running_wq);

		if (!wait && !did_work) {
			u64 min_member_capacity = bch2_min_rw_member_capacity(c);

			if (min_member_capacity == U64_MAX)
				min_member_capacity = 128 * 2048;

			move_buckets_wait(&ctxt, buckets, true);
			bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
						   MAX_SCHEDULE_TIMEOUT);
		}
	}

	move_buckets_wait(&ctxt, buckets, true);

	rhashtable_destroy(&buckets->table);
	kfree(buckets);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_move_stats_exit(&move_stats, c);

	return 0;
}

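/*
 * Stop and release the copygc thread, if one is running:
 */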
void bch2_copygc_stop(struct bch_fs *c)
{
	if (c->copygc_thread) {
		kthread_stop(c->copygc_thread);
		put_task_struct(c->copygc_thread);
	}
	c->copygc_thread = NULL;
}

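/*
 * Create and wake the copygc thread, unless one already exists or the
 * filesystem was mounted with nochanges:
 */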
int bch2_copygc_start(struct bch_fs *c)
{
	struct task_struct *t;
	int ret;

	if (c->copygc_thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	if (bch2_fs_init_fault("copygc_start"))
		return -ENOMEM;

	t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
	ret = PTR_ERR_OR_ZERO(t);
	bch_err_msg(c, ret, "creating copygc thread");
	if (ret)
		return ret;

	get_task_struct(t);

	c->copygc_thread = t;
	wake_up_process(c->copygc_thread);

	return 0;
}

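/*
 * Called at filesystem init time, before the copygc thread exists:
 */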
void bch2_fs_copygc_init(struct bch_fs *c)
{
	init_waitqueue_head(&c->copygc_running_wq);
	c->copygc_running = false;
}