// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};
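
/*
 * Request a rebalance scan for @inum (0 means the whole filesystem) by
 * writing a cookie key at REBALANCE_WORK_SCAN_OFFSET in the rebalance_work
 * btree; the cookie value is bumped on every request so a concurrent clear
 * can tell whether it is removing the request it actually completed.
 */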
static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p		= iter.pos;
	cookie->v.cookie	= cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL,
				       BCH_TRANS_COMMIT_no_enospc|
				       BCH_TRANS_COMMIT_lazy_rw,
				       __bch2_set_rebalance_needs_scan(trans, inum));
	rebalance_wakeup(c);
	return ret;
}
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	return bch2_set_rebalance_needs_scan(c, 0);
}
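
/*
 * Drop a scan request, but only if the stored cookie still matches @cookie -
 * if it has been bumped since, a newer scan request is pending and must be
 * left in place.
 */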
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
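
/* Peek the next pending rebalance_work entry, or a null key if we're stopping. */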
static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	return !kthread_should_stop()
		? bch2_btree_iter_peek(work_iter)
		: bkey_s_c_null;
}
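
/*
 * Drop the bch_extent_rebalance entry from @k so it no longer appears as
 * pending rebalance work, then commit.
 */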
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
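
/*
 * Resolve a rebalance_work entry to the extent (or reflink) key it refers to
 * and fill in *data_opts from the key's embedded rebalance options. Returns a
 * null key if the entry raced with the btree write buffer or if no pointers
 * need rewriting any more.
 */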
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
					     struct bpos work_pos,
					     struct btree_iter *extent_iter,
					     struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;

	bch2_trans_iter_exit(trans, extent_iter);
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_all_snapshots);
	k = bch2_btree_iter_peek_slot(extent_iter);
	if (bkey_err(k))
		return k;

	const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
	if (!r) {
		/* raced due to btree write buffer, nothing to do */
		return bkey_s_c_null;
	}

	memset(data_opts, 0, sizeof(*data_opts));

	data_opts->rewrite_ptrs	=
		bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
	data_opts->target	= r->target;
	data_opts->write_flags	|= BCH_WRITE_ONLY_SPECIFIED_DEVS;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * device we would want to write to offline? devices in target
		 * changed?
		 *
		 * We'll now need a full scan before this extent is picked up
		 * again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	if (trace_rebalance_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "target=");
		bch2_target_to_text(&buf, c, r->target);
		prt_str(&buf, " compression=");
		bch2_compression_opt_to_text(&buf, r->compression);
		prt_str(&buf, " ");
		bch2_bkey_val_to_text(&buf, c, k);

		trace_rebalance_extent(c, buf.buf);
		printbuf_exit(&buf);
	}

	return k;
}
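
/*
 * Process one extent's worth of rebalance work: look up the extent and its io
 * options, then hand it to bch2_move_extent(). On memory allocation failure
 * we wait for in-flight IO and restart the transaction.
 */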
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
						 extent_iter, &data_opts));
	if (ret || !k.k)
		goto out;

	ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
	if (ret)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = -BCH_ERR_transaction_restart_nested;
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}
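
/*
 * Predicate for full scans: pick the target and compression options for @k
 * and report whether any of its pointers need rewriting.
 */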
static bool rebalance_pred(struct bch_fs *c, void *arg,
			   struct bkey_s_c k,
			   struct bch_io_opts *io_opts,
			   struct data_update_opts *data_opts)
{
	unsigned target, compression;

	if (k.k->p.inode) {
		target		= io_opts->background_target;
		compression	= background_compression(*io_opts);
	} else {
		const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);

		target		= r ? r->target : io_opts->background_target;
		compression	= r ? r->compression : background_compression(*io_opts);
	}

	data_opts->rewrite_ptrs	= bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
	data_opts->target	= target;
	data_opts->write_flags	|= BCH_WRITE_ONLY_SPECIFIED_DEVS;
	return data_opts->rewrite_ptrs != 0;
}
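
/*
 * Run a requested scan - the whole filesystem when inum == 0, otherwise just
 * that inode's extents - through rebalance_pred(), then clear the scan cookie
 * that requested it.
 */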
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	int ret;

	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
	ctxt->stats = &r->scan_stats;

	if (!inum) {
		r->scan_start	= BBPOS_MIN;
		r->scan_end	= BBPOS_MAX;
	} else {
		r->scan_start	= BBPOS(BTREE_ID_extents, POS(inum, 0));
		r->scan_end	= BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
	}

	r->state = BCH_REBALANCE_scanning;

	ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
		commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

	bch2_move_stats_exit(&r->scan_stats, trans->c);
	return ret;
}
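
/*
 * Nothing to do: sleep on the write io clock until roughly 1/64th of the
 * smallest writable member's capacity has been written, recording when the
 * wait started for status reporting.
 */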
static void rebalance_wait(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 now = atomic64_read(&clock->now);
	u64 min_member_capacity = bch2_min_rw_member_capacity(c);

	if (min_member_capacity == U64_MAX)
		min_member_capacity = 128 * 2048;

	r->wait_iotime_end		= now + (min_member_capacity >> 6);

	if (r->state != BCH_REBALANCE_waiting) {
		r->wait_iotime_start	= now;
		r->wait_wallclock_start	= ktime_get_real_ns();
		r->state		= BCH_REBALANCE_waiting;
	}

	bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}
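
/*
 * Main work loop: walk the rebalance_work btree, dispatching cookie keys to
 * do_rebalance_scan() and extent keys to do_rebalance_extent(), and go idle
 * via rebalance_wait() when no work was found.
 */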
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = { NULL };
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_begin(trans);

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_all_snapshots);

	while (!bch2_move_ratelimit(ctxt)) {
		if (!r->enabled) {
			bch2_moving_ctxt_flush_all(ctxt);
			kthread_wait_freezable(r->enabled ||
					       kthread_should_stop());
		}

		if (kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(&rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen)) {
		bch2_moving_ctxt_flush_all(ctxt);
		bch2_trans_unlock_long(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}
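
/* Rebalance kthread entry point: run do_rebalance() until told to stop. */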
static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct moving_context ctxt;

	set_freezable();

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);

	while (!kthread_should_stop() && !do_rebalance(&ctxt))
		;

	bch2_moving_ctxt_exit(&ctxt);
	return 0;
}
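
/* Print the current rebalance state and the stats relevant to that state. */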
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_str(out, "io wait duration: ");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_str(out, "io wait remaining: ");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_str(out, "duration waited: ");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	printbuf_indent_sub(out, 2);
}
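
/*
 * Stop the rebalance thread: open up the rate limit so nothing stays blocked
 * on it, then detach and stop the kthread.
 */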
void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate	= UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}
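
/* Start the rebalance thread, unless it's already running or nochanges is set. */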
int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating rebalance thread");
	if (ret)
		return ret;

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}
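
/* Early init: set up the rebalance rate-limiting (pd) controller. */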
void bch2_fs_rebalance_init(struct bch_fs *c)
{
	bch2_pd_controller_init(&c->rebalance.pd);
}