1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * cgroups support for the BFQ I/O scheduler.
5 #include <linux/module.h>
6 #include <linux/slab.h>
7 #include <linux/blkdev.h>
8 #include <linux/cgroup.h>
9 #include <linux/elevator.h>
10 #include <linux/ktime.h>
11 #include <linux/rbtree.h>
12 #include <linux/ioprio.h>
13 #include <linux/sbitmap.h>
14 #include <linux/delay.h>
16 #include "bfq-iosched.h"
18 #ifdef CONFIG_BFQ_CGROUP_DEBUG
19 static int bfq_stat_init(struct bfq_stat
*stat
, gfp_t gfp
)
23 ret
= percpu_counter_init(&stat
->cpu_cnt
, 0, gfp
);
27 atomic64_set(&stat
->aux_cnt
, 0);
31 static void bfq_stat_exit(struct bfq_stat
*stat
)
33 percpu_counter_destroy(&stat
->cpu_cnt
);
37 * bfq_stat_add - add a value to a bfq_stat
38 * @stat: target bfq_stat
41 * Add @val to @stat. The caller must ensure that IRQs on the same CPU
42 * don't re-enter this function for the same counter.
44 static inline void bfq_stat_add(struct bfq_stat
*stat
, uint64_t val
)
46 percpu_counter_add_batch(&stat
->cpu_cnt
, val
, BLKG_STAT_CPU_BATCH
);
50 * bfq_stat_read - read the current value of a bfq_stat
51 * @stat: bfq_stat to read
53 static inline uint64_t bfq_stat_read(struct bfq_stat
*stat
)
55 return percpu_counter_sum_positive(&stat
->cpu_cnt
);
59 * bfq_stat_reset - reset a bfq_stat
60 * @stat: bfq_stat to reset
62 static inline void bfq_stat_reset(struct bfq_stat
*stat
)
64 percpu_counter_set(&stat
->cpu_cnt
, 0);
65 atomic64_set(&stat
->aux_cnt
, 0);
69 * bfq_stat_add_aux - add a bfq_stat into another's aux count
70 * @to: the destination bfq_stat
73 * Add @from's count including the aux one to @to's aux count.
75 static inline void bfq_stat_add_aux(struct bfq_stat
*to
,
76 struct bfq_stat
*from
)
78 atomic64_add(bfq_stat_read(from
) + atomic64_read(&from
->aux_cnt
),
83 * blkg_prfill_stat - prfill callback for bfq_stat
84 * @sf: seq_file to print to
85 * @pd: policy private data of interest
86 * @off: offset to the bfq_stat in @pd
88 * prfill callback for printing a bfq_stat.
90 static u64
blkg_prfill_stat(struct seq_file
*sf
, struct blkg_policy_data
*pd
,
93 return __blkg_prfill_u64(sf
, pd
, bfq_stat_read((void *)pd
+ off
));
96 /* bfqg stats flags */
97 enum bfqg_stats_flags
{
98 BFQG_stats_waiting
= 0,
103 #define BFQG_FLAG_FNS(name) \
104 static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
106 stats->flags |= (1 << BFQG_stats_##name); \
108 static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
110 stats->flags &= ~(1 << BFQG_stats_##name); \
112 static int bfqg_stats_##name(struct bfqg_stats *stats) \
114 return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
117 BFQG_FLAG_FNS(waiting)
118 BFQG_FLAG_FNS(idling
)
122 /* This should be called with the scheduler lock held. */
123 static void bfqg_stats_update_group_wait_time(struct bfqg_stats
*stats
)
127 if (!bfqg_stats_waiting(stats
))
130 now
= ktime_get_ns();
131 if (now
> stats
->start_group_wait_time
)
132 bfq_stat_add(&stats
->group_wait_time
,
133 now
- stats
->start_group_wait_time
);
134 bfqg_stats_clear_waiting(stats
);
137 /* This should be called with the scheduler lock held. */
138 static void bfqg_stats_set_start_group_wait_time(struct bfq_group
*bfqg
,
139 struct bfq_group
*curr_bfqg
)
141 struct bfqg_stats
*stats
= &bfqg
->stats
;
143 if (bfqg_stats_waiting(stats
))
145 if (bfqg
== curr_bfqg
)
147 stats
->start_group_wait_time
= ktime_get_ns();
148 bfqg_stats_mark_waiting(stats
);
151 /* This should be called with the scheduler lock held. */
152 static void bfqg_stats_end_empty_time(struct bfqg_stats
*stats
)
156 if (!bfqg_stats_empty(stats
))
159 now
= ktime_get_ns();
160 if (now
> stats
->start_empty_time
)
161 bfq_stat_add(&stats
->empty_time
,
162 now
- stats
->start_empty_time
);
163 bfqg_stats_clear_empty(stats
);
166 void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
)
168 bfq_stat_add(&bfqg
->stats
.dequeue
, 1);
171 void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
)
173 struct bfqg_stats
*stats
= &bfqg
->stats
;
175 if (blkg_rwstat_total(&stats
->queued
))
179 * group is already marked empty. This can happen if bfqq got new
180 * request in parent group and moved to this group while being added
181 * to service tree. Just ignore the event and move on.
183 if (bfqg_stats_empty(stats
))
186 stats
->start_empty_time
= ktime_get_ns();
187 bfqg_stats_mark_empty(stats
);
190 void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
)
192 struct bfqg_stats
*stats
= &bfqg
->stats
;
194 if (bfqg_stats_idling(stats
)) {
195 u64 now
= ktime_get_ns();
197 if (now
> stats
->start_idle_time
)
198 bfq_stat_add(&stats
->idle_time
,
199 now
- stats
->start_idle_time
);
200 bfqg_stats_clear_idling(stats
);
204 void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
)
206 struct bfqg_stats
*stats
= &bfqg
->stats
;
208 stats
->start_idle_time
= ktime_get_ns();
209 bfqg_stats_mark_idling(stats
);
212 void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
)
214 struct bfqg_stats
*stats
= &bfqg
->stats
;
216 bfq_stat_add(&stats
->avg_queue_size_sum
,
217 blkg_rwstat_total(&stats
->queued
));
218 bfq_stat_add(&stats
->avg_queue_size_samples
, 1);
219 bfqg_stats_update_group_wait_time(stats
);
222 void bfqg_stats_update_io_add(struct bfq_group
*bfqg
, struct bfq_queue
*bfqq
,
225 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, 1);
226 bfqg_stats_end_empty_time(&bfqg
->stats
);
227 if (!(bfqq
== ((struct bfq_data
*)bfqg
->bfqd
)->in_service_queue
))
228 bfqg_stats_set_start_group_wait_time(bfqg
, bfqq_group(bfqq
));
231 void bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
)
233 blkg_rwstat_add(&bfqg
->stats
.queued
, op
, -1);
236 void bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
)
238 blkg_rwstat_add(&bfqg
->stats
.merged
, op
, 1);
241 void bfqg_stats_update_completion(struct bfq_group
*bfqg
, u64 start_time_ns
,
242 u64 io_start_time_ns
, unsigned int op
)
244 struct bfqg_stats
*stats
= &bfqg
->stats
;
245 u64 now
= ktime_get_ns();
247 if (now
> io_start_time_ns
)
248 blkg_rwstat_add(&stats
->service_time
, op
,
249 now
- io_start_time_ns
);
250 if (io_start_time_ns
> start_time_ns
)
251 blkg_rwstat_add(&stats
->wait_time
, op
,
252 io_start_time_ns
- start_time_ns
);
255 #else /* CONFIG_BFQ_CGROUP_DEBUG */
257 void bfqg_stats_update_io_add(struct bfq_group
*bfqg
, struct bfq_queue
*bfqq
,
259 void bfqg_stats_update_io_remove(struct bfq_group
*bfqg
, unsigned int op
) { }
260 void bfqg_stats_update_io_merged(struct bfq_group
*bfqg
, unsigned int op
) { }
261 void bfqg_stats_update_completion(struct bfq_group
*bfqg
, u64 start_time_ns
,
262 u64 io_start_time_ns
, unsigned int op
) { }
263 void bfqg_stats_update_dequeue(struct bfq_group
*bfqg
) { }
264 void bfqg_stats_set_start_empty_time(struct bfq_group
*bfqg
) { }
265 void bfqg_stats_update_idle_time(struct bfq_group
*bfqg
) { }
266 void bfqg_stats_set_start_idle_time(struct bfq_group
*bfqg
) { }
267 void bfqg_stats_update_avg_queue_size(struct bfq_group
*bfqg
) { }
269 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
271 #ifdef CONFIG_BFQ_GROUP_IOSCHED
274 * blk-cgroup policy-related handlers
275 * The following functions help in converting between blk-cgroup
276 * internal structures and BFQ-specific structures.
279 static struct bfq_group
*pd_to_bfqg(struct blkg_policy_data
*pd
)
281 return pd
? container_of(pd
, struct bfq_group
, pd
) : NULL
;
284 struct blkcg_gq
*bfqg_to_blkg(struct bfq_group
*bfqg
)
286 return pd_to_blkg(&bfqg
->pd
);
289 static struct bfq_group
*blkg_to_bfqg(struct blkcg_gq
*blkg
)
291 return pd_to_bfqg(blkg_to_pd(blkg
, &blkcg_policy_bfq
));
296 * The following functions help navigate the bfq_group hierarchy
297 * by finding the parent of a bfq_group or the bfq_group
298 * associated with a bfq_queue.
301 static struct bfq_group
*bfqg_parent(struct bfq_group
*bfqg
)
303 struct blkcg_gq
*pblkg
= bfqg_to_blkg(bfqg
)->parent
;
305 return pblkg
? blkg_to_bfqg(pblkg
) : NULL
;
308 struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
310 struct bfq_entity
*group_entity
= bfqq
->entity
.parent
;
312 return group_entity
? container_of(group_entity
, struct bfq_group
,
314 bfqq
->bfqd
->root_group
;
318 * The following two functions handle get and put of a bfq_group by
319 * wrapping the related blk-cgroup hooks.
322 static void bfqg_get(struct bfq_group
*bfqg
)
327 static void bfqg_put(struct bfq_group
*bfqg
)
335 static void bfqg_and_blkg_get(struct bfq_group
*bfqg
)
337 /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
340 blkg_get(bfqg_to_blkg(bfqg
));
343 void bfqg_and_blkg_put(struct bfq_group
*bfqg
)
345 blkg_put(bfqg_to_blkg(bfqg
));
351 static void bfqg_stats_reset(struct bfqg_stats
*stats
)
353 #ifdef CONFIG_BFQ_CGROUP_DEBUG
354 /* queued stats shouldn't be cleared */
355 blkg_rwstat_reset(&stats
->merged
);
356 blkg_rwstat_reset(&stats
->service_time
);
357 blkg_rwstat_reset(&stats
->wait_time
);
358 bfq_stat_reset(&stats
->time
);
359 bfq_stat_reset(&stats
->avg_queue_size_sum
);
360 bfq_stat_reset(&stats
->avg_queue_size_samples
);
361 bfq_stat_reset(&stats
->dequeue
);
362 bfq_stat_reset(&stats
->group_wait_time
);
363 bfq_stat_reset(&stats
->idle_time
);
364 bfq_stat_reset(&stats
->empty_time
);
/*
 * bfqg_stats_add_aux - fold @from's counters into @to's aux counts
 * @to: destination stats (the parent's stats when a group is going away)
 * @from: source stats
 *
 * For every debug counter except ->queued, add @from's value to the aux
 * count of the matching counter in @to.  Nothing is done when either
 * pointer is NULL, or when CONFIG_BFQ_CGROUP_DEBUG is not set.
 */
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

#ifdef CONFIG_BFQ_CGROUP_DEBUG
	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	/*
	 * The destination must be @to: adding @from->time to its own aux
	 * count would lose the value once the caller resets @from.
	 */
	bfq_stat_add_aux(&to->time, &from->time);
	bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	bfq_stat_add_aux(&to->avg_queue_size_samples,
			 &from->avg_queue_size_samples);
	bfq_stat_add_aux(&to->dequeue, &from->dequeue);
	bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	bfq_stat_add_aux(&to->idle_time, &from->idle_time);
	bfq_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
391 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
392 * recursive stats can still account for the amount used by this bfqg after
395 static void bfqg_stats_xfer_dead(struct bfq_group
*bfqg
)
397 struct bfq_group
*parent
;
399 if (!bfqg
) /* root_group */
402 parent
= bfqg_parent(bfqg
);
404 lockdep_assert_held(&bfqg_to_blkg(bfqg
)->q
->queue_lock
);
406 if (unlikely(!parent
))
409 bfqg_stats_add_aux(&parent
->stats
, &bfqg
->stats
);
410 bfqg_stats_reset(&bfqg
->stats
);
413 void bfq_init_entity(struct bfq_entity
*entity
, struct bfq_group
*bfqg
)
415 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
417 entity
->weight
= entity
->new_weight
;
418 entity
->orig_weight
= entity
->new_weight
;
420 bfqq
->ioprio
= bfqq
->new_ioprio
;
421 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
423 * Make sure that bfqg and its associated blkg do not
424 * disappear before entity.
426 bfqg_and_blkg_get(bfqg
);
428 entity
->parent
= bfqg
->my_entity
; /* NULL for root group */
429 entity
->sched_data
= &bfqg
->sched_data
;
432 static void bfqg_stats_exit(struct bfqg_stats
*stats
)
434 #ifdef CONFIG_BFQ_CGROUP_DEBUG
435 blkg_rwstat_exit(&stats
->merged
);
436 blkg_rwstat_exit(&stats
->service_time
);
437 blkg_rwstat_exit(&stats
->wait_time
);
438 blkg_rwstat_exit(&stats
->queued
);
439 bfq_stat_exit(&stats
->time
);
440 bfq_stat_exit(&stats
->avg_queue_size_sum
);
441 bfq_stat_exit(&stats
->avg_queue_size_samples
);
442 bfq_stat_exit(&stats
->dequeue
);
443 bfq_stat_exit(&stats
->group_wait_time
);
444 bfq_stat_exit(&stats
->idle_time
);
445 bfq_stat_exit(&stats
->empty_time
);
449 static int bfqg_stats_init(struct bfqg_stats
*stats
, gfp_t gfp
)
451 #ifdef CONFIG_BFQ_CGROUP_DEBUG
452 if (blkg_rwstat_init(&stats
->merged
, gfp
) ||
453 blkg_rwstat_init(&stats
->service_time
, gfp
) ||
454 blkg_rwstat_init(&stats
->wait_time
, gfp
) ||
455 blkg_rwstat_init(&stats
->queued
, gfp
) ||
456 bfq_stat_init(&stats
->time
, gfp
) ||
457 bfq_stat_init(&stats
->avg_queue_size_sum
, gfp
) ||
458 bfq_stat_init(&stats
->avg_queue_size_samples
, gfp
) ||
459 bfq_stat_init(&stats
->dequeue
, gfp
) ||
460 bfq_stat_init(&stats
->group_wait_time
, gfp
) ||
461 bfq_stat_init(&stats
->idle_time
, gfp
) ||
462 bfq_stat_init(&stats
->empty_time
, gfp
)) {
463 bfqg_stats_exit(stats
);
471 static struct bfq_group_data
*cpd_to_bfqgd(struct blkcg_policy_data
*cpd
)
473 return cpd
? container_of(cpd
, struct bfq_group_data
, pd
) : NULL
;
476 static struct bfq_group_data
*blkcg_to_bfqgd(struct blkcg
*blkcg
)
478 return cpd_to_bfqgd(blkcg_to_cpd(blkcg
, &blkcg_policy_bfq
));
481 static struct blkcg_policy_data
*bfq_cpd_alloc(gfp_t gfp
)
483 struct bfq_group_data
*bgd
;
485 bgd
= kzalloc(sizeof(*bgd
), gfp
);
491 static void bfq_cpd_init(struct blkcg_policy_data
*cpd
)
493 struct bfq_group_data
*d
= cpd_to_bfqgd(cpd
);
495 d
->weight
= cgroup_subsys_on_dfl(io_cgrp_subsys
) ?
496 CGROUP_WEIGHT_DFL
: BFQ_WEIGHT_LEGACY_DFL
;
499 static void bfq_cpd_free(struct blkcg_policy_data
*cpd
)
501 kfree(cpd_to_bfqgd(cpd
));
504 static struct blkg_policy_data
*bfq_pd_alloc(gfp_t gfp
, struct request_queue
*q
,
507 struct bfq_group
*bfqg
;
509 bfqg
= kzalloc_node(sizeof(*bfqg
), gfp
, q
->node
);
513 if (bfqg_stats_init(&bfqg
->stats
, gfp
)) {
518 /* see comments in bfq_bic_update_cgroup for why refcounting */
523 static void bfq_pd_init(struct blkg_policy_data
*pd
)
525 struct blkcg_gq
*blkg
= pd_to_blkg(pd
);
526 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
527 struct bfq_data
*bfqd
= blkg
->q
->elevator
->elevator_data
;
528 struct bfq_entity
*entity
= &bfqg
->entity
;
529 struct bfq_group_data
*d
= blkcg_to_bfqgd(blkg
->blkcg
);
531 entity
->orig_weight
= entity
->weight
= entity
->new_weight
= d
->weight
;
532 entity
->my_sched_data
= &bfqg
->sched_data
;
533 bfqg
->my_entity
= entity
; /*
534 * the root_group's will be set to NULL
535 * in bfq_init_queue()
538 bfqg
->active_entities
= 0;
539 bfqg
->rq_pos_tree
= RB_ROOT
;
542 static void bfq_pd_free(struct blkg_policy_data
*pd
)
544 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
546 bfqg_stats_exit(&bfqg
->stats
);
550 static void bfq_pd_reset_stats(struct blkg_policy_data
*pd
)
552 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
554 bfqg_stats_reset(&bfqg
->stats
);
557 static void bfq_group_set_parent(struct bfq_group
*bfqg
,
558 struct bfq_group
*parent
)
560 struct bfq_entity
*entity
;
562 entity
= &bfqg
->entity
;
563 entity
->parent
= parent
->my_entity
;
564 entity
->sched_data
= &parent
->sched_data
;
567 static struct bfq_group
*bfq_lookup_bfqg(struct bfq_data
*bfqd
,
570 struct blkcg_gq
*blkg
;
572 blkg
= blkg_lookup(blkcg
, bfqd
->queue
);
574 return blkg_to_bfqg(blkg
);
578 struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
,
581 struct bfq_group
*bfqg
, *parent
;
582 struct bfq_entity
*entity
;
584 bfqg
= bfq_lookup_bfqg(bfqd
, blkcg
);
590 * Update chain of bfq_groups as we might be handling a leaf group
591 * which, along with some of its relatives, has not been hooked yet
592 * to the private hierarchy of BFQ.
594 entity
= &bfqg
->entity
;
595 for_each_entity(entity
) {
596 bfqg
= container_of(entity
, struct bfq_group
, entity
);
597 if (bfqg
!= bfqd
->root_group
) {
598 parent
= bfqg_parent(bfqg
);
600 parent
= bfqd
->root_group
;
601 bfq_group_set_parent(bfqg
, parent
);
609 * bfq_bfqq_move - migrate @bfqq to @bfqg.
610 * @bfqd: queue descriptor.
611 * @bfqq: the queue to move.
612 * @bfqg: the group to move to.
614 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
615 * it on the new one. Avoid putting the entity on the old group idle tree.
617 * Must be called under the scheduler lock, to make sure that the blkg
618 * owning @bfqg does not disappear (see comments in
619 * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
622 void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
623 struct bfq_group
*bfqg
)
625 struct bfq_entity
*entity
= &bfqq
->entity
;
627 /* If bfqq is empty, then bfq_bfqq_expire also invokes
628 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
629 * from data structures related to current group. Otherwise we
630 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
633 if (bfqq
== bfqd
->in_service_queue
)
634 bfq_bfqq_expire(bfqd
, bfqd
->in_service_queue
,
635 false, BFQQE_PREEMPTED
);
637 if (bfq_bfqq_busy(bfqq
))
638 bfq_deactivate_bfqq(bfqd
, bfqq
, false, false);
639 else if (entity
->on_st
)
640 bfq_put_idle_entity(bfq_entity_service_tree(entity
), entity
);
641 bfqg_and_blkg_put(bfqq_group(bfqq
));
643 entity
->parent
= bfqg
->my_entity
;
644 entity
->sched_data
= &bfqg
->sched_data
;
645 /* pin down bfqg and its associated blkg */
646 bfqg_and_blkg_get(bfqg
);
648 if (bfq_bfqq_busy(bfqq
)) {
649 if (unlikely(!bfqd
->nonrot_with_queueing
))
650 bfq_pos_tree_add_move(bfqd
, bfqq
);
651 bfq_activate_bfqq(bfqd
, bfqq
);
654 if (!bfqd
->in_service_queue
&& !bfqd
->rq_in_driver
)
655 bfq_schedule_dispatch(bfqd
);
659 * __bfq_bic_change_cgroup - move @bic to @cgroup.
660 * @bfqd: the queue descriptor.
661 * @bic: the bic to move.
662 * @blkcg: the blk-cgroup to move to.
664 * Move bic to blkcg, assuming that bfqd->lock is held; which makes
665 * sure that the reference to cgroup is valid across the call (see
666 * comments in bfq_bic_update_cgroup on this issue)
668 * NOTE: an alternative approach might have been to store the current
669 * cgroup in bfqq and get a reference to it, reducing the lookup
670 * time here, at the price of slightly more complex code.
672 static struct bfq_group
*__bfq_bic_change_cgroup(struct bfq_data
*bfqd
,
673 struct bfq_io_cq
*bic
,
676 struct bfq_queue
*async_bfqq
= bic_to_bfqq(bic
, 0);
677 struct bfq_queue
*sync_bfqq
= bic_to_bfqq(bic
, 1);
678 struct bfq_group
*bfqg
;
679 struct bfq_entity
*entity
;
681 bfqg
= bfq_find_set_group(bfqd
, blkcg
);
684 bfqg
= bfqd
->root_group
;
687 entity
= &async_bfqq
->entity
;
689 if (entity
->sched_data
!= &bfqg
->sched_data
) {
690 bic_set_bfqq(bic
, NULL
, 0);
691 bfq_log_bfqq(bfqd
, async_bfqq
,
692 "bic_change_group: %p %d",
693 async_bfqq
, async_bfqq
->ref
);
694 bfq_put_queue(async_bfqq
);
699 entity
= &sync_bfqq
->entity
;
700 if (entity
->sched_data
!= &bfqg
->sched_data
)
701 bfq_bfqq_move(bfqd
, sync_bfqq
, bfqg
);
707 void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
)
709 struct bfq_data
*bfqd
= bic_to_bfqd(bic
);
710 struct bfq_group
*bfqg
= NULL
;
714 serial_nr
= __bio_blkcg(bio
)->css
.serial_nr
;
717 * Check whether blkcg has changed. The condition may trigger
718 * spuriously on a newly created cic but there's no harm.
720 if (unlikely(!bfqd
) || likely(bic
->blkcg_serial_nr
== serial_nr
))
723 bfqg
= __bfq_bic_change_cgroup(bfqd
, bic
, __bio_blkcg(bio
));
725 * Update blkg_path for bfq_log_* functions. We cache this
726 * path, and update it here, for the following
727 * reasons. Operations on blkg objects in blk-cgroup are
728 * protected with the request_queue lock, and not with the
729 * lock that protects the instances of this scheduler
730 * (bfqd->lock). This exposes BFQ to the following sort of
733 * The blkg_lookup performed in bfq_get_queue, protected
734 * through rcu, may happen to return the address of a copy of
735 * the original blkg. If this is the case, then the
736 * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
737 * the blkg, is useless: it does not prevent blk-cgroup code
738 * from destroying both the original blkg and all objects
739 * directly or indirectly referred by the copy of the
742 * On the bright side, destroy operations on a blkg invoke, as
743 * a first step, hooks of the scheduler associated with the
744 * blkg. And these hooks are executed with bfqd->lock held for
745 * BFQ. As a consequence, for any blkg associated with the
746 * request queue this instance of the scheduler is attached
747 * to, we are guaranteed that such a blkg is not destroyed, and
748 * that all the pointers it contains are consistent, while we
749 * are holding bfqd->lock. A blkg_lookup performed with
750 * bfqd->lock held then returns a fully consistent blkg, which
751 * remains consistent for as long as this lock is held.
753 * Thanks to the last fact, and to the fact that: (1) bfqg has
754 * been obtained through a blkg_lookup in the above
755 * assignment, and (2) bfqd->lock is being held, here we can
756 * safely use the policy data for the involved blkg (i.e., the
757 * field bfqg->pd) to get to the blkg associated with bfqg,
758 * and then we can safely use any field of blkg. After we
759 * release bfqd->lock, even just getting blkg through this
760 * bfqg may cause dangling references to be traversed, as
761 * bfqg->pd may not exist any more.
763 * In view of the above facts, here we cache, in the bfqg, any
764 * blkg data we may need for this bic, and for its associated
765 * bfq_queue. As of now, we need to cache only the path of the
766 * blkg, which is used in the bfq_log_* functions.
768 * Finally, note that bfqg itself needs to be protected from
769 * destruction on the blkg_free of the original blkg (which
770 * invokes bfq_pd_free). We use an additional private
771 * refcounter for bfqg, to let it disappear only after no
772 * bfq_queue refers to it any longer.
774 blkg_path(bfqg_to_blkg(bfqg
), bfqg
->blkg_path
, sizeof(bfqg
->blkg_path
));
775 bic
->blkcg_serial_nr
= serial_nr
;
781 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
782 * @st: the service tree being flushed.
784 static void bfq_flush_idle_tree(struct bfq_service_tree
*st
)
786 struct bfq_entity
*entity
= st
->first_idle
;
788 for (; entity
; entity
= st
->first_idle
)
789 __bfq_deactivate_entity(entity
, false);
793 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
794 * @bfqd: the device data structure with the root group.
795 * @entity: the entity to move.
797 static void bfq_reparent_leaf_entity(struct bfq_data
*bfqd
,
798 struct bfq_entity
*entity
)
800 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
802 bfq_bfqq_move(bfqd
, bfqq
, bfqd
->root_group
);
806 * bfq_reparent_active_entities - move to the root group all active
808 * @bfqd: the device data structure with the root group.
809 * @bfqg: the group to move from.
810 * @st: the service tree with the entities.
812 static void bfq_reparent_active_entities(struct bfq_data
*bfqd
,
813 struct bfq_group
*bfqg
,
814 struct bfq_service_tree
*st
)
816 struct rb_root
*active
= &st
->active
;
817 struct bfq_entity
*entity
= NULL
;
819 if (!RB_EMPTY_ROOT(&st
->active
))
820 entity
= bfq_entity_of(rb_first(active
));
822 for (; entity
; entity
= bfq_entity_of(rb_first(active
)))
823 bfq_reparent_leaf_entity(bfqd
, entity
);
825 if (bfqg
->sched_data
.in_service_entity
)
826 bfq_reparent_leaf_entity(bfqd
,
827 bfqg
->sched_data
.in_service_entity
);
831 * bfq_pd_offline - deactivate the entity associated with @pd,
832 * and reparent its children entities.
833 * @pd: descriptor of the policy going offline.
835 * blkio already grabs the queue_lock for us, so no need to use
838 static void bfq_pd_offline(struct blkg_policy_data
*pd
)
840 struct bfq_service_tree
*st
;
841 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
842 struct bfq_data
*bfqd
= bfqg
->bfqd
;
843 struct bfq_entity
*entity
= bfqg
->my_entity
;
847 spin_lock_irqsave(&bfqd
->lock
, flags
);
849 if (!entity
) /* root group */
850 goto put_async_queues
;
853 * Empty all service_trees belonging to this group before
854 * deactivating the group itself.
856 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++) {
857 st
= bfqg
->sched_data
.service_tree
+ i
;
860 * The idle tree may still contain bfq_queues belonging
861 * to exited tasks because they never migrated to a different
862 * cgroup from the one being destroyed now.
864 bfq_flush_idle_tree(st
);
867 * It may happen that some queues are still active
868 * (busy) upon group destruction (if the corresponding
869 * processes have been forced to terminate). We move
870 * all the leaf entities corresponding to these queues
872 * Also, it may happen that the group has an entity
873 * in service, which is disconnected from the active
874 * tree: it must be moved, too.
875 * There is no need to put the sync queues, as the
876 * scheduler has taken no reference.
878 bfq_reparent_active_entities(bfqd
, bfqg
, st
);
881 __bfq_deactivate_entity(entity
, false);
884 bfq_put_async_queues(bfqd
, bfqg
);
886 spin_unlock_irqrestore(&bfqd
->lock
, flags
);
888 * @blkg is going offline and will be ignored by
889 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
890 * that they don't get lost. If IOs complete after this point, the
891 * stats for them will be lost. Oh well...
893 bfqg_stats_xfer_dead(bfqg
);
896 void bfq_end_wr_async(struct bfq_data
*bfqd
)
898 struct blkcg_gq
*blkg
;
900 list_for_each_entry(blkg
, &bfqd
->queue
->blkg_list
, q_node
) {
901 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
903 bfq_end_wr_async_queues(bfqd
, bfqg
);
905 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
908 static int bfq_io_show_weight_legacy(struct seq_file
*sf
, void *v
)
910 struct blkcg
*blkcg
= css_to_blkcg(seq_css(sf
));
911 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
912 unsigned int val
= 0;
917 seq_printf(sf
, "%u\n", val
);
922 static u64
bfqg_prfill_weight_device(struct seq_file
*sf
,
923 struct blkg_policy_data
*pd
, int off
)
925 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
927 if (!bfqg
->entity
.dev_weight
)
929 return __blkg_prfill_u64(sf
, pd
, bfqg
->entity
.dev_weight
);
932 static int bfq_io_show_weight(struct seq_file
*sf
, void *v
)
934 struct blkcg
*blkcg
= css_to_blkcg(seq_css(sf
));
935 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
937 seq_printf(sf
, "default %u\n", bfqgd
->weight
);
938 blkcg_print_blkgs(sf
, blkcg
, bfqg_prfill_weight_device
,
939 &blkcg_policy_bfq
, 0, false);
943 static void bfq_group_set_weight(struct bfq_group
*bfqg
, u64 weight
, u64 dev_weight
)
945 weight
= dev_weight
?: weight
;
947 bfqg
->entity
.dev_weight
= dev_weight
;
949 * Setting the prio_changed flag of the entity
950 * to 1 with new_weight == weight would re-set
951 * the value of the weight to its ioprio mapping.
952 * Set the flag only if necessary.
954 if ((unsigned short)weight
!= bfqg
->entity
.new_weight
) {
955 bfqg
->entity
.new_weight
= (unsigned short)weight
;
957 * Make sure that the above new value has been
958 * stored in bfqg->entity.new_weight before
959 * setting the prio_changed flag. In fact,
960 * this flag may be read asynchronously (in
961 * critical sections protected by a different
962 * lock than that held here), and finding this
963 * flag set may cause the execution of the code
964 * for updating parameters whose value may
965 * depend also on bfqg->entity.new_weight (in
966 * __bfq_entity_update_weight_prio).
967 * This barrier makes sure that the new value
968 * of bfqg->entity.new_weight is correctly
972 bfqg
->entity
.prio_changed
= 1;
976 static int bfq_io_set_weight_legacy(struct cgroup_subsys_state
*css
,
977 struct cftype
*cftype
,
980 struct blkcg
*blkcg
= css_to_blkcg(css
);
981 struct bfq_group_data
*bfqgd
= blkcg_to_bfqgd(blkcg
);
982 struct blkcg_gq
*blkg
;
985 if (val
< BFQ_MIN_WEIGHT
|| val
> BFQ_MAX_WEIGHT
)
989 spin_lock_irq(&blkcg
->lock
);
990 bfqgd
->weight
= (unsigned short)val
;
991 hlist_for_each_entry(blkg
, &blkcg
->blkg_list
, blkcg_node
) {
992 struct bfq_group
*bfqg
= blkg_to_bfqg(blkg
);
995 bfq_group_set_weight(bfqg
, val
, 0);
997 spin_unlock_irq(&blkcg
->lock
);
1002 static ssize_t
bfq_io_set_device_weight(struct kernfs_open_file
*of
,
1003 char *buf
, size_t nbytes
,
1007 struct blkg_conf_ctx ctx
;
1008 struct blkcg
*blkcg
= css_to_blkcg(of_css(of
));
1009 struct bfq_group
*bfqg
;
1012 ret
= blkg_conf_prep(blkcg
, &blkcg_policy_bfq
, buf
, &ctx
);
1016 if (sscanf(ctx
.body
, "%llu", &v
) == 1) {
1017 /* require "default" on dfl */
1021 } else if (!strcmp(strim(ctx
.body
), "default")) {
1028 bfqg
= blkg_to_bfqg(ctx
.blkg
);
1031 if (!v
|| (v
>= BFQ_MIN_WEIGHT
&& v
<= BFQ_MAX_WEIGHT
)) {
1032 bfq_group_set_weight(bfqg
, bfqg
->entity
.weight
, v
);
1036 blkg_conf_finish(&ctx
);
1037 return ret
?: nbytes
;
1040 static ssize_t
bfq_io_set_weight(struct kernfs_open_file
*of
,
1041 char *buf
, size_t nbytes
,
1050 /* "WEIGHT" or "default WEIGHT" sets the default weight */
1051 v
= simple_strtoull(buf
, &endp
, 0);
1052 if (*endp
== '\0' || sscanf(buf
, "default %llu", &v
) == 1) {
1053 ret
= bfq_io_set_weight_legacy(of_css(of
), NULL
, v
);
1054 return ret
?: nbytes
;
1057 return bfq_io_set_device_weight(of
, buf
, nbytes
, off
);
1060 #ifdef CONFIG_BFQ_CGROUP_DEBUG
1061 static int bfqg_print_stat(struct seq_file
*sf
, void *v
)
1063 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_stat
,
1064 &blkcg_policy_bfq
, seq_cft(sf
)->private, false);
1068 static int bfqg_print_rwstat(struct seq_file
*sf
, void *v
)
1070 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)), blkg_prfill_rwstat
,
1071 &blkcg_policy_bfq
, seq_cft(sf
)->private, true);
1075 static u64
bfqg_prfill_stat_recursive(struct seq_file
*sf
,
1076 struct blkg_policy_data
*pd
, int off
)
1078 struct blkcg_gq
*blkg
= pd_to_blkg(pd
);
1079 struct blkcg_gq
*pos_blkg
;
1080 struct cgroup_subsys_state
*pos_css
;
1083 lockdep_assert_held(&blkg
->q
->queue_lock
);
1086 blkg_for_each_descendant_pre(pos_blkg
, pos_css
, blkg
) {
1087 struct bfq_stat
*stat
;
1089 if (!pos_blkg
->online
)
1092 stat
= (void *)blkg_to_pd(pos_blkg
, &blkcg_policy_bfq
) + off
;
1093 sum
+= bfq_stat_read(stat
) + atomic64_read(&stat
->aux_cnt
);
1097 return __blkg_prfill_u64(sf
, pd
, sum
);
1100 static u64
bfqg_prfill_rwstat_recursive(struct seq_file
*sf
,
1101 struct blkg_policy_data
*pd
, int off
)
1103 struct blkg_rwstat_sample sum
;
1105 blkg_rwstat_recursive_sum(pd_to_blkg(pd
), &blkcg_policy_bfq
, off
, &sum
);
1106 return __blkg_prfill_rwstat(sf
, pd
, &sum
);
1109 static int bfqg_print_stat_recursive(struct seq_file
*sf
, void *v
)
1111 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
1112 bfqg_prfill_stat_recursive
, &blkcg_policy_bfq
,
1113 seq_cft(sf
)->private, false);
1117 static int bfqg_print_rwstat_recursive(struct seq_file
*sf
, void *v
)
1119 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
1120 bfqg_prfill_rwstat_recursive
, &blkcg_policy_bfq
,
1121 seq_cft(sf
)->private, true);
1125 static u64
bfqg_prfill_sectors(struct seq_file
*sf
, struct blkg_policy_data
*pd
,
1128 u64 sum
= blkg_rwstat_total(&pd
->blkg
->stat_bytes
);
1130 return __blkg_prfill_u64(sf
, pd
, sum
>> 9);
1133 static int bfqg_print_stat_sectors(struct seq_file
*sf
, void *v
)
1135 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
1136 bfqg_prfill_sectors
, &blkcg_policy_bfq
, 0, false);
1140 static u64
bfqg_prfill_sectors_recursive(struct seq_file
*sf
,
1141 struct blkg_policy_data
*pd
, int off
)
1143 struct blkg_rwstat_sample tmp
;
1145 blkg_rwstat_recursive_sum(pd
->blkg
, NULL
,
1146 offsetof(struct blkcg_gq
, stat_bytes
), &tmp
);
1148 return __blkg_prfill_u64(sf
, pd
,
1149 (tmp
.cnt
[BLKG_RWSTAT_READ
] + tmp
.cnt
[BLKG_RWSTAT_WRITE
]) >> 9);
1152 static int bfqg_print_stat_sectors_recursive(struct seq_file
*sf
, void *v
)
1154 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
1155 bfqg_prfill_sectors_recursive
, &blkcg_policy_bfq
, 0,
1160 static u64
bfqg_prfill_avg_queue_size(struct seq_file
*sf
,
1161 struct blkg_policy_data
*pd
, int off
)
1163 struct bfq_group
*bfqg
= pd_to_bfqg(pd
);
1164 u64 samples
= bfq_stat_read(&bfqg
->stats
.avg_queue_size_samples
);
1168 v
= bfq_stat_read(&bfqg
->stats
.avg_queue_size_sum
);
1169 v
= div64_u64(v
, samples
);
1171 __blkg_prfill_u64(sf
, pd
, v
);
1175 /* print avg_queue_size */
1176 static int bfqg_print_avg_queue_size(struct seq_file
*sf
, void *v
)
1178 blkcg_print_blkgs(sf
, css_to_blkcg(seq_css(sf
)),
1179 bfqg_prfill_avg_queue_size
, &blkcg_policy_bfq
,
1183 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
/*
 * Group-scheduling variant (this definition sits before the #else of
 * CONFIG_BFQ_GROUP_IOSCHED).
 *
 * Activates blkcg_policy_bfq on @bfqd's request queue and returns the
 * bfq_group that corresponds to the queue's root blkg.
 * @node is not used by the visible code.
 * NOTE(review): the declaration of 'ret' and whatever is done when
 * blkcg_activate_policy() fails are not visible in this excerpt --
 * confirm the error path.
 */
1185 struct bfq_group
*bfq_create_group_hierarchy(struct bfq_data
*bfqd
, int node
)
1189 ret
= blkcg_activate_policy(bfqd
->queue
, &blkcg_policy_bfq
);
1193 return blkg_to_bfqg(bfqd
->queue
->root_blkg
);
/*
 * blkcg policy descriptor for BFQ.
 *
 * Ties together the two cftype tables defined in this file
 * (bfq_blkg_files as .dfl_cftypes, bfq_blkcg_legacy_files as
 * .legacy_cftypes) and the per-blkcg (cpd_*) and per-blkg (pd_*)
 * callbacks.  Note that bfq_cpd_init is installed for both .cpd_init_fn
 * and .cpd_bind_fn.
 */
1196 struct blkcg_policy blkcg_policy_bfq
= {
1197 .dfl_cftypes
= bfq_blkg_files
,
1198 .legacy_cftypes
= bfq_blkcg_legacy_files
,
1200 .cpd_alloc_fn
= bfq_cpd_alloc
,
1201 .cpd_init_fn
= bfq_cpd_init
,
1202 .cpd_bind_fn
= bfq_cpd_init
,
1203 .cpd_free_fn
= bfq_cpd_free
,
1205 .pd_alloc_fn
= bfq_pd_alloc
,
1206 .pd_init_fn
= bfq_pd_init
,
1207 .pd_offline_fn
= bfq_pd_offline
,
1208 .pd_free_fn
= bfq_pd_free
,
1209 .pd_reset_stats_fn
= bfq_pd_reset_stats
,
/*
 * Control files registered through blkcg_policy_bfq.legacy_cftypes.
 *
 * Layout of the table:
 *  - the weight knobs ("bfq.weight", "bfq.weight_device"), both flagged
 *    CFTYPE_NOT_ON_ROOT;
 *  - per-group statistics, followed by their "*_recursive" counterparts;
 *  - additional statistics that exist only when CONFIG_BFQ_CGROUP_DEBUG
 *    is set.
 *
 * For the debug statistics, ->private holds the offsetof() of the counter
 * inside struct bfq_group.  The byte/io-count entries instead store the
 * address of blkcg_policy_bfq in ->private.
 */
1212 struct cftype bfq_blkcg_legacy_files
[] = {
1214 .name
= "bfq.weight",
1215 .flags
= CFTYPE_NOT_ON_ROOT
,
1216 .seq_show
= bfq_io_show_weight_legacy
,
1217 .write_u64
= bfq_io_set_weight_legacy
,
1220 .name
= "bfq.weight_device",
1221 .flags
= CFTYPE_NOT_ON_ROOT
,
1222 .seq_show
= bfq_io_show_weight
,
1223 .write
= bfq_io_set_weight
,
1226 /* statistics, covers only the tasks in the bfqg */
1228 .name
= "bfq.io_service_bytes",
1229 .private = (unsigned long)&blkcg_policy_bfq
,
1230 .seq_show
= blkg_print_stat_bytes
,
1233 .name
= "bfq.io_serviced",
1234 .private = (unsigned long)&blkcg_policy_bfq
,
1235 .seq_show
= blkg_print_stat_ios
,
1237 #ifdef CONFIG_BFQ_CGROUP_DEBUG
/*
 * NOTE(review): no .name initializer is visible for this first debug
 * entry in this excerpt; by symmetry with "bfq.time_recursive" further
 * down it is presumably "bfq.time" -- confirm against the full file.
 */
1240 .private = offsetof(struct bfq_group
, stats
.time
),
1241 .seq_show
= bfqg_print_stat
,
1244 .name
= "bfq.sectors",
1245 .seq_show
= bfqg_print_stat_sectors
,
1248 .name
= "bfq.io_service_time",
1249 .private = offsetof(struct bfq_group
, stats
.service_time
),
1250 .seq_show
= bfqg_print_rwstat
,
1253 .name
= "bfq.io_wait_time",
1254 .private = offsetof(struct bfq_group
, stats
.wait_time
),
1255 .seq_show
= bfqg_print_rwstat
,
1258 .name
= "bfq.io_merged",
1259 .private = offsetof(struct bfq_group
, stats
.merged
),
1260 .seq_show
= bfqg_print_rwstat
,
1263 .name
= "bfq.io_queued",
1264 .private = offsetof(struct bfq_group
, stats
.queued
),
1265 .seq_show
= bfqg_print_rwstat
,
1267 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
1269 /* the same statistics which cover the bfqg and its descendants */
1271 .name
= "bfq.io_service_bytes_recursive",
1272 .private = (unsigned long)&blkcg_policy_bfq
,
1273 .seq_show
= blkg_print_stat_bytes_recursive
,
1276 .name
= "bfq.io_serviced_recursive",
1277 .private = (unsigned long)&blkcg_policy_bfq
,
1278 .seq_show
= blkg_print_stat_ios_recursive
,
1280 #ifdef CONFIG_BFQ_CGROUP_DEBUG
1282 .name
= "bfq.time_recursive",
1283 .private = offsetof(struct bfq_group
, stats
.time
),
1284 .seq_show
= bfqg_print_stat_recursive
,
1287 .name
= "bfq.sectors_recursive",
1288 .seq_show
= bfqg_print_stat_sectors_recursive
,
1291 .name
= "bfq.io_service_time_recursive",
1292 .private = offsetof(struct bfq_group
, stats
.service_time
),
1293 .seq_show
= bfqg_print_rwstat_recursive
,
1296 .name
= "bfq.io_wait_time_recursive",
1297 .private = offsetof(struct bfq_group
, stats
.wait_time
),
1298 .seq_show
= bfqg_print_rwstat_recursive
,
1301 .name
= "bfq.io_merged_recursive",
1302 .private = offsetof(struct bfq_group
, stats
.merged
),
1303 .seq_show
= bfqg_print_rwstat_recursive
,
1306 .name
= "bfq.io_queued_recursive",
1307 .private = offsetof(struct bfq_group
, stats
.queued
),
1308 .seq_show
= bfqg_print_rwstat_recursive
,
1311 .name
= "bfq.avg_queue_size",
1312 .seq_show
= bfqg_print_avg_queue_size
,
1315 .name
= "bfq.group_wait_time",
1316 .private = offsetof(struct bfq_group
, stats
.group_wait_time
),
1317 .seq_show
= bfqg_print_stat
,
1320 .name
= "bfq.idle_time",
1321 .private = offsetof(struct bfq_group
, stats
.idle_time
),
1322 .seq_show
= bfqg_print_stat
,
1325 .name
= "bfq.empty_time",
1326 .private = offsetof(struct bfq_group
, stats
.empty_time
),
1327 .seq_show
= bfqg_print_stat
,
1330 .name
= "bfq.dequeue",
1331 .private = offsetof(struct bfq_group
, stats
.dequeue
),
1332 .seq_show
= bfqg_print_stat
,
1334 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
/*
 * Control files registered through blkcg_policy_bfq.dfl_cftypes.
 * The only entry visible here is "bfq.weight": flagged CFTYPE_NOT_ON_ROOT,
 * shown by bfq_io_show_weight and written through bfq_io_set_weight (the
 * same handlers the legacy table uses for "bfq.weight_device").
 */
1338 struct cftype bfq_blkg_files
[] = {
1340 .name
= "bfq.weight",
1341 .flags
= CFTYPE_NOT_ON_ROOT
,
1342 .seq_show
= bfq_io_show_weight
,
1343 .write
= bfq_io_set_weight
,
1348 #else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Stub for builds without CONFIG_BFQ_GROUP_IOSCHED: the body is empty, so
 * calling it has no effect and all three arguments are ignored.
 */
1350 void bfq_bfqq_move(struct bfq_data
*bfqd
, struct bfq_queue
*bfqq
,
1351 struct bfq_group
*bfqg
) {}
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED.
 *
 * Initializes @entity from its pending ("new_") values and attaches it to
 * @bfqg:
 *  - weight and orig_weight are both set from new_weight;
 *  - the queue obtained via bfq_entity_to_bfqq() gets ioprio and
 *    ioprio_class copied from new_ioprio and new_ioprio_class;
 *  - entity->sched_data is pointed at @bfqg's sched_data.
 *
 * NOTE(review): whether the bfqq dereferences are guarded against
 * bfq_entity_to_bfqq() returning NULL is not visible in this excerpt --
 * confirm.
 */
1353 void bfq_init_entity(struct bfq_entity
*entity
, struct bfq_group
*bfqg
)
1355 struct bfq_queue
*bfqq
= bfq_entity_to_bfqq(entity
);
1357 entity
->weight
= entity
->new_weight
;
1358 entity
->orig_weight
= entity
->new_weight
;
1360 bfqq
->ioprio
= bfqq
->new_ioprio
;
1361 bfqq
->ioprio_class
= bfqq
->new_ioprio_class
;
1363 entity
->sched_data
= &bfqg
->sched_data
;
/*
 * Stub for builds without CONFIG_BFQ_GROUP_IOSCHED: the body is empty, so
 * @bic and @bio are ignored.
 */
1366 void bfq_bic_update_cgroup(struct bfq_io_cq
*bic
, struct bio
*bio
) {}
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED: forwards to
 * bfq_end_wr_async_queues() for @bfqd's root_group, the only group this
 * branch of the file ever hands out.
 */
1368 void bfq_end_wr_async(struct bfq_data
*bfqd
)
1370 bfq_end_wr_async_queues(bfqd
, bfqd
->root_group
);
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED: always returns
 * @bfqd's root_group; @blkcg is not consulted.
 */
1373 struct bfq_group
*bfq_find_set_group(struct bfq_data
*bfqd
, struct blkcg
*blkcg
)
1375 return bfqd
->root_group
;
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED: returns the
 * root_group of the bfq_data that @bfqq points to.
 */
1378 struct bfq_group
*bfqq_group(struct bfq_queue
*bfqq
)
1380 return bfqq
->bfqd
->root_group
;
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED.
 *
 * Allocates one bfq_group with kmalloc_node() on @node (GFP_KERNEL with
 * __GFP_ZERO, so the memory is requested zeroed) and sets each of the
 * BFQ_IOPRIO_CLASSES service trees in its sched_data to
 * BFQ_SERVICE_TREE_INIT.
 * @bfqd is not used by the visible code.
 *
 * NOTE(review): the declaration of 'i', the allocation-failure check and
 * the final return are not visible in this excerpt -- confirm that a NULL
 * result from kmalloc_node() is handled before the loop dereferences bfqg.
 */
1383 struct bfq_group
*bfq_create_group_hierarchy(struct bfq_data
*bfqd
, int node
)
1385 struct bfq_group
*bfqg
;
1388 bfqg
= kmalloc_node(sizeof(*bfqg
), GFP_KERNEL
| __GFP_ZERO
, node
);
1392 for (i
= 0; i
< BFQ_IOPRIO_CLASSES
; i
++)
1393 bfqg
->sched_data
.service_tree
[i
] = BFQ_SERVICE_TREE_INIT
;
1397 #endif /* CONFIG_BFQ_GROUP_IOSCHED */