updated on Sat Jan 14 12:12:45 UTC 2012
[aur-mirror.git] / linux-n130 / 0003-block-introduce-the-BFQ-v3r1-I-O-sched-for-3.1.patch
blob5ead978af456cf64ae2e961036247547e75b70a3
1 From 344896ccf3aa4c97659f5576579779892ed4caa3 Mon Sep 17 00:00:00 2001
2 From: Arianna Avanzini <avanzini.arianna@gmail.com>
3 Date: Tue, 18 Oct 2011 21:23:59 +0200
4 Subject: [PATCH 3/3] block: introduce the BFQ-v3r1 I/O sched for 3.1
6 Add the BFQ-v3r1 I/O scheduler to 3.1.
7 The general structure is borrowed from CFQ, as much of the code. A (bfq_)queue
8 is associated to each task doing I/O on a device, and each time a scheduling
9 decision has to be taken a queue is selected and it is served until it expires.
11 The main differences are:
12 - Slices are given in the service domain: tasks are assigned budgets,
13 measured in number of sectors. Once it gets access to the disk, a task must
14 however consume its assigned budget within a configurable maximum time
15 (by default, the maximum possible value of the budgets is automatically
16 computed to comply with this timeout). This allows the desired latency
17 vs "throughput boosting" tradeoff to be set.
19 - Budgets are scheduled according to a variant of WF2Q+, implemented
20 using an augmented rb-tree to take eligibility into account while
21 preserving an O(log N) overall complexity.
23 - A low-latency tunable is provided; if enabled, both interactive and soft
24 real-time applications are guaranteed very low latency.
26 - Latency guarantees are preserved also in presence of NCQ.
28 - Useful features borrowed from CFQ: cooperating-queues merging (with
29 some additional optimizations with respect to the original CFQ version),
30 static fallback queue for OOM.
32 - BFQ supports full hierarchical scheduling, exporting a cgroups
33 interface. Each node has a full scheduler, so each group can
34 be assigned its own ioprio and an ioprio_class.
36 - If the cgroups interface is used, weights can be explicitly assigned,
37 otherwise ioprio values are mapped to weights using the relation
38 weight = IOPRIO_BE_NR - ioprio.
40 - ioprio classes are served in strict priority order, i.e., lower
41 priority queues are not served as long as there are higher priority
42 queues. Among queues in the same class the bandwidth is distributed
43 in proportion to the weights of each queue. A very thin extra bandwidth
44 is however guaranteed to the Idle class, to prevent it from starving.
46 Regarding what has not changed it is worth noting:
47 - the handling of cfq_io_contexts to associate queues to tasks.
48 Much of the code has been reused just renaming it. (There is room for
49 code sharing with CFQ but we wanted to minimize the impact of this
50 patch.)
52 - The handling of async queues.
54 - The handling of idle windows.
56 - The handling of merging.
58 - The heuristics to assert that a task is worth an idle window (with
59 minor modifications to hw_tag/CIC_SEEKY detection).
61 Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
62 Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
63 ---
64 block/bfq-cgroup.c | 768 ++++++++++++++
65 block/bfq-ioc.c | 380 +++++++
66 block/bfq-iosched.c | 2937 +++++++++++++++++++++++++++++++++++++++++++++++++++
67 block/bfq-sched.c | 1037 ++++++++++++++++++
68 block/bfq.h | 585 ++++++++++
69 5 files changed, 5707 insertions(+), 0 deletions(-)
70 create mode 100644 block/bfq-cgroup.c
71 create mode 100644 block/bfq-ioc.c
72 create mode 100644 block/bfq-iosched.c
73 create mode 100644 block/bfq-sched.c
74 create mode 100644 block/bfq.h
76 diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
77 new file mode 100644
78 index 0000000..436c29a
79 --- /dev/null
80 +++ b/block/bfq-cgroup.c
81 @@ -0,0 +1,768 @@
82 +/*
83 + * BFQ: CGROUPS support.
84 + *
85 + * Based on ideas and code from CFQ:
86 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
87 + *
88 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
89 + * Paolo Valente <paolo.valente@unimore.it>
90 + *
91 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
92 + */
94 +#ifdef CONFIG_CGROUP_BFQIO
95 +static struct bfqio_cgroup bfqio_root_cgroup = {
96 + .weight = BFQ_DEFAULT_GRP_WEIGHT,
97 + .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
98 + .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
99 +};
101 +static inline void bfq_init_entity(struct bfq_entity *entity,
102 + struct bfq_group *bfqg)
104 + entity->weight = entity->new_weight;
105 + entity->orig_weight = entity->new_weight;
106 + entity->ioprio = entity->new_ioprio;
107 + entity->ioprio_class = entity->new_ioprio_class;
108 + entity->parent = bfqg->my_entity;
109 + entity->sched_data = &bfqg->sched_data;
112 +static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
114 + return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
115 + struct bfqio_cgroup, css);
119 + * Search the bfq_group for bfqd into the hash table (by now only a list)
120 + * of bgrp. Must be called under rcu_read_lock().
121 + */
122 +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
123 + struct bfq_data *bfqd)
125 + struct bfq_group *bfqg;
126 + struct hlist_node *n;
127 + void *key;
129 + hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) {
130 + key = rcu_dereference(bfqg->bfqd);
131 + if (key == bfqd)
132 + return bfqg;
135 + return NULL;
138 +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
139 + struct bfq_group *bfqg)
141 + struct bfq_entity *entity = &bfqg->entity;
143 + entity->weight = entity->new_weight = bgrp->weight;
144 + entity->orig_weight = entity->new_weight;
145 + entity->ioprio = entity->new_ioprio = bgrp->ioprio;
146 + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
147 + entity->ioprio_changed = 1;
148 + entity->my_sched_data = &bfqg->sched_data;
151 +static inline void bfq_group_set_parent(struct bfq_group *bfqg,
152 + struct bfq_group *parent)
154 + struct bfq_entity *entity;
156 + BUG_ON(parent == NULL);
157 + BUG_ON(bfqg == NULL);
159 + entity = &bfqg->entity;
160 + entity->parent = parent->my_entity;
161 + entity->sched_data = &parent->sched_data;
164 +/**
165 + * bfq_group_chain_alloc - allocate a chain of groups.
166 + * @bfqd: queue descriptor.
167 + * @cgroup: the leaf cgroup this chain starts from.
169 + * Allocate a chain of groups starting from the one belonging to
170 + * @cgroup up to the root cgroup. Stop if a cgroup on the chain
171 + * to the root has already an allocated group on @bfqd.
172 + */
173 +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
174 + struct cgroup *cgroup)
176 + struct bfqio_cgroup *bgrp;
177 + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
179 + for (; cgroup != NULL; cgroup = cgroup->parent) {
180 + bgrp = cgroup_to_bfqio(cgroup);
182 + bfqg = bfqio_lookup_group(bgrp, bfqd);
183 + if (bfqg != NULL) {
184 + /*
185 + * All the cgroups in the path from there to the
186 + * root must have a bfq_group for bfqd, so we don't
187 + * need any more allocations.
188 + */
189 + break;
192 + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
193 + if (bfqg == NULL)
194 + goto cleanup;
196 + bfq_group_init_entity(bgrp, bfqg);
197 + bfqg->my_entity = &bfqg->entity;
199 + if (leaf == NULL) {
200 + leaf = bfqg;
201 + prev = leaf;
202 + } else {
203 + bfq_group_set_parent(prev, bfqg);
204 + /*
205 + * Build a list of allocated nodes using the bfqd
206 + field, that is still unused and will be initialized
207 + * only after the node will be connected.
208 + */
209 + prev->bfqd = bfqg;
210 + prev = bfqg;
214 + return leaf;
216 +cleanup:
217 + while (leaf != NULL) {
218 + prev = leaf;
219 + leaf = leaf->bfqd;
220 + kfree(prev);
223 + return NULL;
226 +/**
227 + * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
228 + * @bfqd: the queue descriptor.
229 + * @cgroup: the leaf cgroup to start from.
230 + * @leaf: the leaf group (to be associated to @cgroup).
232 + * Try to link a chain of groups to a cgroup hierarchy, connecting the
233 + * nodes bottom-up, so we can be sure that when we find a cgroup in the
234 + * hierarchy that already has a group associated to @bfqd all the nodes
235 + * in the path to the root cgroup have one too.
237 + * On locking: the queue lock protects the hierarchy (there is a hierarchy
238 + * per device) while the bfqio_cgroup lock protects the list of groups
239 + * belonging to the same cgroup.
240 + */
241 +static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
242 + struct bfq_group *leaf)
244 + struct bfqio_cgroup *bgrp;
245 + struct bfq_group *bfqg, *next, *prev = NULL;
246 + unsigned long flags;
248 + assert_spin_locked(bfqd->queue->queue_lock);
250 + for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
251 + bgrp = cgroup_to_bfqio(cgroup);
252 + next = leaf->bfqd;
254 + bfqg = bfqio_lookup_group(bgrp, bfqd);
255 + BUG_ON(bfqg != NULL);
257 + spin_lock_irqsave(&bgrp->lock, flags);
259 + rcu_assign_pointer(leaf->bfqd, bfqd);
260 + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
261 + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
263 + spin_unlock_irqrestore(&bgrp->lock, flags);
265 + prev = leaf;
266 + leaf = next;
269 + BUG_ON(cgroup == NULL && leaf != NULL);
270 + if (cgroup != NULL && prev != NULL) {
271 + bgrp = cgroup_to_bfqio(cgroup);
272 + bfqg = bfqio_lookup_group(bgrp, bfqd);
273 + bfq_group_set_parent(prev, bfqg);
277 +/**
278 + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
279 + * @bfqd: queue descriptor.
280 + * @cgroup: cgroup being searched for.
282 + * Return a group associated to @bfqd in @cgroup, allocating one if
283 + * necessary. When a group is returned all the cgroups in the path
284 + * to the root have a group associated to @bfqd.
286 + * If the allocation fails, return the root group: this breaks guarantees
287 + * but is a safe fallback. If this loss becomes a problem it can be
288 + * mitigated using the equivalent weight (given by the product of the
289 + * weights of the groups in the path from @group to the root) in the
290 + * root scheduler.
292 + * We allocate all the missing nodes in the path from the leaf cgroup
293 + * to the root and we connect the nodes only after all the allocations
294 + * have been successful.
295 + */
296 +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
297 + struct cgroup *cgroup)
299 + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
300 + struct bfq_group *bfqg;
302 + bfqg = bfqio_lookup_group(bgrp, bfqd);
303 + if (bfqg != NULL)
304 + return bfqg;
306 + bfqg = bfq_group_chain_alloc(bfqd, cgroup);
307 + if (bfqg != NULL)
308 + bfq_group_chain_link(bfqd, cgroup, bfqg);
309 + else
310 + bfqg = bfqd->root_group;
312 + return bfqg;
315 +/**
316 + * bfq_bfqq_move - migrate @bfqq to @bfqg.
317 + * @bfqd: queue descriptor.
318 + * @bfqq: the queue to move.
319 + * @entity: @bfqq's entity.
320 + * @bfqg: the group to move to.
322 + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
323 + * it on the new one. Avoid putting the entity on the old group idle tree.
325 + * Must be called under the queue lock; the cgroup owning @bfqg must
326 + * not disappear (by now this just means that we are called under
327 + * rcu_read_lock()).
328 + */
329 +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
330 + struct bfq_entity *entity, struct bfq_group *bfqg)
332 + int busy, resume;
334 + busy = bfq_bfqq_busy(bfqq);
335 + resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
337 + BUG_ON(resume && !entity->on_st);
338 + BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
340 + if (busy) {
341 + BUG_ON(atomic_read(&bfqq->ref) < 2);
343 + if (!resume)
344 + bfq_del_bfqq_busy(bfqd, bfqq, 0);
345 + else
346 + bfq_deactivate_bfqq(bfqd, bfqq, 0);
349 + /*
350 + * Here we use a reference to bfqg. We don't need a refcounter
351 + * as the cgroup reference will not be dropped, so that its
352 + * destroy() callback will not be invoked.
353 + */
354 + entity->parent = bfqg->my_entity;
355 + entity->sched_data = &bfqg->sched_data;
357 + if (busy && resume)
358 + bfq_activate_bfqq(bfqd, bfqq);
361 +/**
362 + * __bfq_cic_change_cgroup - move @cic to @cgroup.
363 + * @bfqd: the queue descriptor.
364 + * @cic: the cic to move.
365 + * @cgroup: the cgroup to move to.
367 + * Move cic to cgroup, assuming that bfqd->queue is locked; the caller
368 + * has to make sure that the reference to cgroup is valid across the call.
370 + * NOTE: an alternative approach might have been to store the current
371 + * cgroup in bfqq and getting a reference to it, reducing the lookup
372 + * time here, at the price of slightly more complex code.
373 + */
374 +static struct bfq_group *__bfq_cic_change_cgroup(struct bfq_data *bfqd,
375 + struct cfq_io_context *cic,
376 + struct cgroup *cgroup)
378 + struct bfq_queue *async_bfqq = cic_to_bfqq(cic, 0);
379 + struct bfq_queue *sync_bfqq = cic_to_bfqq(cic, 1);
380 + struct bfq_entity *entity;
381 + struct bfq_group *bfqg;
382 + struct bfqio_cgroup *bgrp;
384 + bgrp = cgroup_to_bfqio(cgroup);
386 + bfqg = bfq_find_alloc_group(bfqd, cgroup);
387 + if (async_bfqq != NULL) {
388 + entity = &async_bfqq->entity;
390 + if (entity->sched_data != &bfqg->sched_data) {
391 + cic_set_bfqq(cic, NULL, 0);
392 + bfq_log_bfqq(bfqd, async_bfqq,
393 + "cic_change_group: %p %d",
394 + async_bfqq, atomic_read(&async_bfqq->ref));
395 + bfq_put_queue(async_bfqq);
399 + if (sync_bfqq != NULL) {
400 + entity = &sync_bfqq->entity;
401 + if (entity->sched_data != &bfqg->sched_data)
402 + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
405 + return bfqg;
408 +/**
409 + * bfq_cic_change_cgroup - move @cic to @cgroup.
410 + * @cic: the cic being migrated.
411 + * @cgroup: the destination cgroup.
413 + * When the task owning @cic is moved to @cgroup, @cic is immediately
414 + * moved into its new parent group.
415 + */
416 +static void bfq_cic_change_cgroup(struct cfq_io_context *cic,
417 + struct cgroup *cgroup)
419 + struct bfq_data *bfqd;
420 + unsigned long uninitialized_var(flags);
422 + bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
423 + if (bfqd != NULL) {
424 + __bfq_cic_change_cgroup(bfqd, cic, cgroup);
425 + bfq_put_bfqd_unlock(bfqd, &flags);
429 +/**
430 + * bfq_cic_update_cgroup - update the cgroup of @cic.
431 + * @cic: the @cic to update.
433 + * Make sure that @cic is enqueued in the cgroup of the current task.
434 + * We need this in addition to moving cics during the cgroup attach
435 + * phase because the task owning @cic could be at its first disk
436 + * access or we may end up in the root cgroup as the result of a
437 + * memory allocation failure and here we try to move to the right
438 + * group.
440 + * Must be called under the queue lock. It is safe to use the returned
441 + * value even after the rcu_read_unlock() as the migration/destruction
442 + * paths act under the queue lock too. IOW it is impossible to race with
443 + * group migration/destruction and end up with an invalid group as:
444 + * a) here cgroup has not yet been destroyed, nor its destroy callback
445 + * has started execution, as current holds a reference to it,
446 + * b) if it is destroyed after rcu_read_unlock() [after current is
447 + * migrated to a different cgroup] its attach() callback will have
448 + * taken care of remove all the references to the old cgroup data.
449 + */
450 +static struct bfq_group *bfq_cic_update_cgroup(struct cfq_io_context *cic)
452 + struct bfq_data *bfqd = cic->key;
453 + struct bfq_group *bfqg;
454 + struct cgroup *cgroup;
456 + BUG_ON(bfqd == NULL);
458 + rcu_read_lock();
459 + cgroup = task_cgroup(current, bfqio_subsys_id);
460 + bfqg = __bfq_cic_change_cgroup(bfqd, cic, cgroup);
461 + rcu_read_unlock();
463 + return bfqg;
466 +/**
467 + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
468 + * @st: the service tree being flushed.
469 + */
470 +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
472 + struct bfq_entity *entity = st->first_idle;
474 + for (; entity != NULL; entity = st->first_idle)
475 + __bfq_deactivate_entity(entity, 0);
478 +/**
479 + * bfq_destroy_group - destroy @bfqg.
480 + * @bgrp: the bfqio_cgroup containing @bfqg.
481 + * @bfqg: the group being destroyed.
483 + * Destroy @bfqg, making sure that it is not referenced from its parent.
484 + */
485 +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
487 + struct bfq_data *bfqd;
488 + struct bfq_service_tree *st;
489 + struct bfq_entity *entity = bfqg->my_entity;
490 + unsigned long uninitialized_var(flags);
491 + int i;
493 + hlist_del(&bfqg->group_node);
495 + /*
496 + * We may race with device destruction, take extra care when
497 + * dereferencing bfqg->bfqd.
498 + */
499 + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
500 + if (bfqd != NULL) {
501 + hlist_del(&bfqg->bfqd_node);
502 + __bfq_deactivate_entity(entity, 0);
503 + bfq_put_async_queues(bfqd, bfqg);
504 + bfq_put_bfqd_unlock(bfqd, &flags);
507 + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
508 + st = bfqg->sched_data.service_tree + i;
510 + /*
511 + * The idle tree may still contain bfq_queues belonging
512 + * to exited task because they never migrated to a different
513 + * cgroup from the one being destroyed now. No one else
514 + * can access them so it's safe to act without any lock.
515 + */
516 + bfq_flush_idle_tree(st);
518 + BUG_ON(!RB_EMPTY_ROOT(&st->active));
519 + BUG_ON(!RB_EMPTY_ROOT(&st->idle));
521 + BUG_ON(bfqg->sched_data.next_active != NULL);
522 + BUG_ON(bfqg->sched_data.active_entity != NULL);
523 + BUG_ON(entity->tree != NULL);
525 + /*
526 + * No need to defer the kfree() to the end of the RCU grace
527 + * period: we are called from the destroy() callback of our
528 + * cgroup, so we can be sure that no one is a) still using
529 + * this cgroup or b) doing lookups in it.
530 + */
531 + kfree(bfqg);
534 +/**
535 + * bfq_disconnect_groups - disconnect @bfqd from all its groups.
536 + * @bfqd: the device descriptor being exited.
538 + * When the device exits we just make sure that no lookup can return
539 + * the now unused group structures. They will be deallocated on cgroup
540 + * destruction.
541 + */
542 +static void bfq_disconnect_groups(struct bfq_data *bfqd)
544 + struct hlist_node *pos, *n;
545 + struct bfq_group *bfqg;
547 + bfq_log(bfqd, "disconnect_groups beginning") ;
548 + hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) {
549 + hlist_del(&bfqg->bfqd_node);
551 + __bfq_deactivate_entity(bfqg->my_entity, 0);
553 + /*
554 + * Don't remove from the group hash, just set an
555 + * invalid key. No lookups can race with the
556 + * assignment as bfqd is being destroyed; this
557 + * implies also that new elements cannot be added
558 + * to the list.
559 + */
560 + rcu_assign_pointer(bfqg->bfqd, NULL);
562 + bfq_log(bfqd, "disconnect_groups: put async for group %p",
563 + bfqg) ;
564 + bfq_put_async_queues(bfqd, bfqg);
568 +static inline void bfq_free_root_group(struct bfq_data *bfqd)
570 + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
571 + struct bfq_group *bfqg = bfqd->root_group;
573 + bfq_put_async_queues(bfqd, bfqg);
575 + spin_lock_irq(&bgrp->lock);
576 + hlist_del_rcu(&bfqg->group_node);
577 + spin_unlock_irq(&bgrp->lock);
579 + /*
580 + * No need to synchronize_rcu() here: since the device is gone
581 + * there cannot be any read-side access to its root_group.
582 + */
583 + kfree(bfqg);
586 +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
588 + struct bfq_group *bfqg;
589 + struct bfqio_cgroup *bgrp;
590 + int i;
592 + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
593 + if (bfqg == NULL)
594 + return NULL;
596 + bfqg->entity.parent = NULL;
597 + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
598 + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
600 + bgrp = &bfqio_root_cgroup;
601 + spin_lock_irq(&bgrp->lock);
602 + rcu_assign_pointer(bfqg->bfqd, bfqd);
603 + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
604 + spin_unlock_irq(&bgrp->lock);
606 + return bfqg;
609 +#define SHOW_FUNCTION(__VAR) \
610 +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
611 + struct cftype *cftype) \
612 +{ \
613 + struct bfqio_cgroup *bgrp; \
614 + u64 ret; \
616 + if (!cgroup_lock_live_group(cgroup)) \
617 + return -ENODEV; \
619 + bgrp = cgroup_to_bfqio(cgroup); \
620 + spin_lock_irq(&bgrp->lock); \
621 + ret = bgrp->__VAR; \
622 + spin_unlock_irq(&bgrp->lock); \
624 + cgroup_unlock(); \
626 + return ret; \
629 +SHOW_FUNCTION(weight);
630 +SHOW_FUNCTION(ioprio);
631 +SHOW_FUNCTION(ioprio_class);
632 +#undef SHOW_FUNCTION
634 +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
635 +static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
636 + struct cftype *cftype, \
637 + u64 val) \
638 +{ \
639 + struct bfqio_cgroup *bgrp; \
640 + struct bfq_group *bfqg; \
641 + struct hlist_node *n; \
643 + if (val < (__MIN) || val > (__MAX)) \
644 + return -EINVAL; \
646 + if (!cgroup_lock_live_group(cgroup)) \
647 + return -ENODEV; \
649 + bgrp = cgroup_to_bfqio(cgroup); \
651 + spin_lock_irq(&bgrp->lock); \
652 + bgrp->__VAR = (unsigned short)val; \
653 + hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \
654 + bfqg->entity.new_##__VAR = (unsigned short)val; \
655 + smp_wmb(); \
656 + bfqg->entity.ioprio_changed = 1; \
657 + } \
658 + spin_unlock_irq(&bgrp->lock); \
660 + cgroup_unlock(); \
662 + return 0; \
665 +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
666 +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
667 +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
668 +#undef STORE_FUNCTION
670 +static struct cftype bfqio_files[] = {
672 + .name = "weight",
673 + .read_u64 = bfqio_cgroup_weight_read,
674 + .write_u64 = bfqio_cgroup_weight_write,
675 + },
677 + .name = "ioprio",
678 + .read_u64 = bfqio_cgroup_ioprio_read,
679 + .write_u64 = bfqio_cgroup_ioprio_write,
680 + },
682 + .name = "ioprio_class",
683 + .read_u64 = bfqio_cgroup_ioprio_class_read,
684 + .write_u64 = bfqio_cgroup_ioprio_class_write,
685 + },
688 +static int bfqio_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
690 + return cgroup_add_files(cgroup, subsys, bfqio_files,
691 + ARRAY_SIZE(bfqio_files));
694 +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys *subsys,
695 + struct cgroup *cgroup)
697 + struct bfqio_cgroup *bgrp;
699 + if (cgroup->parent != NULL) {
700 + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
701 + if (bgrp == NULL)
702 + return ERR_PTR(-ENOMEM);
703 + } else
704 + bgrp = &bfqio_root_cgroup;
706 + spin_lock_init(&bgrp->lock);
707 + INIT_HLIST_HEAD(&bgrp->group_data);
708 + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
709 + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
711 + return &bgrp->css;
715 + * We cannot support shared io contexts, as we have no mean to support
716 + * two tasks with the same ioc in two different groups without major rework
717 + * of the main cic/bfqq data structures. By now we allow a task to change
718 + * its cgroup only if it's the only owner of its ioc; the drawback of this
719 + * behavior is that a group containing a task that forked using CLONE_IO
720 + * will not be destroyed until the tasks sharing the ioc die.
721 + */
722 +static int bfqio_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
723 + struct task_struct *tsk)
725 + struct io_context *ioc;
726 + int ret = 0;
728 + /* task_lock() is needed to avoid races with exit_io_context() */
729 + task_lock(tsk);
730 + ioc = tsk->io_context;
731 + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
732 + /*
733 + * ioc == NULL means that the task is either too young or
734 + * exiting: if it has still no ioc the ioc can't be shared,
735 + * if the task is exiting the attach will fail anyway, no
736 + * matter what we return here.
737 + */
738 + ret = -EINVAL;
739 + task_unlock(tsk);
741 + return ret;
744 +static void bfqio_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
745 + struct cgroup *prev, struct task_struct *tsk)
747 + struct io_context *ioc;
748 + struct cfq_io_context *cic;
749 + struct hlist_node *n;
751 + task_lock(tsk);
752 + ioc = tsk->io_context;
753 + if (ioc != NULL) {
754 + BUG_ON(atomic_long_read(&ioc->refcount) == 0);
755 + atomic_long_inc(&ioc->refcount);
757 + task_unlock(tsk);
759 + if (ioc == NULL)
760 + return;
762 + rcu_read_lock();
763 + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list)
764 + bfq_cic_change_cgroup(cic, cgroup);
765 + rcu_read_unlock();
767 + put_io_context(ioc);
770 +static void bfqio_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
772 + struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
773 + struct hlist_node *n, *tmp;
774 + struct bfq_group *bfqg;
776 + /*
777 + * Since we are destroying the cgroup, there are no more tasks
778 + * referencing it, and all the RCU grace periods that may have
779 + * referenced it are ended (as the destruction of the parent
780 + * cgroup is RCU-safe); bgrp->group_data will not be accessed by
781 + * anything else and we don't need any synchronization.
782 + */
783 + hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node)
784 + bfq_destroy_group(bgrp, bfqg);
786 + BUG_ON(!hlist_empty(&bgrp->group_data));
788 + kfree(bgrp);
791 +struct cgroup_subsys bfqio_subsys = {
792 + .name = "bfqio",
793 + .create = bfqio_create,
794 + .can_attach = bfqio_can_attach,
795 + .attach = bfqio_attach,
796 + .destroy = bfqio_destroy,
797 + .populate = bfqio_populate,
798 + .subsys_id = bfqio_subsys_id,
800 +#else
801 +static inline void bfq_init_entity(struct bfq_entity *entity,
802 + struct bfq_group *bfqg)
804 + entity->weight = entity->new_weight;
805 + entity->orig_weight = entity->new_weight;
806 + entity->ioprio = entity->new_ioprio;
807 + entity->ioprio_class = entity->new_ioprio_class;
808 + entity->sched_data = &bfqg->sched_data;
811 +static inline struct bfq_group *
812 +bfq_cic_update_cgroup(struct cfq_io_context *cic)
814 + struct bfq_data *bfqd = cic->key;
815 + return bfqd->root_group;
818 +static inline void bfq_bfqq_move(struct bfq_data *bfqd,
819 + struct bfq_queue *bfqq,
820 + struct bfq_entity *entity,
821 + struct bfq_group *bfqg)
825 +static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
827 + bfq_put_async_queues(bfqd, bfqd->root_group);
830 +static inline void bfq_free_root_group(struct bfq_data *bfqd)
832 + kfree(bfqd->root_group);
835 +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
837 + struct bfq_group *bfqg;
838 + int i;
840 + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
841 + if (bfqg == NULL)
842 + return NULL;
844 + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
845 + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
847 + return bfqg;
849 +#endif
850 diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
851 new file mode 100644
852 index 0000000..8f2b6c6
853 --- /dev/null
854 +++ b/block/bfq-ioc.c
855 @@ -0,0 +1,380 @@
857 + * BFQ: I/O context handling.
859 + * Based on ideas and code from CFQ:
860 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
862 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
863 + * Paolo Valente <paolo.valente@unimore.it>
864 + */
866 +/**
867 + * bfq_cic_free_rcu - deferred cic freeing.
868 + * @head: RCU head of the cic to free.
870 + * Free the cic containing @head and, if it was the last one and
871 + * the module is exiting wake up anyone waiting for its deallocation
872 + * (see bfq_exit()).
873 + */
874 +static void bfq_cic_free_rcu(struct rcu_head *head)
876 + struct cfq_io_context *cic;
878 + cic = container_of(head, struct cfq_io_context, rcu_head);
880 + kmem_cache_free(bfq_ioc_pool, cic);
881 + elv_ioc_count_dec(bfq_ioc_count);
883 + if (bfq_ioc_gone != NULL) {
884 + spin_lock(&bfq_ioc_gone_lock);
885 + if (bfq_ioc_gone != NULL &&
886 + !elv_ioc_count_read(bfq_ioc_count)) {
887 + complete(bfq_ioc_gone);
888 + bfq_ioc_gone = NULL;
890 + spin_unlock(&bfq_ioc_gone_lock);
894 +static void bfq_cic_free(struct cfq_io_context *cic)
896 + call_rcu(&cic->rcu_head, bfq_cic_free_rcu);
899 +/**
900 + * cic_free_func - disconnect a cic ready to be freed.
901 + * @ioc: the io_context @cic belongs to.
902 + * @cic: the cic to be freed.
904 + * Remove @cic from the @ioc radix tree hash and from its cic list,
905 + * deferring the deallocation of @cic to the end of the current RCU
906 + * grace period. This assumes that __bfq_exit_single_io_context()
907 + * has already been called for @cic.
908 + */
909 +static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
911 + unsigned long flags;
912 + unsigned long dead_key = (unsigned long) cic->key;
914 + BUG_ON(!(dead_key & CIC_DEAD_KEY));
916 + spin_lock_irqsave(&ioc->lock, flags);
917 + radix_tree_delete(&ioc->bfq_radix_root,
918 + dead_key >> CIC_DEAD_INDEX_SHIFT);
919 + hlist_del_init_rcu(&cic->cic_list);
920 + spin_unlock_irqrestore(&ioc->lock, flags);
922 + bfq_cic_free(cic);
925 +static void bfq_free_io_context(struct io_context *ioc)
927 + /*
928 + * ioc->refcount is zero here, or we are called from elv_unregister(),
929 + * so no more cic's are allowed to be linked into this ioc. So it
930 + * should be ok to iterate over the known list, we will see all cic's
931 + * since no new ones are added.
932 + */
933 + call_for_each_cic(ioc, cic_free_func);
936 +/**
937 + * __bfq_exit_single_io_context - deassociate @cic from any running task.
938 + * @bfqd: bfq_data on which @cic is valid.
939 + * @cic: the cic being exited.
941 + * Whenever no more tasks are using @cic or @bfqd is deallocated we
942 + * need to invalidate its entry in the radix tree hash table and to
943 + * release the queues it refers to.
945 + * Called under the queue lock.
946 + */
947 +static void __bfq_exit_single_io_context(struct bfq_data *bfqd,
948 + struct cfq_io_context *cic)
950 + struct io_context *ioc = cic->ioc;
952 + list_del_init(&cic->queue_list);
954 + /*
955 + * Make sure dead mark is seen for dead queues
956 + */
957 + smp_wmb();
958 + rcu_assign_pointer(cic->key, bfqd_dead_key(bfqd));
960 + /*
961 + * No write-side locking as no task is using @ioc (they're exited
962 + * or bfqd is being deallocated.
963 + */
964 + rcu_read_lock();
965 + if (rcu_dereference(ioc->ioc_data) == cic) {
966 + rcu_read_unlock();
967 + spin_lock(&ioc->lock);
968 + rcu_assign_pointer(ioc->ioc_data, NULL);
969 + spin_unlock(&ioc->lock);
970 + } else
971 + rcu_read_unlock();
973 + if (cic->cfqq[BLK_RW_ASYNC] != NULL) {
974 + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_ASYNC]);
975 + cic->cfqq[BLK_RW_ASYNC] = NULL;
978 + if (cic->cfqq[BLK_RW_SYNC] != NULL) {
979 + bfq_exit_bfqq(bfqd, cic->cfqq[BLK_RW_SYNC]);
980 + cic->cfqq[BLK_RW_SYNC] = NULL;
984 +/**
985 + * bfq_exit_single_io_context - deassociate @cic from @ioc (unlocked version).
986 + * @ioc: the io_context @cic belongs to.
987 + * @cic: the cic being exited.
989 + * Take the queue lock and call __bfq_exit_single_io_context() to do the
990 + * rest of the work. We take care of possible races with bfq_exit_queue()
991 + * using bfq_get_bfqd_locked() (and abusing a little bit the RCU mechanism).
992 + */
993 +static void bfq_exit_single_io_context(struct io_context *ioc,
994 + struct cfq_io_context *cic)
996 + struct bfq_data *bfqd;
997 + unsigned long uninitialized_var(flags);
999 + bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
1000 + if (bfqd != NULL) {
1001 + __bfq_exit_single_io_context(bfqd, cic);
1002 + bfq_put_bfqd_unlock(bfqd, &flags);
1006 +/**
1007 + * bfq_exit_io_context - deassociate @ioc from all cics it owns.
1008 + * @ioc: the @ioc being exited.
1010 + * No more processes are using @ioc we need to clean up and put the
1011 + * internal structures we have that belongs to that process. Loop
1012 + * through all its cics, locking their queues and exiting them.
1013 + */
1014 +static void bfq_exit_io_context(struct io_context *ioc)
1016 + call_for_each_cic(ioc, bfq_exit_single_io_context);
1019 +static struct cfq_io_context *bfq_alloc_io_context(struct bfq_data *bfqd,
1020 + gfp_t gfp_mask)
1022 + struct cfq_io_context *cic;
1024 + cic = kmem_cache_alloc_node(bfq_ioc_pool, gfp_mask | __GFP_ZERO,
1025 + bfqd->queue->node);
1026 + if (cic != NULL) {
1027 + cic->ttime.last_end_request = jiffies;
1028 + INIT_LIST_HEAD(&cic->queue_list);
1029 + INIT_HLIST_NODE(&cic->cic_list);
1030 + cic->dtor = bfq_free_io_context;
1031 + cic->exit = bfq_exit_io_context;
1032 + elv_ioc_count_inc(bfq_ioc_count);
1035 + return cic;
1038 +/**
1039 + * bfq_drop_dead_cic - free an exited cic.
1040 + * @bfqd: bfq data for the device in use.
1041 + * @ioc: io_context owning @cic.
1042 + * @cic: the @cic to free.
1044 + * We drop cfq io contexts lazily, so we may find a dead one.
1045 + */
1046 +static void bfq_drop_dead_cic(struct bfq_data *bfqd, struct io_context *ioc,
1047 + struct cfq_io_context *cic)
1049 + unsigned long flags;
1051 + WARN_ON(!list_empty(&cic->queue_list));
1052 + BUG_ON(cic->key != bfqd_dead_key(bfqd));
1054 + spin_lock_irqsave(&ioc->lock, flags);
1056 + BUG_ON(ioc->ioc_data == cic);
1058 + /*
1059 + * With shared I/O contexts two lookups may race and drop the
1060 + * same cic more than one time: RCU guarantees that the storage
1061 + * will not be freed too early, here we make sure that we do
1062 + * not try to remove the cic from the hashing structures multiple
1063 + * times.
1064 + */
1065 + if (!hlist_unhashed(&cic->cic_list)) {
1066 + radix_tree_delete(&ioc->bfq_radix_root, bfqd->cic_index);
1067 + hlist_del_init_rcu(&cic->cic_list);
1068 + bfq_cic_free(cic);
1071 + spin_unlock_irqrestore(&ioc->lock, flags);
1074 +/**
1075 + * bfq_cic_lookup - search into @ioc a cic associated to @bfqd.
1076 + * @bfqd: the lookup key.
1077 + * @ioc: the io_context of the process doing I/O.
1079 + * If @ioc already has a cic associated to @bfqd return it, return %NULL
1080 + * otherwise.
1081 + */
1082 +static struct cfq_io_context *bfq_cic_lookup(struct bfq_data *bfqd,
1083 + struct io_context *ioc)
1085 + struct cfq_io_context *cic;
1086 + unsigned long flags;
1087 + void *k;
1089 + if (unlikely(ioc == NULL))
1090 + return NULL;
1092 + rcu_read_lock();
1094 + /* We maintain a last-hit cache, to avoid browsing over the tree. */
1095 + cic = rcu_dereference(ioc->ioc_data);
1096 + if (cic != NULL) {
1097 + k = rcu_dereference(cic->key);
1098 + if (k == bfqd)
1099 + goto out;
1102 + do {
1103 + cic = radix_tree_lookup(&ioc->bfq_radix_root,
1104 + bfqd->cic_index);
1105 + if (cic == NULL)
1106 + goto out;
1108 + k = rcu_dereference(cic->key);
1109 + if (unlikely(k != bfqd)) {
1110 + rcu_read_unlock();
1111 + bfq_drop_dead_cic(bfqd, ioc, cic);
1112 + rcu_read_lock();
1113 + continue;
1116 + spin_lock_irqsave(&ioc->lock, flags);
1117 + rcu_assign_pointer(ioc->ioc_data, cic);
1118 + spin_unlock_irqrestore(&ioc->lock, flags);
1119 + break;
1120 + } while (1);
1122 +out:
1123 + rcu_read_unlock();
1125 + return cic;
1128 +/**
1129 + * bfq_cic_link - add @cic to @ioc.
1130 + * @bfqd: bfq_data @cic refers to.
1131 + * @ioc: io_context @cic belongs to.
1132 + * @cic: the cic to link.
1133 + * @gfp_mask: the mask to use for radix tree preallocations.
1135 + * Add @cic to @ioc, using @bfqd as the search key. This enables us to
1136 + * lookup the process specific cfq io context when entered from the block
1137 + * layer. Also adds @cic to a per-bfqd list, used when this queue is
1138 + * removed.
1139 + */
1140 +static int bfq_cic_link(struct bfq_data *bfqd, struct io_context *ioc,
1141 + struct cfq_io_context *cic, gfp_t gfp_mask)
1143 + unsigned long flags;
1144 + int ret;
1146 + ret = radix_tree_preload(gfp_mask);
1147 + if (ret == 0) {
1148 + cic->ioc = ioc;
1150 + /* No write-side locking, cic is not published yet. */
1151 + rcu_assign_pointer(cic->key, bfqd);
1153 + spin_lock_irqsave(&ioc->lock, flags);
1154 + ret = radix_tree_insert(&ioc->bfq_radix_root,
1155 + bfqd->cic_index, cic);
1156 + if (ret == 0)
1157 + hlist_add_head_rcu(&cic->cic_list, &ioc->bfq_cic_list);
1158 + spin_unlock_irqrestore(&ioc->lock, flags);
1160 + radix_tree_preload_end();
1162 + if (ret == 0) {
1163 + spin_lock_irqsave(bfqd->queue->queue_lock, flags);
1164 + list_add(&cic->queue_list, &bfqd->cic_list);
1165 + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
1169 + if (ret != 0)
1170 + printk(KERN_ERR "bfq: cic link failed!\n");
1172 + return ret;
1175 +/**
1176 + * bfq_ioc_set_ioprio - signal a priority change to the cics belonging to @ioc.
1177 + * @ioc: the io_context changing its priority.
1178 + */
1179 +static inline void bfq_ioc_set_ioprio(struct io_context *ioc)
1181 + call_for_each_cic(ioc, bfq_changed_ioprio);
1184 +/**
1185 + * bfq_get_io_context - return the @cic associated to @bfqd in @ioc.
1186 + * @bfqd: the search key.
1187 + * @gfp_mask: the mask to use for cic allocation.
1189 + * Setup general io context and cfq io context. There can be several cfq
1190 + * io contexts per general io context, if this process is doing io to more
1191 + * than one device managed by cfq.
1192 + */
1193 +static struct cfq_io_context *bfq_get_io_context(struct bfq_data *bfqd,
1194 + gfp_t gfp_mask)
1196 + struct io_context *ioc = NULL;
1197 + struct cfq_io_context *cic;
1199 + might_sleep_if(gfp_mask & __GFP_WAIT);
1201 + ioc = get_io_context(gfp_mask, bfqd->queue->node);
1202 + if (ioc == NULL)
1203 + return NULL;
1205 + /* Lookup for an existing cic. */
1206 + cic = bfq_cic_lookup(bfqd, ioc);
1207 + if (cic != NULL)
1208 + goto out;
1210 + /* Alloc one if needed. */
1211 + cic = bfq_alloc_io_context(bfqd, gfp_mask);
1212 + if (cic == NULL)
1213 + goto err;
1215 + /* Link it into the ioc's radix tree and cic list. */
1216 + if (bfq_cic_link(bfqd, ioc, cic, gfp_mask) != 0)
1217 + goto err_free;
1219 +out:
1220 + /*
1221 + * test_and_clear_bit() implies a memory barrier, paired with
1222 + * the wmb() in fs/ioprio.c, so the value seen for ioprio is the
1223 + * new one.
1224 + */
1225 + if (unlikely(test_and_clear_bit(IOC_BFQ_IOPRIO_CHANGED,
1226 + ioc->ioprio_changed)))
1227 + bfq_ioc_set_ioprio(ioc);
1229 + return cic;
1230 +err_free:
1231 + bfq_cic_free(cic);
1232 +err:
1233 + put_io_context(ioc);
1234 + return NULL;
1236 diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1237 new file mode 100644
1238 index 0000000..985547f
1239 --- /dev/null
1240 +++ b/block/bfq-iosched.c
1241 @@ -0,0 +1,2937 @@
1243 + * BFQ, or Budget Fair Queueing, disk scheduler.
1245 + * Based on ideas and code from CFQ:
1246 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
1248 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
1249 + * Paolo Valente <paolo.valente@unimore.it>
1251 + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1253 + * BFQ is a proportional share disk scheduling algorithm based on the
1254 + * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1255 + * measured in number of sectors, to tasks instead of time slices.
1256 + * The disk is not granted to the active task for a given time slice,
1257 + * but until it has exahusted its assigned budget. This change from
1258 + * the time to the service domain allows BFQ to distribute the disk
1259 + * bandwidth among tasks as desired, without any distortion due to
1260 + * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
1261 + * internal scheduler, called B-WF2Q+, to schedule tasks according to
1262 + * their budgets. Thanks to this accurate scheduler, BFQ can afford
1263 + * to assign high budgets to disk-bound non-seeky tasks (to boost the
1264 + * throughput), and yet guarantee low latencies to interactive and
1265 + * soft real-time applications.
1267 + * BFQ has been introduced in [1], where the interested reader can
1268 + * find an accurate description of the algorithm, the bandwidth
1269 + * distribution and latency guarantees it provides, plus formal proofs
1270 + * of all the properties. With respect to the algorithm presented in
1271 + * the paper, this implementation adds several little heuristics, and
1272 + * a hierarchical extension, based on H-WF2Q+.
1274 + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1275 + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1276 + * complexity derives from the one introduced with EEVDF in [3].
1278 + * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1279 + * with Deterministic Guarantees on Bandwidth Distribution,'',
1280 + * IEEE Transactions on Computer, May 2010.
1282 + * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
1284 + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1285 + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1286 + * Oct 1997.
1288 + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1290 + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1291 + * First: A Flexible and Accurate Mechanism for Proportional Share
1292 + * Resource Allocation,'' technical report.
1294 + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1295 + */
1296 +#include <linux/module.h>
1297 +#include <linux/slab.h>
1298 +#include <linux/blkdev.h>
1299 +#include <linux/cgroup.h>
1300 +#include <linux/elevator.h>
1301 +#include <linux/jiffies.h>
1302 +#include <linux/rbtree.h>
1303 +#include <linux/ioprio.h>
1304 +#include "bfq.h"
1306 +/* Max number of dispatches in one round of service. */
1307 +static const int bfq_quantum = 4;
1309 +/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1310 +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1312 +/* Maximum backwards seek, in KiB. */
1313 +static const int bfq_back_max = 16 * 1024;
1315 +/* Penalty of a backwards seek, in number of sectors. */
1316 +static const int bfq_back_penalty = 2;
1318 +/* Idling period duration, in jiffies. */
1319 +static int bfq_slice_idle = HZ / 125;
1321 +/* Default maximum budget values, in sectors and number of requests. */
1322 +static const int bfq_default_max_budget = 16 * 1024;
1323 +static const int bfq_max_budget_async_rq = 4;
1326 + * Async to sync throughput distribution is controlled as follows:
1327 + * when an async request is served, the entity is charged the number
1328 + * of sectors of the request, multipled by the factor below
1329 + */
1330 +static const int bfq_async_charge_factor = 10;
1332 +/* Default timeout values, in jiffies, approximating CFQ defaults. */
1333 +static const int bfq_timeout_sync = HZ / 8;
1334 +static int bfq_timeout_async = HZ / 25;
1336 +struct kmem_cache *bfq_pool;
1337 +struct kmem_cache *bfq_ioc_pool;
1339 +static DEFINE_PER_CPU(unsigned long, bfq_ioc_count);
1340 +static struct completion *bfq_ioc_gone;
1341 +static DEFINE_SPINLOCK(bfq_ioc_gone_lock);
1343 +static DEFINE_SPINLOCK(cic_index_lock);
1344 +static DEFINE_IDA(cic_index_ida);
1346 +/* Below this threshold (in ms), we consider thinktime immediate. */
1347 +#define BFQ_MIN_TT 2
1349 +/* hw_tag detection: parallel requests threshold and min samples needed. */
1350 +#define BFQ_HW_QUEUE_THRESHOLD 4
1351 +#define BFQ_HW_QUEUE_SAMPLES 32
1353 +#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1354 +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1356 +/* Min samples used for peak rate estimation (for autotuning). */
1357 +#define BFQ_PEAK_RATE_SAMPLES 32
1359 +/* Shift used for peak rate fixed precision calculations. */
1360 +#define BFQ_RATE_SHIFT 16
1362 +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1363 + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1365 +#define RQ_CIC(rq) \
1366 + ((struct cfq_io_context *) (rq)->elevator_private[0])
1367 +#define RQ_BFQQ(rq) ((rq)->elevator_private[1])
1369 +#include "bfq-ioc.c"
1370 +#include "bfq-sched.c"
1371 +#include "bfq-cgroup.c"
1373 +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1374 + IOPRIO_CLASS_IDLE)
1375 +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1376 + IOPRIO_CLASS_RT)
1378 +#define bfq_sample_valid(samples) ((samples) > 80)
1381 + * We regard a request as SYNC, if either it's a read or has the SYNC bit
1382 + * set (in which case it could also be a direct WRITE).
1383 + */
1384 +static inline int bfq_bio_sync(struct bio *bio)
1386 + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1387 + return 1;
1389 + return 0;
1393 + * Scheduler run of queue, if there are requests pending and no one in the
1394 + * driver that will restart queueing.
1395 + */
1396 +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1398 + if (bfqd->queued != 0) {
1399 + bfq_log(bfqd, "schedule dispatch");
1400 + kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1405 + * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1406 + * We choose the request that is closesr to the head right now. Distance
1407 + * behind the head is penalized and only allowed to a certain extent.
1408 + */
1409 +static struct request *bfq_choose_req(struct bfq_data *bfqd,
1410 + struct request *rq1,
1411 + struct request *rq2,
1412 + sector_t last)
1414 + sector_t s1, s2, d1 = 0, d2 = 0;
1415 + unsigned long back_max;
1416 +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1417 +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1418 + unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1420 + if (rq1 == NULL || rq1 == rq2)
1421 + return rq2;
1422 + if (rq2 == NULL)
1423 + return rq1;
1425 + if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1426 + return rq1;
1427 + else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1428 + return rq2;
1429 + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1430 + return rq1;
1431 + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1432 + return rq2;
1434 + s1 = blk_rq_pos(rq1);
1435 + s2 = blk_rq_pos(rq2);
1437 + /*
1438 + * By definition, 1KiB is 2 sectors.
1439 + */
1440 + back_max = bfqd->bfq_back_max * 2;
1442 + /*
1443 + * Strict one way elevator _except_ in the case where we allow
1444 + * short backward seeks which are biased as twice the cost of a
1445 + * similar forward seek.
1446 + */
1447 + if (s1 >= last)
1448 + d1 = s1 - last;
1449 + else if (s1 + back_max >= last)
1450 + d1 = (last - s1) * bfqd->bfq_back_penalty;
1451 + else
1452 + wrap |= BFQ_RQ1_WRAP;
1454 + if (s2 >= last)
1455 + d2 = s2 - last;
1456 + else if (s2 + back_max >= last)
1457 + d2 = (last - s2) * bfqd->bfq_back_penalty;
1458 + else
1459 + wrap |= BFQ_RQ2_WRAP;
1461 + /* Found required data */
1463 + /*
1464 + * By doing switch() on the bit mask "wrap" we avoid having to
1465 + * check two variables for all permutations: --> faster!
1466 + */
1467 + switch (wrap) {
1468 + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1469 + if (d1 < d2)
1470 + return rq1;
1471 + else if (d2 < d1)
1472 + return rq2;
1473 + else {
1474 + if (s1 >= s2)
1475 + return rq1;
1476 + else
1477 + return rq2;
1480 + case BFQ_RQ2_WRAP:
1481 + return rq1;
1482 + case BFQ_RQ1_WRAP:
1483 + return rq2;
1484 + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1485 + default:
1486 + /*
1487 + * Since both rqs are wrapped,
1488 + * start with the one that's further behind head
1489 + * (--> only *one* back seek required),
1490 + * since back seek takes more time than forward.
1491 + */
1492 + if (s1 <= s2)
1493 + return rq1;
1494 + else
1495 + return rq2;
1499 +static struct bfq_queue *
1500 +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1501 + sector_t sector, struct rb_node **ret_parent,
1502 + struct rb_node ***rb_link)
1504 + struct rb_node **p, *parent;
1505 + struct bfq_queue *bfqq = NULL;
1507 + parent = NULL;
1508 + p = &root->rb_node;
1509 + while (*p) {
1510 + struct rb_node **n;
1512 + parent = *p;
1513 + bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1515 + /*
1516 + * Sort strictly based on sector. Smallest to the left,
1517 + * largest to the right.
1518 + */
1519 + if (sector > blk_rq_pos(bfqq->next_rq))
1520 + n = &(*p)->rb_right;
1521 + else if (sector < blk_rq_pos(bfqq->next_rq))
1522 + n = &(*p)->rb_left;
1523 + else
1524 + break;
1525 + p = n;
1526 + bfqq = NULL;
1529 + *ret_parent = parent;
1530 + if (rb_link)
1531 + *rb_link = p;
1533 + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1534 + (long long unsigned)sector,
1535 + bfqq != NULL ? bfqq->pid : 0);
1537 + return bfqq;
1540 +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1542 + struct rb_node **p, *parent;
1543 + struct bfq_queue *__bfqq;
1545 + if (bfqq->pos_root != NULL) {
1546 + rb_erase(&bfqq->pos_node, bfqq->pos_root);
1547 + bfqq->pos_root = NULL;
1550 + if (bfq_class_idle(bfqq))
1551 + return;
1552 + if (!bfqq->next_rq)
1553 + return;
1555 + bfqq->pos_root = &bfqd->rq_pos_tree;
1556 + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1557 + blk_rq_pos(bfqq->next_rq), &parent, &p);
1558 + if (__bfqq == NULL) {
1559 + rb_link_node(&bfqq->pos_node, parent, p);
1560 + rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1561 + } else
1562 + bfqq->pos_root = NULL;
1565 +static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1566 + struct bfq_queue *bfqq,
1567 + struct request *last)
1569 + struct rb_node *rbnext = rb_next(&last->rb_node);
1570 + struct rb_node *rbprev = rb_prev(&last->rb_node);
1571 + struct request *next = NULL, *prev = NULL;
1573 + BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1575 + if (rbprev != NULL)
1576 + prev = rb_entry_rq(rbprev);
1578 + if (rbnext != NULL)
1579 + next = rb_entry_rq(rbnext);
1580 + else {
1581 + rbnext = rb_first(&bfqq->sort_list);
1582 + if (rbnext && rbnext != &last->rb_node)
1583 + next = rb_entry_rq(rbnext);
1586 + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1589 +static void bfq_del_rq_rb(struct request *rq)
1591 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
1592 + struct bfq_data *bfqd = bfqq->bfqd;
1593 + const int sync = rq_is_sync(rq);
1595 + BUG_ON(bfqq->queued[sync] == 0);
1596 + bfqq->queued[sync]--;
1597 + bfqd->queued--;
1599 + elv_rb_del(&bfqq->sort_list, rq);
1601 + if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1602 + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
1603 + bfq_del_bfqq_busy(bfqd, bfqq, 1);
1604 + /*
1605 + * Remove queue from request-position tree as it is empty.
1606 + */
1607 + if (bfqq->pos_root != NULL) {
1608 + rb_erase(&bfqq->pos_node, bfqq->pos_root);
1609 + bfqq->pos_root = NULL;
1614 +/* see the definition of bfq_async_charge_factor for details */
1615 +static inline unsigned long bfq_serv_to_charge(struct request *rq,
1616 + struct bfq_queue *bfqq)
1618 + return blk_rq_sectors(rq) *
1619 + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1620 + bfq_async_charge_factor));
1623 +/**
1624 + * bfq_updated_next_req - update the queue after a new next_rq selection.
1625 + * @bfqd: the device data the queue belongs to.
1626 + * @bfqq: the queue to update.
1628 + * If the first request of a queue changes we make sure that the queue
1629 + * has enough budget to serve at least its first request (if the
1630 + * request has grown). We do this because if the queue has not enough
1631 + * budget for its first request, it has to go through two dispatch
1632 + * rounds to actually get it dispatched.
1633 + */
1634 +static void bfq_updated_next_req(struct bfq_data *bfqd,
1635 + struct bfq_queue *bfqq)
1637 + struct bfq_entity *entity = &bfqq->entity;
1638 + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1639 + struct request *next_rq = bfqq->next_rq;
1640 + unsigned long new_budget;
1642 + if (next_rq == NULL)
1643 + return;
1645 + if (bfqq == bfqd->active_queue)
1646 + /*
1647 + * In order not to break guarantees, budgets cannot be
1648 + * changed after an entity has been selected.
1649 + */
1650 + return;
1652 + BUG_ON(entity->tree != &st->active);
1653 + BUG_ON(entity == entity->sched_data->active_entity);
1655 + new_budget = max_t(unsigned long, bfqq->max_budget,
1656 + bfq_serv_to_charge(next_rq, bfqq));
1657 + entity->budget = new_budget;
1658 + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1659 + bfq_activate_bfqq(bfqd, bfqq);
1662 +static void bfq_add_rq_rb(struct request *rq)
1664 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
1665 + struct bfq_entity *entity = &bfqq->entity;
1666 + struct bfq_data *bfqd = bfqq->bfqd;
1667 + struct request *next_rq, *prev;
1668 + unsigned long old_raising_coeff = bfqq->raising_coeff;
1669 + int idle_for_long_time = bfqq->budget_timeout +
1670 + bfqd->bfq_raising_min_idle_time < jiffies;
1672 + bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1673 + bfqq->queued[rq_is_sync(rq)]++;
1674 + bfqd->queued++;
1676 + elv_rb_add(&bfqq->sort_list, rq);
1678 + /*
1679 + * Check if this request is a better next-serve candidate.
1680 + */
1681 + prev = bfqq->next_rq;
1682 + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1683 + BUG_ON(next_rq == NULL);
1684 + bfqq->next_rq = next_rq;
1686 + /*
1687 + * Adjust priority tree position, if next_rq changes.
1688 + */
1689 + if (prev != bfqq->next_rq)
1690 + bfq_rq_pos_tree_add(bfqd, bfqq);
1692 + if (!bfq_bfqq_busy(bfqq)) {
1693 + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1694 + bfqq->soft_rt_next_start < jiffies;
1695 + entity->budget = max_t(unsigned long, bfqq->max_budget,
1696 + bfq_serv_to_charge(next_rq, bfqq));
1698 + if (! bfqd->low_latency)
1699 + goto add_bfqq_busy;
1701 + /*
1702 + * If the queue is not being boosted and has been idle
1703 + * for enough time, start a weight-raising period
1704 + */
1705 + if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
1706 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1707 + bfqq->raising_cur_max_time = idle_for_long_time ?
1708 + bfqd->bfq_raising_max_time :
1709 + bfqd->bfq_raising_rt_max_time;
1710 + bfq_log_bfqq(bfqd, bfqq,
1711 + "wrais starting at %llu msec,"
1712 + "rais_max_time %u",
1713 + bfqq->last_rais_start_finish,
1714 + jiffies_to_msecs(bfqq->
1715 + raising_cur_max_time));
1716 + } else if (old_raising_coeff > 1) {
1717 + if (idle_for_long_time)
1718 + bfqq->raising_cur_max_time =
1719 + bfqd->bfq_raising_max_time;
1720 + else if (bfqq->raising_cur_max_time ==
1721 + bfqd->bfq_raising_rt_max_time &&
1722 + !soft_rt) {
1723 + bfqq->raising_coeff = 1;
1724 + bfq_log_bfqq(bfqd, bfqq,
1725 + "wrais ending at %llu msec,"
1726 + "rais_max_time %u",
1727 + bfqq->last_rais_start_finish,
1728 + jiffies_to_msecs(bfqq->
1729 + raising_cur_max_time));
1732 + if (old_raising_coeff != bfqq->raising_coeff)
1733 + entity->ioprio_changed = 1;
1734 +add_bfqq_busy:
1735 + bfq_add_bfqq_busy(bfqd, bfqq);
1736 + } else {
1737 + if(old_raising_coeff == 1 && bfqq->last_rais_start_finish +
1738 + bfqd->bfq_raising_min_idle_time < jiffies) {
1739 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1741 + entity->ioprio_changed = 1;
1742 + bfq_log_bfqq(bfqd, bfqq,
1743 + "non-idle wrais starting at %llu msec,"
1744 + "rais_max_time %u",
1745 + bfqq->last_rais_start_finish,
1746 + jiffies_to_msecs(bfqq->
1747 + raising_cur_max_time));
1749 + bfq_updated_next_req(bfqd, bfqq);
1752 + if(bfqd->low_latency &&
1753 + (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1754 + idle_for_long_time))
1755 + bfqq->last_rais_start_finish = jiffies;
1758 +static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1760 + elv_rb_del(&bfqq->sort_list, rq);
1761 + bfqq->queued[rq_is_sync(rq)]--;
1762 + bfqq->bfqd->queued--;
1763 + bfq_add_rq_rb(rq);
1766 +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1767 + struct bio *bio)
1769 + struct task_struct *tsk = current;
1770 + struct cfq_io_context *cic;
1771 + struct bfq_queue *bfqq;
1773 + cic = bfq_cic_lookup(bfqd, tsk->io_context);
1774 + if (cic == NULL)
1775 + return NULL;
1777 + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
1778 + if (bfqq != NULL) {
1779 + sector_t sector = bio->bi_sector + bio_sectors(bio);
1781 + return elv_rb_find(&bfqq->sort_list, sector);
1784 + return NULL;
1787 +static void bfq_activate_request(struct request_queue *q, struct request *rq)
1789 + struct bfq_data *bfqd = q->elevator->elevator_data;
1791 + bfqd->rq_in_driver++;
1792 + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1793 + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1794 + (long long unsigned)bfqd->last_position);
1797 +static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1799 + struct bfq_data *bfqd = q->elevator->elevator_data;
1801 + WARN_ON(bfqd->rq_in_driver == 0);
1802 + bfqd->rq_in_driver--;
1805 +static void bfq_remove_request(struct request *rq)
1807 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
1808 + struct bfq_data *bfqd = bfqq->bfqd;
1810 + if (bfqq->next_rq == rq) {
1811 + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1812 + bfq_updated_next_req(bfqd, bfqq);
1815 + list_del_init(&rq->queuelist);
1816 + bfq_del_rq_rb(rq);
1818 + if (rq->cmd_flags & REQ_META) {
1819 + WARN_ON(bfqq->meta_pending == 0);
1820 + bfqq->meta_pending--;
1824 +static int bfq_merge(struct request_queue *q, struct request **req,
1825 + struct bio *bio)
1827 + struct bfq_data *bfqd = q->elevator->elevator_data;
1828 + struct request *__rq;
1830 + __rq = bfq_find_rq_fmerge(bfqd, bio);
1831 + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1832 + *req = __rq;
1833 + return ELEVATOR_FRONT_MERGE;
1836 + return ELEVATOR_NO_MERGE;
1839 +static void bfq_merged_request(struct request_queue *q, struct request *req,
1840 + int type)
1842 + if (type == ELEVATOR_FRONT_MERGE) {
1843 + struct bfq_queue *bfqq = RQ_BFQQ(req);
1845 + bfq_reposition_rq_rb(bfqq, req);
1849 +static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1850 + struct request *next)
1852 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
1854 + /*
1855 + * Reposition in fifo if next is older than rq.
1856 + */
1857 + if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1858 + time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1859 + list_move(&rq->queuelist, &next->queuelist);
1860 + rq_set_fifo_time(rq, rq_fifo_time(next));
1863 + if (bfqq->next_rq == next)
1864 + bfqq->next_rq = rq;
1866 + bfq_remove_request(next);
1869 +static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1870 + struct bio *bio)
1872 + struct bfq_data *bfqd = q->elevator->elevator_data;
1873 + struct cfq_io_context *cic;
1874 + struct bfq_queue *bfqq;
1876 + /* Disallow merge of a sync bio into an async request. */
1877 + if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1878 + return 0;
1880 + /*
1881 + * Lookup the bfqq that this bio will be queued with. Allow
1882 + * merge only if rq is queued there.
1883 + */
1884 + cic = bfq_cic_lookup(bfqd, current->io_context);
1885 + if (cic == NULL)
1886 + return 0;
1888 + bfqq = cic_to_bfqq(cic, bfq_bio_sync(bio));
1889 + return bfqq == RQ_BFQQ(rq);
1892 +static void __bfq_set_active_queue(struct bfq_data *bfqd,
1893 + struct bfq_queue *bfqq)
1895 + if (bfqq != NULL) {
1896 + bfq_mark_bfqq_must_alloc(bfqq);
1897 + bfq_mark_bfqq_budget_new(bfqq);
1898 + bfq_clear_bfqq_fifo_expire(bfqq);
1900 + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1902 + bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
1903 + bfqq->entity.budget);
1906 + bfqd->active_queue = bfqq;
1910 + * Get and set a new active queue for service.
1911 + */
1912 +static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
1913 + struct bfq_queue *bfqq)
1915 + if (!bfqq)
1916 + bfqq = bfq_get_next_queue(bfqd);
1917 + else
1918 + bfq_get_next_queue_forced(bfqd, bfqq);
1920 + __bfq_set_active_queue(bfqd, bfqq);
1921 + return bfqq;
1924 +static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
1925 + struct request *rq)
1927 + if (blk_rq_pos(rq) >= bfqd->last_position)
1928 + return blk_rq_pos(rq) - bfqd->last_position;
1929 + else
1930 + return bfqd->last_position - blk_rq_pos(rq);
1934 + * Return true if bfqq has no request pending and rq is close enough to
1935 + * bfqd->last_position, or if rq is closer to bfqd->last_position than
1936 + * bfqq->next_rq
1937 + */
1938 +static inline int bfq_rq_close(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1939 + struct request *rq)
1941 + sector_t sdist = bfqq->seek_mean;
1943 + if (!bfq_sample_valid(bfqq->seek_samples))
1944 + sdist = BFQQ_SEEK_THR;
1946 + /* If seek_mean is large, using it as close criteria is meaningless */
1947 + if (sdist > BFQQ_SEEK_THR)
1948 + sdist = BFQQ_SEEK_THR;
1950 + return bfq_dist_from_last(bfqd, rq) <= sdist;
1953 +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd,
1954 + struct bfq_queue *cur_bfqq)
1956 + struct rb_root *root = &bfqd->rq_pos_tree;
1957 + struct rb_node *parent, *node;
1958 + struct bfq_queue *__bfqq;
1959 + sector_t sector = bfqd->last_position;
1961 + if (RB_EMPTY_ROOT(root))
1962 + return NULL;
1964 + /*
1965 + * First, if we find a request starting at the end of the last
1966 + * request, choose it.
1967 + */
1968 + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
1969 + if (__bfqq != NULL)
1970 + return __bfqq;
1972 + /*
1973 + * If the exact sector wasn't found, the parent of the NULL leaf
1974 + * will contain the closest sector (rq_pos_tree sorted by next_request
1975 + * position).
1976 + */
1977 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1978 + if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
1979 + return __bfqq;
1981 + if (blk_rq_pos(__bfqq->next_rq) < sector)
1982 + node = rb_next(&__bfqq->pos_node);
1983 + else
1984 + node = rb_prev(&__bfqq->pos_node);
1985 + if (node == NULL)
1986 + return NULL;
1988 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
1989 + if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
1990 + return __bfqq;
1992 + return NULL;
1996 + * bfqd - obvious
1997 + * cur_bfqq - passed in so that we don't decide that the current queue
1998 + * is closely cooperating with itself.
2000 + * We are assuming that cur_bfqq has dispatched at least one request,
2001 + * and that bfqd->last_position reflects a position on the disk associated
2002 + * with the I/O issued by cur_bfqq.
2003 + */
2004 +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2005 + struct bfq_queue *cur_bfqq)
2007 + struct bfq_queue *bfqq;
2009 + if (bfq_class_idle(cur_bfqq))
2010 + return NULL;
2011 + if (!bfq_bfqq_sync(cur_bfqq))
2012 + return NULL;
2013 + if (BFQQ_SEEKY(cur_bfqq))
2014 + return NULL;
2016 + /* If device has only one backlogged bfq_queue, don't search. */
2017 + if (bfqd->busy_queues == 1)
2018 + return NULL;
2020 + /*
2021 + * We should notice if some of the queues are cooperating, e.g.
2022 + * working closely on the same area of the disk. In that case,
2023 + * we can group them together and don't waste time idling.
2024 + */
2025 + bfqq = bfqq_close(bfqd, cur_bfqq);
2026 + if (bfqq == NULL || bfqq == cur_bfqq)
2027 + return NULL;
2029 + /*
2030 + * Do not merge queues from different bfq_groups.
2031 + */
2032 + if (bfqq->entity.parent != cur_bfqq->entity.parent)
2033 + return NULL;
2035 + /*
2036 + * It only makes sense to merge sync queues.
2037 + */
2038 + if (!bfq_bfqq_sync(bfqq))
2039 + return NULL;
2040 + if (BFQQ_SEEKY(bfqq))
2041 + return NULL;
2043 + /*
2044 + * Do not merge queues of different priority classes.
2045 + */
2046 + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2047 + return NULL;
2049 + return bfqq;
2053 + * If enough samples have been computed, return the current max budget
2054 + * stored in bfqd, which is dynamically updated according to the
2055 + * estimated disk peak rate; otherwise return the default max budget
2056 + */
2057 +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2059 + return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
2060 + bfqd->bfq_max_budget;
2064 + * Return min budget, which is a fraction of the current or default
2065 + * max budget (trying with 1/32)
2066 + */
2067 +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2069 + return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
2070 + bfqd->bfq_max_budget / 32;
2073 +static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2075 + struct bfq_queue *bfqq = bfqd->active_queue;
2076 + struct cfq_io_context *cic;
2077 + unsigned long sl;
2079 + WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2081 + /* Idling is disabled, either manually or by past process history. */
2082 + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
2083 + return;
2085 + /* Tasks have exited, don't wait. */
2086 + cic = bfqd->active_cic;
2087 + if (cic == NULL || atomic_read(&cic->ioc->nr_tasks) == 0)
2088 + return;
2090 + bfq_mark_bfqq_wait_request(bfqq);
2092 + /*
2093 + * We don't want to idle for seeks, but we do want to allow
2094 + * fair distribution of slice time for a process doing back-to-back
2095 + * seeks. So allow a little bit of time for him to submit a new rq.
2097 + * To prevent processes with (partly) seeky workloads from
2098 + * being too ill-treated, grant them a small fraction of the
2099 + * assigned budget before reducing the waiting time to
2100 + * BFQ_MIN_TT. This happened to help reduce latency.
2101 + */
2102 + sl = bfqd->bfq_slice_idle;
2103 + if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2104 + bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2105 + bfqq->raising_coeff == 1)
2106 + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2107 + else if (bfqq->raising_coeff > 1)
2108 + sl = sl * 3;
2109 + bfqd->last_idling_start = ktime_get();
2110 + mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2111 + bfq_log(bfqd, "arm idle: %u/%u ms",
2112 + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2116 + * Set the maximum time for the active queue to consume its
2117 + * budget. This prevents seeky processes from lowering the disk
2118 + * throughput (always guaranteed with a time slice scheme as in CFQ).
2119 + */
2120 +static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2122 + struct bfq_queue *bfqq = bfqd->active_queue;
2123 + unsigned int timeout_coeff =
2124 + bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time ?
2125 + 1 : (bfqq->entity.weight / bfqq->entity.orig_weight);
2127 + bfqd->last_budget_start = ktime_get();
2129 + bfq_clear_bfqq_budget_new(bfqq);
2130 + bfqq->budget_timeout = jiffies +
2131 + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2133 + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2134 + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2135 + timeout_coeff));
2139 + * Move request from internal lists to the request queue dispatch list.
2140 + */
2141 +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2143 + struct bfq_data *bfqd = q->elevator->elevator_data;
2144 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
2146 + bfq_remove_request(rq);
2147 + bfqq->dispatched++;
2148 + elv_dispatch_sort(q, rq);
2150 + if (bfq_bfqq_sync(bfqq))
2151 + bfqd->sync_flight++;
2155 + * Return expired entry, or NULL to just start from scratch in rbtree.
2156 + */
2157 +static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2159 + struct request *rq = NULL;
2161 + if (bfq_bfqq_fifo_expire(bfqq))
2162 + return NULL;
2164 + bfq_mark_bfqq_fifo_expire(bfqq);
2166 + if (list_empty(&bfqq->fifo))
2167 + return NULL;
2169 + rq = rq_entry_fifo(bfqq->fifo.next);
2171 + if (time_before(jiffies, rq_fifo_time(rq)))
2172 + return NULL;
2174 + return rq;
2178 + * Must be called with the queue_lock held.
2179 + */
2180 +static int bfqq_process_refs(struct bfq_queue *bfqq)
2182 + int process_refs, io_refs;
2184 + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2185 + process_refs = atomic_read(&bfqq->ref) - io_refs;
2186 + BUG_ON(process_refs < 0);
2187 + return process_refs;
2190 +static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2192 + int process_refs, new_process_refs;
2193 + struct bfq_queue *__bfqq;
2195 + /*
2196 + * If there are no process references on the new_bfqq, then it is
2197 + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2198 + * may have dropped their last reference (not just their last process
2199 + * reference).
2200 + */
2201 + if (!bfqq_process_refs(new_bfqq))
2202 + return;
2204 + /* Avoid a circular list and skip interim queue merges. */
2205 + while ((__bfqq = new_bfqq->new_bfqq)) {
2206 + if (__bfqq == bfqq)
2207 + return;
2208 + new_bfqq = __bfqq;
2211 + process_refs = bfqq_process_refs(bfqq);
2212 + new_process_refs = bfqq_process_refs(new_bfqq);
2213 + /*
2214 + * If the process for the bfqq has gone away, there is no
2215 + * sense in merging the queues.
2216 + */
2217 + if (process_refs == 0 || new_process_refs == 0)
2218 + return;
2220 + /*
2221 + * Merge in the direction of the lesser amount of work.
2222 + */
2223 + if (new_process_refs >= process_refs) {
2224 + bfqq->new_bfqq = new_bfqq;
2225 + atomic_add(process_refs, &new_bfqq->ref);
2226 + } else {
2227 + new_bfqq->new_bfqq = bfqq;
2228 + atomic_add(new_process_refs, &bfqq->ref);
2230 + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2231 + new_bfqq->pid);
2234 +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2236 + struct bfq_entity *entity = &bfqq->entity;
2237 + return entity->budget - entity->service;
2240 +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2242 + BUG_ON(bfqq != bfqd->active_queue);
2244 + __bfq_bfqd_reset_active(bfqd);
2246 + if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2247 + bfq_del_bfqq_busy(bfqd, bfqq, 1);
2248 + /*
2249 + * overloading budget_timeout field to store when
2250 + * the queue remains with no backlog, used by
2251 + * the weight-raising mechanism
2252 + */
2253 + bfqq->budget_timeout = jiffies ;
2255 + else {
2256 + bfq_activate_bfqq(bfqd, bfqq);
2257 + /*
2258 + * Resort priority tree of potential close cooperators.
2259 + */
2260 + bfq_rq_pos_tree_add(bfqd, bfqq);
2263 + /*
2264 + * If this bfqq is shared between multiple processes, check
2265 + * to make sure that those processes are still issuing I/Os
2266 + * within the mean seek distance. If not, it may be time to
2267 + * break the queues apart again.
2268 + */
2269 + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2270 + bfq_mark_bfqq_split_coop(bfqq);
2273 +/**
2274 + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2275 + * @bfqd: device data.
2276 + * @bfqq: queue to update.
2277 + * @reason: reason for expiration.
2279 + * Handle the feedback on @bfqq budget. See the body for detailed
2280 + * comments.
2281 + */
2282 +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2283 + struct bfq_queue *bfqq,
2284 + enum bfqq_expiration reason)
2286 + struct request *next_rq;
2287 + unsigned long budget, min_budget;
2289 + budget = bfqq->max_budget;
2290 + min_budget = bfq_min_budget(bfqd);
2292 + BUG_ON(bfqq != bfqd->active_queue);
2294 + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2295 + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2296 + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2297 + budget, bfq_min_budget(bfqd));
2298 + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2299 + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
2301 + if (bfq_bfqq_sync(bfqq)) {
2302 + switch (reason) {
2303 + /*
2304 + * Caveat: in all the following cases we trade latency
2305 + * for throughput.
2306 + */
2307 + case BFQ_BFQQ_TOO_IDLE:
2308 + /*
2309 + * This is the only case where we may reduce
2310 + * the budget: if there is no requets of the
2311 + * process still waiting for completion, then
2312 + * we assume (tentatively) that the timer has
2313 + * expired because the batch of requests of
2314 + * the process could have been served with a
2315 + * smaller budget. Hence, betting that
2316 + * process will behave in the same way when it
2317 + * becomes backlogged again, we reduce its
2318 + * next budget. As long as we guess right,
2319 + * this budget cut reduces the latency
2320 + * experienced by the process.
2322 + * However, if there are still outstanding
2323 + * requests, then the process may have not yet
2324 + * issued its next request just because it is
2325 + * still waiting for the completion of some of
2326 + * the still oustanding ones. So in this
2327 + * subcase we do not reduce its budget, on the
2328 + * contrary we increase it to possibly boost
2329 + * the throughput, as discussed in the
2330 + * comments to the BUDGET_TIMEOUT case.
2331 + */
2332 + if (bfqq->dispatched > 0) /* still oustanding reqs */
2333 + budget = min(budget * 2, bfqd->bfq_max_budget);
2334 + else {
2335 + if (budget > 5 * min_budget)
2336 + budget -= 4 * min_budget;
2337 + else
2338 + budget = min_budget;
2340 + break;
2341 + case BFQ_BFQQ_BUDGET_TIMEOUT:
2342 + /*
2343 + * We double the budget here because: 1) it
2344 + * gives the chance to boost the throughput if
2345 + * this is not a seeky process (which may have
2346 + * bumped into this timeout because of, e.g.,
2347 + * ZBR), 2) together with charge_full_budget
2348 + * it helps give seeky processes higher
2349 + * timestamps, and hence be served less
2350 + * frequently.
2351 + */
2352 + budget = min(budget * 2, bfqd->bfq_max_budget);
2353 + break;
2354 + case BFQ_BFQQ_BUDGET_EXHAUSTED:
2355 + /*
2356 + * The process still has backlog, and did not
2357 + * let either the budget timeout or the disk
2358 + * idling timeout expire. Hence it is not
2359 + * seeky, has a short thinktime and may be
2360 + * happy with a higher budget too. So
2361 + * definitely increase the budget of this good
2362 + * candidate to boost the disk throughput.
2363 + */
2364 + budget = min(budget * 4, bfqd->bfq_max_budget);
2365 + break;
2366 + case BFQ_BFQQ_NO_MORE_REQUESTS:
2367 + /*
2368 + * Leave the budget unchanged.
2369 + */
2370 + default:
2371 + return;
2373 + } else /* async queue */
2374 + /* async queues get always the maximum possible budget
2375 + * (their ability to dispatch is limited by
2376 + * @bfqd->bfq_max_budget_async_rq).
2377 + */
2378 + budget = bfqd->bfq_max_budget;
2380 + bfqq->max_budget = budget;
2382 + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2383 + bfqq->max_budget > bfqd->bfq_max_budget)
2384 + bfqq->max_budget = bfqd->bfq_max_budget;
2386 + /*
2387 + * Make sure that we have enough budget for the next request.
2388 + * Since the finish time of the bfqq must be kept in sync with
2389 + * the budget, be sure to call __bfq_bfqq_expire() after the
2390 + * update.
2391 + */
2392 + next_rq = bfqq->next_rq;
2393 + if (next_rq != NULL)
2394 + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2395 + bfq_serv_to_charge(next_rq, bfqq));
2396 + else
2397 + bfqq->entity.budget = bfqq->max_budget;
2399 + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2400 + next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2401 + bfqq->entity.budget);
2404 +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2406 + unsigned long max_budget;
2408 + /*
2409 + * The max_budget calculated when autotuning is equal to the
2410 + * amount of sectors transfered in timeout_sync at the
2411 + * estimated peak rate.
2412 + */
2413 + max_budget = (unsigned long)(peak_rate * 1000 *
2414 + timeout >> BFQ_RATE_SHIFT);
2416 + return max_budget;
2420 + * In addition to updating the peak rate, checks whether the process
2421 + * is "slow", and returns 1 if so. This slow flag is used, in addition
2422 + * to the budget timeout, to reduce the amount of service provided to
2423 + * seeky processes, and hence reduce their chances to lower the
2424 + * throughput. See the code for more details.
2425 + */
2426 +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2427 + int compensate, enum bfqq_expiration reason)
2429 + u64 bw, usecs, expected, timeout;
2430 + ktime_t delta;
2431 + int update = 0;
2433 + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2434 + return 0;
2436 + delta = compensate ? bfqd->last_idling_start : ktime_get();
2437 + delta = ktime_sub(delta, bfqd->last_budget_start);
2438 + usecs = ktime_to_us(delta);
2440 + /* Don't trust short/unrealistic values. */
2441 + if (usecs < 100 || usecs >= LONG_MAX)
2442 + return 0;
2444 + /*
2445 + * Calculate the bandwidth for the last slice. We use a 64 bit
2446 + * value to store the peak rate, in sectors per usec in fixed
2447 + * point math. We do so to have enough precision in the estimate
2448 + * and to avoid overflows.
2449 + */
2450 + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2451 + do_div(bw, (unsigned long)usecs);
2453 + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2455 + /*
2456 + * Use only long (> 20ms) intervals to filter out spikes for
2457 + * the peak rate estimation.
2458 + */
2459 + if (usecs > 20000) {
2460 + if (bw > bfqd->peak_rate ||
2461 + (!BFQQ_SEEKY(bfqq) &&
2462 + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2463 + bfq_log(bfqd, "measured bw =%llu", bw);
2464 + /*
2465 + * To smooth oscillations use a low-pass filter with
2466 + * alpha=7/8, i.e.,
2467 + * new_rate = (7/8) * old_rate + (1/8) * bw
2468 + */
2469 + do_div(bw, 8);
2470 + bfqd->peak_rate *= 7;
2471 + do_div(bfqd->peak_rate, 8);
2472 + bfqd->peak_rate += bw;
2473 + update = 1;
2474 + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2477 + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2479 + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2480 + bfqd->peak_rate_samples++;
2482 + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2483 + update && bfqd->bfq_user_max_budget == 0) {
2484 + bfqd->bfq_max_budget =
2485 + bfq_calc_max_budget(bfqd->peak_rate, timeout);
2486 + bfq_log(bfqd, "new max_budget=%lu",
2487 + bfqd->bfq_max_budget);
2491 + /*
2492 + * If the process has been served for a too short time
2493 + * interval to let its possible sequential accesses prevail on
2494 + * the initial seek time needed to move the disk head on the
2495 + * first sector it requested, then give the process a chance
2496 + * and for the moment return false.
2497 + */
2498 + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2499 + return 0;
2501 + /*
2502 + * A process is considered ``slow'' (i.e., seeky, so that we
2503 + * cannot treat it fairly in the service domain, as it would
2504 + * slow down too much the other processes) if, when a slice
2505 + * ends for whatever reason, it has received service at a
2506 + * rate that would not be high enough to complete the budget
2507 + * before the budget timeout expiration.
2508 + */
2509 + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2511 + /*
2512 + * Caveat: processes doing IO in the slower disk zones will
2513 + * tend to be slow(er) even if not seeky. And the estimated
2514 + * peak rate will actually be an average over the disk
2515 + * surface. Hence, to not be too harsh with unlucky processes,
2516 + * we keep a budget/3 margin of safety before declaring a
2517 + * process slow.
2518 + */
2519 + return expected > (4 * bfqq->entity.budget) / 3;
2522 +/**
2523 + * bfq_bfqq_expire - expire a queue.
2524 + * @bfqd: device owning the queue.
2525 + * @bfqq: the queue to expire.
2526 + * @compensate: if true, compensate for the time spent idling.
2527 + * @reason: the reason causing the expiration.
2530 + * If the process associated to the queue is slow (i.e., seeky), or in
2531 + * case of budget timeout, or, finally, if it is async, we
2532 + * artificially charge it an entire budget (independently of the
2533 + * actual service it received). As a consequence, the queue will get
2534 + * higher timestamps than the correct ones upon reactivation, and
2535 + * hence it will be rescheduled as if it had received more service
2536 + * than what it actually received. In the end, this class of processes
2537 + * will receive less service in proportion to how slowly they consume
2538 + * their budgets (and hence how seriously they tend to lower the
2539 + * throughput).
2541 + * In contrast, when a queue expires because it has been idling for
2542 + * too much or because it exhausted its budget, we do not touch the
2543 + * amount of service it has received. Hence when the queue will be
2544 + * reactivated and its timestamps updated, the latter will be in sync
2545 + * with the actual service received by the queue until expiration.
2547 + * Charging a full budget to the first type of queues and the exact
2548 + * service to the others has the effect of using the WF2Q+ policy to
2549 + * schedule the former on a timeslice basis, without violating the
2550 + * service domain guarantees of the latter.
2551 + */
2552 +static void bfq_bfqq_expire(struct bfq_data *bfqd,
2553 + struct bfq_queue *bfqq,
2554 + int compensate,
2555 + enum bfqq_expiration reason)
2557 + int slow;
2558 + BUG_ON(bfqq != bfqd->active_queue);
2560 + /* Update disk peak rate for autotuning and check whether the
2561 + * process is slow (see bfq_update_peak_rate).
2562 + */
2563 + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2565 + /*
2566 + * As above explained, 'punish' slow (i.e., seeky), timed-out
2567 + * and async queues, to favor sequential sync workloads.
2569 + * Processes doing IO in the slower disk zones will tend to be
2570 + * slow(er) even if not seeky. Hence, since the estimated peak
2571 + * rate is actually an average over the disk surface, these
2572 + * processes may timeout just for bad luck. To avoid punishing
2573 + * them we do not charge a full budget to a process that
2574 + * succeeded in consuming at least 2/3 of its budget.
2575 + */
2576 + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2577 + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2578 + bfq_bfqq_charge_full_budget(bfqq);
2580 + if (bfqd->low_latency && bfqq->raising_coeff == 1)
2581 + bfqq->last_rais_start_finish = jiffies;
2583 + if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2584 + if(reason != BFQ_BFQQ_BUDGET_TIMEOUT)
2585 + bfqq->soft_rt_next_start =
2586 + jiffies +
2587 + HZ * bfqq->entity.service /
2588 + bfqd->bfq_raising_max_softrt_rate;
2589 + else
2590 + bfqq->soft_rt_next_start = -1; /* infinity */
2592 + bfq_log_bfqq(bfqd, bfqq,
2593 + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2594 + bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2596 + /* Increase, decrease or leave budget unchanged according to reason */
2597 + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2598 + __bfq_bfqq_expire(bfqd, bfqq);
2602 + * Budget timeout is not implemented through a dedicated timer, but
2603 + * just checked on request arrivals and completions, as well as on
2604 + * idle timer expirations.
2605 + */
2606 +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2608 + if (bfq_bfqq_budget_new(bfqq))
2609 + return 0;
2611 + if (time_before(jiffies, bfqq->budget_timeout))
2612 + return 0;
2614 + return 1;
2618 + * If we expire a queue that is waiting for the arrival of a new
2619 + * request, we may prevent the fictitious timestamp backshifting that
2620 + * allows the guarantees of the queue to be preserved (see [1] for
2621 + * this tricky aspect). Hence we return true only if this condition
2622 + * does not hold, or if the queue is slow enough to deserve only to be
2623 + * kicked off for preserving a high throughput.
2625 +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2627 + bfq_log_bfqq(bfqq->bfqd, bfqq,
2628 + "may_budget_timeout: wr %d left %d timeout %d",
2629 + bfq_bfqq_wait_request(bfqq),
2630 + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2631 + bfq_bfqq_budget_timeout(bfqq));
2633 + return (!bfq_bfqq_wait_request(bfqq) ||
2634 + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2635 + &&
2636 + bfq_bfqq_budget_timeout(bfqq);
2640 + * Select a queue for service. If we have a current active queue,
2641 + * check whether to continue servicing it, or retrieve and set a new one.
2642 + */
2643 +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2645 + struct bfq_queue *bfqq, *new_bfqq = NULL;
2646 + struct request *next_rq;
2647 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2649 + bfqq = bfqd->active_queue;
2650 + if (bfqq == NULL)
2651 + goto new_queue;
2653 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
2655 + /*
2656 + * If another queue has a request waiting within our mean seek
2657 + * distance, let it run. The expire code will check for close
2658 + * cooperators and put the close queue at the front of the
2659 + * service tree. If possible, merge the expiring queue with the
2660 + * new bfqq.
2661 + */
2662 + new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2663 + if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2664 + bfq_setup_merge(bfqq, new_bfqq);
2666 + if (bfq_may_expire_for_budg_timeout(bfqq))
2667 + goto expire;
2669 + next_rq = bfqq->next_rq;
2670 + /*
2671 + * If bfqq has requests queued and it has enough budget left to
2672 + * serve them, keep the queue, otherwise expire it.
2673 + */
2674 + if (next_rq != NULL) {
2675 + if (bfq_serv_to_charge(next_rq, bfqq) >
2676 + bfq_bfqq_budget_left(bfqq)) {
2677 + reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2678 + goto expire;
2679 + } else {
2680 + /*
2681 + * The idle timer may be pending because we may not
2682 + * disable disk idling even when a new request arrives
2683 + */
2684 + if (timer_pending(&bfqd->idle_slice_timer)) {
2685 + /*
2686 + * If we get here: 1) at least a new request
2687 + * has arrived but we have not disabled the
2688 + * timer because the request was too small,
2689 + * 2) then the block layer has unplugged the
2690 + * device, causing the dispatch to be invoked.
2692 + * Since the device is unplugged, now the
2693 + * requests are probably large enough to
2694 + * provide a reasonable throughput.
2695 + * So we disable idling.
2696 + */
2697 + bfq_clear_bfqq_wait_request(bfqq);
2698 + del_timer(&bfqd->idle_slice_timer);
2700 + if (new_bfqq == NULL)
2701 + goto keep_queue;
2702 + else
2703 + goto expire;
2707 + /*
2708 + * No requests pending. If there is no cooperator, and the active
2709 + * queue still has requests in flight or is idling for a new request,
2710 + * then keep it.
2711 + */
2712 + if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2713 + (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq)))) {
2714 + bfqq = NULL;
2715 + goto keep_queue;
2716 + } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2717 + /*
2718 + * Expiring the queue because there is a close cooperator,
2719 + * cancel timer.
2720 + */
2721 + bfq_clear_bfqq_wait_request(bfqq);
2722 + del_timer(&bfqd->idle_slice_timer);
2725 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2726 +expire:
2727 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2728 +new_queue:
2729 + bfqq = bfq_set_active_queue(bfqd, new_bfqq);
2730 + bfq_log(bfqd, "select_queue: new queue %d returned",
2731 + bfqq != NULL ? bfqq->pid : 0);
2732 +keep_queue:
2733 + return bfqq;
2736 +static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2738 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2739 + struct bfq_entity *entity = &bfqq->entity;
2741 + bfq_log_bfqq(bfqd, bfqq,
2742 + "raising period dur %u/%u msec, "
2743 + "old raising coeff %u, w %d(%d)",
2744 + jiffies_to_msecs(jiffies -
2745 + bfqq->last_rais_start_finish),
2746 + jiffies_to_msecs(bfqq->raising_cur_max_time),
2747 + bfqq->raising_coeff,
2748 + bfqq->entity.weight, bfqq->entity.orig_weight);
2750 + BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
2751 + entity->orig_weight * bfqq->raising_coeff);
2752 + if(entity->ioprio_changed)
2753 + bfq_log_bfqq(bfqd, bfqq,
2754 + "WARN: pending prio change");
2755 + /*
2756 + * If too much time has elapsed from the beginning
2757 + * of this weight-raising period and process is not soft
2758 + * real-time, stop it
2759 + */
2760 + if (jiffies - bfqq->last_rais_start_finish >
2761 + bfqq->raising_cur_max_time) {
2762 + int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
2763 + bfqq->soft_rt_next_start < jiffies;
2765 + bfqq->last_rais_start_finish = jiffies;
2766 + if (soft_rt)
2767 + bfqq->raising_cur_max_time =
2768 + bfqd->bfq_raising_rt_max_time;
2769 + else {
2770 + bfqq->raising_coeff = 1;
2771 + entity->ioprio_changed = 1;
2772 + __bfq_entity_update_weight_prio(
2773 + bfq_entity_service_tree(entity),
2774 + entity);
2782 + * Dispatch one request from bfqq, moving it to the request queue
2783 + * dispatch list.
2784 + */
2785 +static int bfq_dispatch_request(struct bfq_data *bfqd,
2786 + struct bfq_queue *bfqq)
2788 + int dispatched = 0;
2789 + struct request *rq;
2790 + unsigned long service_to_charge;
2792 + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2794 + /* Follow expired path, else get first next available. */
2795 + rq = bfq_check_fifo(bfqq);
2796 + if (rq == NULL)
2797 + rq = bfqq->next_rq;
2798 + service_to_charge = bfq_serv_to_charge(rq, bfqq);
2800 + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2801 + /*
2802 + * This may happen if the next rq is chosen
2803 + * in fifo order instead of sector order.
2804 + * The budget is properly dimensioned
2805 + * to be always sufficient to serve the next request
2806 + * only if it is chosen in sector order. The reason is
2807 + * that it would be quite inefficient and little useful
2808 + * to always make sure that the budget is large enough
2809 + * to serve even the possible next rq in fifo order.
2810 + * In fact, requests are seldom served in fifo order.
2812 + * Expire the queue for budget exhaustion, and
2813 + * make sure that the next act_budget is enough
2814 + * to serve the next request, even if it comes
2815 + * from the fifo expired path.
2816 + */
2817 + bfqq->next_rq = rq;
2818 + /*
2819 + * Since this dispatch is failed, make sure that
2820 + * a new one will be performed
2821 + */
2822 + if (!bfqd->rq_in_driver)
2823 + bfq_schedule_dispatch(bfqd);
2824 + goto expire;
2827 + /* Finally, insert request into driver dispatch list. */
2828 + bfq_bfqq_served(bfqq, service_to_charge);
2829 + bfq_dispatch_insert(bfqd->queue, rq);
2831 + update_raising_data(bfqd, bfqq);
2833 + bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
2834 + "budg left %lu",
2835 + blk_rq_sectors(rq),
2836 + (long long unsigned)blk_rq_pos(rq),
2837 + bfq_bfqq_budget_left(bfqq));
2839 + dispatched++;
2841 + if (bfqd->active_cic == NULL) {
2842 + atomic_long_inc(&RQ_CIC(rq)->ioc->refcount);
2843 + bfqd->active_cic = RQ_CIC(rq);
2846 + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2847 + dispatched >= bfqd->bfq_max_budget_async_rq) ||
2848 + bfq_class_idle(bfqq)))
2849 + goto expire;
2851 + return dispatched;
2853 +expire:
2854 + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2855 + return dispatched;
2858 +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2860 + int dispatched = 0;
2862 + while (bfqq->next_rq != NULL) {
2863 + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2864 + dispatched++;
2867 + BUG_ON(!list_empty(&bfqq->fifo));
2868 + return dispatched;
2872 + * Drain our current requests. Used for barriers and when switching
2873 + * io schedulers on-the-fly.
2874 + */
2875 +static int bfq_forced_dispatch(struct bfq_data *bfqd)
2877 + struct bfq_queue *bfqq, *n;
2878 + struct bfq_service_tree *st;
2879 + int dispatched = 0;
2881 + bfqq = bfqd->active_queue;
2882 + if (bfqq != NULL)
2883 + __bfq_bfqq_expire(bfqd, bfqq);
2885 + /*
2886 + * Loop through classes, and be careful to leave the scheduler
2887 + * in a consistent state, as feedback mechanisms and vtime
2888 + * updates cannot be disabled during the process.
2889 + */
2890 + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
2891 + st = bfq_entity_service_tree(&bfqq->entity);
2893 + dispatched += __bfq_forced_dispatch_bfqq(bfqq);
2894 + bfqq->max_budget = bfq_max_budget(bfqd);
2896 + bfq_forget_idle(st);
2899 + BUG_ON(bfqd->busy_queues != 0);
2901 + return dispatched;
2904 +static int bfq_dispatch_requests(struct request_queue *q, int force)
2906 + struct bfq_data *bfqd = q->elevator->elevator_data;
2907 + struct bfq_queue *bfqq;
2908 + int max_dispatch;
2910 + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
2911 + if (bfqd->busy_queues == 0)
2912 + return 0;
2914 + if (unlikely(force))
2915 + return bfq_forced_dispatch(bfqd);
2917 + if((bfqq = bfq_select_queue(bfqd)) == NULL)
2918 + return 0;
2920 + max_dispatch = bfqd->bfq_quantum;
2921 + if (bfq_class_idle(bfqq))
2922 + max_dispatch = 1;
2924 + if (!bfq_bfqq_sync(bfqq))
2925 + max_dispatch = bfqd->bfq_max_budget_async_rq;
2927 + if (bfqq->dispatched >= max_dispatch) {
2928 + if (bfqd->busy_queues > 1)
2929 + return 0;
2930 + if (bfqq->dispatched >= 4 * max_dispatch)
2931 + return 0;
2934 + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
2935 + return 0;
2937 + bfq_clear_bfqq_wait_request(bfqq);
2938 + BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2940 + if (! bfq_dispatch_request(bfqd, bfqq))
2941 + return 0;
2943 + bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
2944 + "(max_disp %d)", bfqq->pid, max_dispatch);
2946 + return 1;
2950 + * Task holds one reference to the queue, dropped when task exits. Each rq
2951 + * in-flight on this queue also holds a reference, dropped when rq is freed.
2953 + * Queue lock must be held here.
2954 + */
2955 +static void bfq_put_queue(struct bfq_queue *bfqq)
2957 + struct bfq_data *bfqd = bfqq->bfqd;
2959 + BUG_ON(atomic_read(&bfqq->ref) <= 0);
2961 + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
2962 + atomic_read(&bfqq->ref));
2963 + if (!atomic_dec_and_test(&bfqq->ref))
2964 + return;
2966 + BUG_ON(rb_first(&bfqq->sort_list) != NULL);
2967 + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
2968 + BUG_ON(bfqq->entity.tree != NULL);
2969 + BUG_ON(bfq_bfqq_busy(bfqq));
2970 + BUG_ON(bfqd->active_queue == bfqq);
2972 + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
2974 + kmem_cache_free(bfq_pool, bfqq);
2977 +static void bfq_put_cooperator(struct bfq_queue *bfqq)
2979 + struct bfq_queue *__bfqq, *next;
2981 + /*
2982 + * If this queue was scheduled to merge with another queue, be
2983 + * sure to drop the reference taken on that queue (and others in
2984 + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
2985 + */
2986 + __bfqq = bfqq->new_bfqq;
2987 + while (__bfqq) {
2988 + if (__bfqq == bfqq) {
2989 + WARN(1, "bfqq->new_bfqq loop detected.\n");
2990 + break;
2992 + next = __bfqq->new_bfqq;
2993 + bfq_put_queue(__bfqq);
2994 + __bfqq = next;
2998 +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3000 + if (bfqq == bfqd->active_queue) {
3001 + __bfq_bfqq_expire(bfqd, bfqq);
3002 + bfq_schedule_dispatch(bfqd);
3005 + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3006 + atomic_read(&bfqq->ref));
3008 + bfq_put_cooperator(bfqq);
3010 + bfq_put_queue(bfqq);
3014 + * Update the entity prio values; note that the new values will not
3015 + * be used until the next (re)activation.
3016 + */
3017 +static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc)
3019 + struct task_struct *tsk = current;
3020 + int ioprio_class;
3022 + if (!bfq_bfqq_prio_changed(bfqq))
3023 + return;
3025 + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
3026 + switch (ioprio_class) {
3027 + default:
3028 + printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
3029 + case IOPRIO_CLASS_NONE:
3030 + /*
3031 + * No prio set, inherit CPU scheduling settings.
3032 + */
3033 + bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3034 + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3035 + break;
3036 + case IOPRIO_CLASS_RT:
3037 + bfqq->entity.new_ioprio = task_ioprio(ioc);
3038 + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3039 + break;
3040 + case IOPRIO_CLASS_BE:
3041 + bfqq->entity.new_ioprio = task_ioprio(ioc);
3042 + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3043 + break;
3044 + case IOPRIO_CLASS_IDLE:
3045 + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3046 + bfqq->entity.new_ioprio = 7;
3047 + bfq_clear_bfqq_idle_window(bfqq);
3048 + break;
3051 + bfqq->entity.ioprio_changed = 1;
3053 + /*
3054 + * Keep track of original prio settings in case we have to temporarily
3055 + * elevate the priority of this queue.
3056 + */
3057 + bfqq->org_ioprio = bfqq->entity.new_ioprio;
3058 + bfq_clear_bfqq_prio_changed(bfqq);
3061 +static void bfq_changed_ioprio(struct io_context *ioc,
3062 + struct cfq_io_context *cic)
3064 + struct bfq_data *bfqd;
3065 + struct bfq_queue *bfqq, *new_bfqq;
3066 + struct bfq_group *bfqg;
3067 + unsigned long uninitialized_var(flags);
3069 + bfqd = bfq_get_bfqd_locked(&cic->key, &flags);
3070 + if (unlikely(bfqd == NULL))
3071 + return;
3073 + bfqq = cic->cfqq[BLK_RW_ASYNC];
3074 + if (bfqq != NULL) {
3075 + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3076 + sched_data);
3077 + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, cic->ioc,
3078 + GFP_ATOMIC);
3079 + if (new_bfqq != NULL) {
3080 + cic->cfqq[BLK_RW_ASYNC] = new_bfqq;
3081 + bfq_log_bfqq(bfqd, bfqq,
3082 + "changed_ioprio: bfqq %p %d",
3083 + bfqq, atomic_read(&bfqq->ref));
3084 + bfq_put_queue(bfqq);
3088 + bfqq = cic->cfqq[BLK_RW_SYNC];
3089 + if (bfqq != NULL)
3090 + bfq_mark_bfqq_prio_changed(bfqq);
3092 + bfq_put_bfqd_unlock(bfqd, &flags);
3095 +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3096 + pid_t pid, int is_sync)
3098 + RB_CLEAR_NODE(&bfqq->entity.rb_node);
3099 + INIT_LIST_HEAD(&bfqq->fifo);
3101 + atomic_set(&bfqq->ref, 0);
3102 + bfqq->bfqd = bfqd;
3104 + bfq_mark_bfqq_prio_changed(bfqq);
3106 + if (is_sync) {
3107 + if (!bfq_class_idle(bfqq))
3108 + bfq_mark_bfqq_idle_window(bfqq);
3109 + bfq_mark_bfqq_sync(bfqq);
3112 + /* Tentative initial value to trade off between thr and lat */
3113 + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3114 + bfqq->pid = pid;
3116 + bfqq->raising_coeff = 1;
3117 + bfqq->last_rais_start_finish = 0;
3118 + bfqq->soft_rt_next_start = -1;
3121 +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3122 + struct bfq_group *bfqg,
3123 + int is_sync,
3124 + struct io_context *ioc,
3125 + gfp_t gfp_mask)
3127 + struct bfq_queue *bfqq, *new_bfqq = NULL;
3128 + struct cfq_io_context *cic;
3130 +retry:
3131 + cic = bfq_cic_lookup(bfqd, ioc);
3132 + /* cic always exists here */
3133 + bfqq = cic_to_bfqq(cic, is_sync);
3135 + /*
3136 + * Always try a new alloc if we fall back to the OOM bfqq
3137 + * originally, since it should just be a temporary situation.
3138 + */
3139 + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3140 + bfqq = NULL;
3141 + if (new_bfqq != NULL) {
3142 + bfqq = new_bfqq;
3143 + new_bfqq = NULL;
3144 + } else if (gfp_mask & __GFP_WAIT) {
3145 + spin_unlock_irq(bfqd->queue->queue_lock);
3146 + new_bfqq = kmem_cache_alloc_node(bfq_pool,
3147 + gfp_mask | __GFP_ZERO,
3148 + bfqd->queue->node);
3149 + spin_lock_irq(bfqd->queue->queue_lock);
3150 + if (new_bfqq != NULL)
3151 + goto retry;
3152 + } else {
3153 + bfqq = kmem_cache_alloc_node(bfq_pool,
3154 + gfp_mask | __GFP_ZERO,
3155 + bfqd->queue->node);
3158 + if (bfqq != NULL) {
3159 + bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3160 + bfq_log_bfqq(bfqd, bfqq, "allocated");
3161 + } else {
3162 + bfqq = &bfqd->oom_bfqq;
3163 + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3166 + bfq_init_prio_data(bfqq, ioc);
3167 + bfq_init_entity(&bfqq->entity, bfqg);
3170 + if (new_bfqq != NULL)
3171 + kmem_cache_free(bfq_pool, new_bfqq);
3173 + return bfqq;
3176 +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3177 + struct bfq_group *bfqg,
3178 + int ioprio_class, int ioprio)
3180 + switch (ioprio_class) {
3181 + case IOPRIO_CLASS_RT:
3182 + return &bfqg->async_bfqq[0][ioprio];
3183 + case IOPRIO_CLASS_BE:
3184 + return &bfqg->async_bfqq[1][ioprio];
3185 + case IOPRIO_CLASS_IDLE:
3186 + return &bfqg->async_idle_bfqq;
3187 + default:
3188 + BUG();
3192 +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3193 + struct bfq_group *bfqg, int is_sync,
3194 + struct io_context *ioc, gfp_t gfp_mask)
3196 + const int ioprio = task_ioprio(ioc);
3197 + const int ioprio_class = task_ioprio_class(ioc);
3198 + struct bfq_queue **async_bfqq = NULL;
3199 + struct bfq_queue *bfqq = NULL;
3201 + if (!is_sync) {
3202 + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3203 + ioprio);
3204 + bfqq = *async_bfqq;
3207 + if (bfqq == NULL)
3208 + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
3210 + /*
3211 + * Pin the queue now that it's allocated, scheduler exit will prune it.
3212 + */
3213 + if (!is_sync && *async_bfqq == NULL) {
3214 + atomic_inc(&bfqq->ref);
3215 + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3216 + bfqq, atomic_read(&bfqq->ref));
3217 + *async_bfqq = bfqq;
3220 + atomic_inc(&bfqq->ref);
3221 + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3222 + atomic_read(&bfqq->ref));
3223 + return bfqq;
3226 +static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3227 + struct cfq_io_context *cic)
3229 + unsigned long elapsed = jiffies - cic->ttime.last_end_request;
3230 + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3232 + cic->ttime.ttime_samples = (7*cic->ttime.ttime_samples + 256) / 8;
3233 + cic->ttime.ttime_total = (7*cic->ttime.ttime_total + 256*ttime) / 8;
3234 + cic->ttime.ttime_mean = (cic->ttime.ttime_total + 128) / cic->ttime.ttime_samples;
3237 +static void bfq_update_io_seektime(struct bfq_data *bfqd,
3238 + struct bfq_queue *bfqq,
3239 + struct request *rq)
3241 + sector_t sdist;
3242 + u64 total;
3244 + if (bfqq->last_request_pos < blk_rq_pos(rq))
3245 + sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3246 + else
3247 + sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3249 + /*
3250 + * Don't allow the seek distance to get too large from the
3251 + * odd fragment, pagein, etc.
3252 + */
3253 + if (bfqq->seek_samples == 0) /* first request, not really a seek */
3254 + sdist = 0;
3255 + else if (bfqq->seek_samples <= 60) /* second & third seek */
3256 + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3257 + else
3258 + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3260 + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3261 + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3262 + total = bfqq->seek_total + (bfqq->seek_samples/2);
3263 + do_div(total, bfqq->seek_samples);
3264 + if (bfq_bfqq_coop(bfqq)) {
3265 + /*
3266 + * If the mean seektime increases for a (non-seeky) shared
3267 + * queue, some cooperator is likely to be idling too much.
3268 + * On the contrary, if it decreases, some cooperator has
3269 + * probably waked up.
3271 + */
3272 + if ((sector_t)total < bfqq->seek_mean)
3273 + bfq_mark_bfqq_some_coop_idle(bfqq) ;
3274 + else if ((sector_t)total > bfqq->seek_mean)
3275 + bfq_clear_bfqq_some_coop_idle(bfqq) ;
3277 + bfqq->seek_mean = (sector_t)total;
3279 + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3280 + (u64)bfqq->seek_mean);
3284 + * Disable idle window if the process thinks too long or seeks so much that
3285 + * it doesn't matter.
3286 + */
3287 +static void bfq_update_idle_window(struct bfq_data *bfqd,
3288 + struct bfq_queue *bfqq,
3289 + struct cfq_io_context *cic)
3291 + int enable_idle;
3293 + /* Don't idle for async or idle io prio class. */
3294 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3295 + return;
3297 + enable_idle = bfq_bfqq_idle_window(bfqq);
3299 + if (atomic_read(&cic->ioc->nr_tasks) == 0 ||
3300 + bfqd->bfq_slice_idle == 0 ||
3301 + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3302 + bfqq->raising_coeff == 1))
3303 + enable_idle = 0;
3304 + else if (bfq_sample_valid(cic->ttime.ttime_samples)) {
3305 + if (cic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3306 + bfqq->raising_coeff == 1)
3307 + enable_idle = 0;
3308 + else
3309 + enable_idle = 1;
3311 + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3312 + enable_idle);
3314 + if (enable_idle)
3315 + bfq_mark_bfqq_idle_window(bfqq);
3316 + else
3317 + bfq_clear_bfqq_idle_window(bfqq);
3321 + * Called when a new fs request (rq) is added to bfqq. Check if there's
3322 + * something we should do about it.
3323 + */
3324 +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3325 + struct request *rq)
3327 + struct cfq_io_context *cic = RQ_CIC(rq);
3329 + if (rq->cmd_flags & REQ_META)
3330 + bfqq->meta_pending++;
3332 + bfq_update_io_thinktime(bfqd, cic);
3333 + bfq_update_io_seektime(bfqd, bfqq, rq);
3334 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3335 + !BFQQ_SEEKY(bfqq))
3336 + bfq_update_idle_window(bfqd, bfqq, cic);
3338 + bfq_log_bfqq(bfqd, bfqq,
3339 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3340 + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3341 + (long long unsigned)bfqq->seek_mean);
3343 + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3345 + if (bfqq == bfqd->active_queue) {
3346 + /*
3347 + * If there is just this request queued and the request
3348 + * is small, just exit.
3349 + * In this way, if the disk is being idled to wait for a new
3350 + * request from the active queue, we avoid unplugging the
3351 + * device now.
3353 + * By doing so, we spare the disk to be committed
3354 + * to serve just a small request. On the contrary, we wait for
3355 + * the block layer to decide when to unplug the device:
3356 + * hopefully, new requests will be merged to this
3357 + * one quickly, then the device will be unplugged
3358 + * and larger requests will be dispatched.
3359 + */
3360 + if (bfqq->queued[rq_is_sync(rq)] == 1 &&
3361 + blk_rq_sectors(rq) < 32) {
3362 + return;
3364 + if (bfq_bfqq_wait_request(bfqq)) {
3365 + /*
3366 + * If we are waiting for a request for this queue, let
3367 + * it rip immediately and flag that we must not expire
3368 + * this queue just now.
3369 + */
3370 + bfq_clear_bfqq_wait_request(bfqq);
3371 + del_timer(&bfqd->idle_slice_timer);
3372 + /*
3373 + * Here we can safely expire the queue, in
3374 + * case of budget timeout, without wasting
3375 + * guarantees
3376 + */
3377 + if (bfq_bfqq_budget_timeout(bfqq))
3378 + bfq_bfqq_expire(bfqd, bfqq, 0,
3379 + BFQ_BFQQ_BUDGET_TIMEOUT);
3380 + __blk_run_queue(bfqd->queue);
3385 +static void bfq_insert_request(struct request_queue *q, struct request *rq)
3387 + struct bfq_data *bfqd = q->elevator->elevator_data;
3388 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
3390 + assert_spin_locked(bfqd->queue->queue_lock);
3391 + bfq_init_prio_data(bfqq, RQ_CIC(rq)->ioc);
3393 + bfq_add_rq_rb(rq);
3395 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3396 + list_add_tail(&rq->queuelist, &bfqq->fifo);
3398 + bfq_rq_enqueued(bfqd, bfqq, rq);
3401 +static void bfq_update_hw_tag(struct bfq_data *bfqd)
3403 + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3404 + bfqd->rq_in_driver);
3406 + /*
3407 + * This sample is valid if the number of outstanding requests
3408 + * is large enough to allow a queueing behavior. Note that the
3409 + * sum is not exact, as it's not taking into account deactivated
3410 + * requests.
3411 + */
3412 + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3413 + return;
3415 + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3416 + return;
3418 + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3419 + bfqd->max_rq_in_driver = 0;
3420 + bfqd->hw_tag_samples = 0;
3423 +static void bfq_completed_request(struct request_queue *q, struct request *rq)
3425 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
3426 + struct bfq_data *bfqd = bfqq->bfqd;
3427 + const int sync = rq_is_sync(rq);
3429 + bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3430 + blk_rq_sectors(rq), sync);
3432 + bfq_update_hw_tag(bfqd);
3434 + WARN_ON(!bfqd->rq_in_driver);
3435 + WARN_ON(!bfqq->dispatched);
3436 + bfqd->rq_in_driver--;
3437 + bfqq->dispatched--;
3439 + if (bfq_bfqq_sync(bfqq))
3440 + bfqd->sync_flight--;
3442 + if (sync)
3443 + RQ_CIC(rq)->ttime.last_end_request = jiffies;
3445 + /*
3446 + * If this is the active queue, check if it needs to be expired,
3447 + * or if we want to idle in case it has no pending requests.
3448 + */
3449 + if (bfqd->active_queue == bfqq) {
3450 + if (bfq_bfqq_budget_new(bfqq))
3451 + bfq_set_budget_timeout(bfqd);
3453 + /* Idling is disabled also for cooperation issues:
3454 + * 1) there is a close cooperator for the queue, or
3455 + * 2) the queue is shared and some cooperator is likely
3456 + * to be idle (in this case, by not arming the idle timer,
3457 + * we try to slow down the queue, to prevent the zones
3458 + * of the disk accessed by the active cooperators to become
3459 + * too distant from the zone that will be accessed by the
3460 + * currently idle cooperators)
3461 + */
3462 + if (bfq_may_expire_for_budg_timeout(bfqq))
3463 + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3464 + else if (sync &&
3465 + (bfqd->rq_in_driver == 0 ||
3466 + bfqq->raising_coeff > 1)
3467 + && RB_EMPTY_ROOT(&bfqq->sort_list)
3468 + && !bfq_close_cooperator(bfqd, bfqq)
3469 + && (!bfq_bfqq_coop(bfqq) ||
3470 + !bfq_bfqq_some_coop_idle(bfqq)))
3471 + bfq_arm_slice_timer(bfqd);
3474 + if (!bfqd->rq_in_driver)
3475 + bfq_schedule_dispatch(bfqd);
3478 +static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3480 + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3481 + bfq_clear_bfqq_must_alloc(bfqq);
3482 + return ELV_MQUEUE_MUST;
3485 + return ELV_MQUEUE_MAY;
3488 +static int bfq_may_queue(struct request_queue *q, int rw)
3490 + struct bfq_data *bfqd = q->elevator->elevator_data;
3491 + struct task_struct *tsk = current;
3492 + struct cfq_io_context *cic;
3493 + struct bfq_queue *bfqq;
3495 + /*
3496 + * Don't force setup of a queue from here, as a call to may_queue
3497 + * does not necessarily imply that a request actually will be queued.
3498 + * So just lookup a possibly existing queue, or return 'may queue'
3499 + * if that fails.
3500 + */
3501 + cic = bfq_cic_lookup(bfqd, tsk->io_context);
3502 + if (cic == NULL)
3503 + return ELV_MQUEUE_MAY;
3505 + bfqq = cic_to_bfqq(cic, rw_is_sync(rw));
3506 + if (bfqq != NULL) {
3507 + bfq_init_prio_data(bfqq, cic->ioc);
3509 + return __bfq_may_queue(bfqq);
3512 + return ELV_MQUEUE_MAY;
3516 + * Queue lock held here.
3517 + */
3518 +static void bfq_put_request(struct request *rq)
3520 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
3522 + if (bfqq != NULL) {
3523 + const int rw = rq_data_dir(rq);
3525 + BUG_ON(!bfqq->allocated[rw]);
3526 + bfqq->allocated[rw]--;
3528 + put_io_context(RQ_CIC(rq)->ioc);
3530 + rq->elevator_private[0] = NULL;
3531 + rq->elevator_private[1] = NULL;
3533 + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3534 + bfqq, atomic_read(&bfqq->ref));
3535 + bfq_put_queue(bfqq);
3539 +static struct bfq_queue *
3540 +bfq_merge_bfqqs(struct bfq_data *bfqd, struct cfq_io_context *cic,
3541 + struct bfq_queue *bfqq)
3543 + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3544 + (long unsigned)bfqq->new_bfqq->pid);
3545 + cic_set_bfqq(cic, bfqq->new_bfqq, 1);
3546 + bfq_mark_bfqq_coop(bfqq->new_bfqq);
3547 + bfq_put_queue(bfqq);
3548 + return cic_to_bfqq(cic, 1);
3552 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3553 + * was the last process referring to said bfqq.
3554 + */
3555 +static struct bfq_queue *
3556 +bfq_split_bfqq(struct cfq_io_context *cic, struct bfq_queue *bfqq)
3558 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3559 + if (bfqq_process_refs(bfqq) == 1) {
3560 + bfqq->pid = current->pid;
3561 + bfq_clear_bfqq_some_coop_idle(bfqq);
3562 + bfq_clear_bfqq_coop(bfqq);
3563 + bfq_clear_bfqq_split_coop(bfqq);
3564 + return bfqq;
3567 + cic_set_bfqq(cic, NULL, 1);
3569 + bfq_put_cooperator(bfqq);
3571 + bfq_put_queue(bfqq);
3572 + return NULL;
3576 + * Allocate bfq data structures associated with this request.
3577 + */
3578 +static int bfq_set_request(struct request_queue *q, struct request *rq,
3579 + gfp_t gfp_mask)
3581 + struct bfq_data *bfqd = q->elevator->elevator_data;
3582 + struct cfq_io_context *cic;
3583 + const int rw = rq_data_dir(rq);
3584 + const int is_sync = rq_is_sync(rq);
3585 + struct bfq_queue *bfqq;
3586 + struct bfq_group *bfqg;
3587 + unsigned long flags;
3589 + might_sleep_if(gfp_mask & __GFP_WAIT);
3591 + cic = bfq_get_io_context(bfqd, gfp_mask);
3593 + spin_lock_irqsave(q->queue_lock, flags);
3595 + if (cic == NULL)
3596 + goto queue_fail;
3598 + bfqg = bfq_cic_update_cgroup(cic);
3600 +new_queue:
3601 + bfqq = cic_to_bfqq(cic, is_sync);
3602 + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3603 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, cic->ioc, gfp_mask);
3604 + cic_set_bfqq(cic, bfqq, is_sync);
3605 + } else {
3606 + /*
3607 + * If the queue was seeky for too long, break it apart.
3608 + */
3609 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3610 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3611 + bfqq = bfq_split_bfqq(cic, bfqq);
3612 + if (!bfqq)
3613 + goto new_queue;
3616 + /*
3617 + * Check to see if this queue is scheduled to merge with
3618 + * another closely cooperating queue. The merging of queues
3619 + * happens here as it must be done in process context.
3620 + * The reference on new_bfqq was taken in merge_bfqqs.
3621 + */
3622 + if (bfqq->new_bfqq != NULL)
3623 + bfqq = bfq_merge_bfqqs(bfqd, cic, bfqq);
3626 + bfqq->allocated[rw]++;
3627 + atomic_inc(&bfqq->ref);
3628 + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3629 + atomic_read(&bfqq->ref));
3631 + spin_unlock_irqrestore(q->queue_lock, flags);
3633 + rq->elevator_private[0] = cic;
3634 + rq->elevator_private[1] = bfqq;
3636 + return 0;
3638 +queue_fail:
3639 + if (cic != NULL)
3640 + put_io_context(cic->ioc);
3642 + bfq_schedule_dispatch(bfqd);
3643 + spin_unlock_irqrestore(q->queue_lock, flags);
3645 + return 1;
3648 +static void bfq_kick_queue(struct work_struct *work)
3650 + struct bfq_data *bfqd =
3651 + container_of(work, struct bfq_data, unplug_work);
3652 + struct request_queue *q = bfqd->queue;
3654 + spin_lock_irq(q->queue_lock);
3655 + __blk_run_queue(q);
3656 + spin_unlock_irq(q->queue_lock);
3660 + * Handler of the expiration of the timer running if the active_queue
3661 + * is idling inside its time slice.
3662 + */
3663 +static void bfq_idle_slice_timer(unsigned long data)
3665 + struct bfq_data *bfqd = (struct bfq_data *)data;
3666 + struct bfq_queue *bfqq;
3667 + unsigned long flags;
3668 + enum bfqq_expiration reason;
3670 + spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3672 + bfqq = bfqd->active_queue;
3673 + /*
3674 + * Theoretical race here: active_queue can be NULL or different
3675 + * from the queue that was idling if the timer handler spins on
3676 + * the queue_lock and a new request arrives for the current
3677 + * queue and there is a full dispatch cycle that changes the
3678 + * active_queue. This can hardly happen, but in the worst case
3679 + * we just expire a queue too early.
3680 + */
3681 + if (bfqq != NULL) {
3682 + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3683 + if (bfq_bfqq_budget_timeout(bfqq))
3684 + /*
3685 + * Also here the queue can be safely expired
3686 + * for budget timeout without wasting
3687 + * guarantees
3688 + */
3689 + reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3690 + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3691 + /*
3692 + * The queue may not be empty upon timer expiration,
3693 + * because we may not disable the timer when the first
3694 + * request of the active queue arrives during
3695 + * disk idling
3696 + */
3697 + reason = BFQ_BFQQ_TOO_IDLE;
3698 + else
3699 + goto schedule_dispatch;
3701 + bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3704 +schedule_dispatch:
3705 + bfq_schedule_dispatch(bfqd);
3707 + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3710 +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3712 + del_timer_sync(&bfqd->idle_slice_timer);
3713 + cancel_work_sync(&bfqd->unplug_work);
3716 +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3717 + struct bfq_queue **bfqq_ptr)
3719 + struct bfq_group *root_group = bfqd->root_group;
3720 + struct bfq_queue *bfqq = *bfqq_ptr;
3722 + bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3723 + if (bfqq != NULL) {
3724 + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3725 + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3726 + bfqq, atomic_read(&bfqq->ref));
3727 + bfq_put_queue(bfqq);
3728 + *bfqq_ptr = NULL;
3733 + * Release all the bfqg references to its async queues. If we are
3734 + * deallocating the group these queues may still contain requests, so
3735 + * we reparent them to the root cgroup (i.e., the only one that will
3736 + * exist for sure untill all the requests on a device are gone).
3737 + */
3738 +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3740 + int i, j;
3742 + for (i = 0; i < 2; i++)
3743 + for (j = 0; j < IOPRIO_BE_NR; j++)
3744 + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3746 + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3749 +static void bfq_exit_queue(struct elevator_queue *e)
3751 + struct bfq_data *bfqd = e->elevator_data;
3752 + struct request_queue *q = bfqd->queue;
3753 + struct bfq_queue *bfqq, *n;
3754 + struct cfq_io_context *cic;
3756 + bfq_shutdown_timer_wq(bfqd);
3758 + spin_lock_irq(q->queue_lock);
3760 + while (!list_empty(&bfqd->cic_list)) {
3761 + cic = list_entry(bfqd->cic_list.next, struct cfq_io_context,
3762 + queue_list);
3763 + __bfq_exit_single_io_context(bfqd, cic);
3766 + BUG_ON(bfqd->active_queue != NULL);
3767 + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3768 + bfq_deactivate_bfqq(bfqd, bfqq, 0);
3770 + bfq_disconnect_groups(bfqd);
3771 + spin_unlock_irq(q->queue_lock);
3773 + bfq_shutdown_timer_wq(bfqd);
3775 + spin_lock(&cic_index_lock);
3776 + ida_remove(&cic_index_ida, bfqd->cic_index);
3777 + spin_unlock(&cic_index_lock);
3779 + /* Wait for cic->key accessors to exit their grace periods. */
3780 + synchronize_rcu();
3782 + BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3784 + bfq_free_root_group(bfqd);
3785 + kfree(bfqd);
3788 +static int bfq_alloc_cic_index(void)
3790 + int index, error;
3792 + do {
3793 + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3794 + return -ENOMEM;
3796 + spin_lock(&cic_index_lock);
3797 + error = ida_get_new(&cic_index_ida, &index);
3798 + spin_unlock(&cic_index_lock);
3799 + if (error && error != -EAGAIN)
3800 + return error;
3801 + } while (error);
3803 + return index;
3806 +static void *bfq_init_queue(struct request_queue *q)
3808 + struct bfq_group *bfqg;
3809 + struct bfq_data *bfqd;
3810 + int i;
3812 + i = bfq_alloc_cic_index();
3813 + if (i < 0)
3814 + return NULL;
3816 + bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3817 + if (bfqd == NULL)
3818 + return NULL;
3820 + bfqd->cic_index = i;
3822 + /*
3823 + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3824 + * Grab a permanent reference to it, so that the normal code flow
3825 + * will not attempt to free it.
3826 + */
3827 + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
3828 + atomic_inc(&bfqd->oom_bfqq.ref);
3830 + INIT_LIST_HEAD(&bfqd->cic_list);
3832 + bfqd->queue = q;
3834 + bfqg = bfq_alloc_root_group(bfqd, q->node);
3835 + if (bfqg == NULL) {
3836 + kfree(bfqd);
3837 + return NULL;
3840 + bfqd->root_group = bfqg;
3842 + init_timer(&bfqd->idle_slice_timer);
3843 + bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3844 + bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3846 + bfqd->rq_pos_tree = RB_ROOT;
3848 + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3850 + INIT_LIST_HEAD(&bfqd->active_list);
3851 + INIT_LIST_HEAD(&bfqd->idle_list);
3853 + bfqd->hw_tag = 1;
3855 + bfqd->bfq_max_budget = bfq_default_max_budget;
3857 + bfqd->bfq_quantum = bfq_quantum;
3858 + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3859 + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3860 + bfqd->bfq_back_max = bfq_back_max;
3861 + bfqd->bfq_back_penalty = bfq_back_penalty;
3862 + bfqd->bfq_slice_idle = bfq_slice_idle;
3863 + bfqd->bfq_class_idle_last_service = 0;
3864 + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3865 + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3866 + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
3868 + bfqd->low_latency = true;
3870 + bfqd->bfq_raising_coeff = 20;
3871 + bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
3872 + bfqd->bfq_raising_max_time = msecs_to_jiffies(7500);
3873 + bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
3874 + bfqd->bfq_raising_max_softrt_rate = 7000;
3876 + return bfqd;
3879 +static void bfq_slab_kill(void)
3881 + if (bfq_pool != NULL)
3882 + kmem_cache_destroy(bfq_pool);
3883 + if (bfq_ioc_pool != NULL)
3884 + kmem_cache_destroy(bfq_ioc_pool);
3887 +static int __init bfq_slab_setup(void)
3889 + bfq_pool = KMEM_CACHE(bfq_queue, 0);
3890 + if (bfq_pool == NULL)
3891 + goto fail;
3893 + bfq_ioc_pool = kmem_cache_create("bfq_io_context",
3894 + sizeof(struct cfq_io_context),
3895 + __alignof__(struct cfq_io_context),
3896 + 0, NULL);
3897 + if (bfq_ioc_pool == NULL)
3898 + goto fail;
3900 + return 0;
3901 +fail:
3902 + bfq_slab_kill();
3903 + return -ENOMEM;
3906 +static ssize_t bfq_var_show(unsigned int var, char *page)
3908 + return sprintf(page, "%d\n", var);
3911 +static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
3913 + unsigned long new_val;
3914 + int ret = strict_strtoul(page, 10, &new_val);
3916 + if (ret == 0)
3917 + *var = new_val;
3919 + return count;
3922 +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
3924 + struct bfq_queue *bfqq;
3925 + struct bfq_data *bfqd = e->elevator_data;
3926 + ssize_t num_char = 0;
3928 + num_char += sprintf(page + num_char, "Active:\n");
3929 + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
3930 + num_char += sprintf(page + num_char,
3931 + "pid%d: weight %hu, dur %d/%u\n",
3932 + bfqq->pid,
3933 + bfqq->entity.weight,
3934 + jiffies_to_msecs(jiffies -
3935 + bfqq->last_rais_start_finish),
3936 + jiffies_to_msecs(bfqq->raising_cur_max_time));
3938 + num_char += sprintf(page + num_char, "Idle:\n");
3939 + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
3940 + num_char += sprintf(page + num_char,
3941 + "pid%d: weight %hu, dur %d/%u\n",
3942 + bfqq->pid,
3943 + bfqq->entity.weight,
3944 + jiffies_to_msecs(jiffies -
3945 + bfqq->last_rais_start_finish),
3946 + jiffies_to_msecs(bfqq->raising_cur_max_time));
3948 + return num_char;
3951 +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3952 +static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3953 +{ \
3954 + struct bfq_data *bfqd = e->elevator_data; \
3955 + unsigned int __data = __VAR; \
3956 + if (__CONV) \
3957 + __data = jiffies_to_msecs(__data); \
3958 + return bfq_var_show(__data, (page)); \
3960 +SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
3961 +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
3962 +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
3963 +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
3964 +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
3965 +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
3966 +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
3967 +SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
3968 +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
3969 +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
3970 +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
3971 +SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
3972 +SHOW_FUNCTION(bfq_raising_max_time_show, bfqd->bfq_raising_max_time, 1);
3973 +SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
3974 +SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
3975 + 1);
3976 +SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
3977 + bfqd->bfq_raising_max_softrt_rate, 0);
3978 +#undef SHOW_FUNCTION
3980 +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
3981 +static ssize_t \
3982 +__FUNC(struct elevator_queue *e, const char *page, size_t count) \
3983 +{ \
3984 + struct bfq_data *bfqd = e->elevator_data; \
3985 + unsigned long __data; \
3986 + int ret = bfq_var_store(&__data, (page), count); \
3987 + if (__data < (MIN)) \
3988 + __data = (MIN); \
3989 + else if (__data > (MAX)) \
3990 + __data = (MAX); \
3991 + if (__CONV) \
3992 + *(__PTR) = msecs_to_jiffies(__data); \
3993 + else \
3994 + *(__PTR) = __data; \
3995 + return ret; \
3997 +STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
3998 +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
3999 + INT_MAX, 1);
4000 +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4001 + INT_MAX, 1);
4002 +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4003 +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4004 + INT_MAX, 0);
4005 +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4006 +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4007 + 1, INT_MAX, 0);
4008 +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4009 + INT_MAX, 1);
4010 +STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4011 + INT_MAX, 0);
4012 +STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4013 + INT_MAX, 1);
4014 +STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4015 + INT_MAX, 1);
4016 +STORE_FUNCTION(bfq_raising_min_idle_time_store,
4017 + &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4018 +STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4019 + &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4020 +#undef STORE_FUNCTION
4022 +/* do nothing for the moment */
4023 +static ssize_t bfq_weights_store(struct elevator_queue *e,
4024 + const char *page, size_t count)
4026 + return count;
4029 +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4031 + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4033 + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4034 + return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4035 + else
4036 + return bfq_default_max_budget;
4039 +static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4040 + const char *page, size_t count)
4042 + struct bfq_data *bfqd = e->elevator_data;
4043 + unsigned long __data;
4044 + int ret = bfq_var_store(&__data, (page), count);
4046 + if (__data == 0)
4047 + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4048 + else {
4049 + if (__data > INT_MAX)
4050 + __data = INT_MAX;
4051 + bfqd->bfq_max_budget = __data;
4054 + bfqd->bfq_user_max_budget = __data;
4056 + return ret;
4059 +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4060 + const char *page, size_t count)
4062 + struct bfq_data *bfqd = e->elevator_data;
4063 + unsigned long __data;
4064 + int ret = bfq_var_store(&__data, (page), count);
4066 + if (__data < 1)
4067 + __data = 1;
4068 + else if (__data > INT_MAX)
4069 + __data = INT_MAX;
4071 + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4072 + if (bfqd->bfq_user_max_budget == 0)
4073 + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4075 + return ret;
4078 +static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4079 + const char *page, size_t count)
4081 + struct bfq_data *bfqd = e->elevator_data;
4082 + unsigned long __data;
4083 + int ret = bfq_var_store(&__data, (page), count);
4085 + if (__data > 1)
4086 + __data = 1;
4087 + bfqd->low_latency = __data;
4089 + return ret;
4092 +#define BFQ_ATTR(name) \
4093 + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4095 +static struct elv_fs_entry bfq_attrs[] = {
4096 + BFQ_ATTR(quantum),
4097 + BFQ_ATTR(fifo_expire_sync),
4098 + BFQ_ATTR(fifo_expire_async),
4099 + BFQ_ATTR(back_seek_max),
4100 + BFQ_ATTR(back_seek_penalty),
4101 + BFQ_ATTR(slice_idle),
4102 + BFQ_ATTR(max_budget),
4103 + BFQ_ATTR(max_budget_async_rq),
4104 + BFQ_ATTR(timeout_sync),
4105 + BFQ_ATTR(timeout_async),
4106 + BFQ_ATTR(low_latency),
4107 + BFQ_ATTR(raising_coeff),
4108 + BFQ_ATTR(raising_max_time),
4109 + BFQ_ATTR(raising_rt_max_time),
4110 + BFQ_ATTR(raising_min_idle_time),
4111 + BFQ_ATTR(raising_max_softrt_rate),
4112 + BFQ_ATTR(weights),
4113 + __ATTR_NULL
4116 +static struct elevator_type iosched_bfq = {
4117 + .ops = {
4118 + .elevator_merge_fn = bfq_merge,
4119 + .elevator_merged_fn = bfq_merged_request,
4120 + .elevator_merge_req_fn = bfq_merged_requests,
4121 + .elevator_allow_merge_fn = bfq_allow_merge,
4122 + .elevator_dispatch_fn = bfq_dispatch_requests,
4123 + .elevator_add_req_fn = bfq_insert_request,
4124 + .elevator_activate_req_fn = bfq_activate_request,
4125 + .elevator_deactivate_req_fn = bfq_deactivate_request,
4126 + .elevator_completed_req_fn = bfq_completed_request,
4127 + .elevator_former_req_fn = elv_rb_former_request,
4128 + .elevator_latter_req_fn = elv_rb_latter_request,
4129 + .elevator_set_req_fn = bfq_set_request,
4130 + .elevator_put_req_fn = bfq_put_request,
4131 + .elevator_may_queue_fn = bfq_may_queue,
4132 + .elevator_init_fn = bfq_init_queue,
4133 + .elevator_exit_fn = bfq_exit_queue,
4134 + .trim = bfq_free_io_context,
4135 + },
4136 + .elevator_attrs = bfq_attrs,
4137 + .elevator_name = "bfq",
4138 + .elevator_owner = THIS_MODULE,
4141 +static int __init bfq_init(void)
4143 + /*
4144 + * Can be 0 on HZ < 1000 setups.
4145 + */
4146 + if (bfq_slice_idle == 0)
4147 + bfq_slice_idle = 1;
4149 + if (bfq_timeout_async == 0)
4150 + bfq_timeout_async = 1;
4152 + if (bfq_slab_setup())
4153 + return -ENOMEM;
4155 + elv_register(&iosched_bfq);
4157 + return 0;
4160 +static void __exit bfq_exit(void)
4162 + DECLARE_COMPLETION_ONSTACK(all_gone);
4163 + elv_unregister(&iosched_bfq);
4164 + bfq_ioc_gone = &all_gone;
4165 + /* bfq_ioc_gone's update must be visible before reading bfq_ioc_count */
4166 + smp_wmb();
4167 + if (elv_ioc_count_read(bfq_ioc_count) != 0)
4168 + wait_for_completion(&all_gone);
4169 + ida_destroy(&cic_index_ida);
4170 + bfq_slab_kill();
4173 +module_init(bfq_init);
4174 +module_exit(bfq_exit);
4176 +MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4177 +MODULE_LICENSE("GPL");
4178 +MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4179 diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4180 new file mode 100644
4181 index 0000000..1551839
4182 --- /dev/null
4183 +++ b/block/bfq-sched.c
4184 @@ -0,0 +1,1037 @@
4186 + * BFQ: Hierarchical B-WF2Q+ scheduler.
4188 + * Based on ideas and code from CFQ:
4189 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
4191 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
4192 + * Paolo Valente <paolo.valente@unimore.it>
4193 + */
4195 +#ifdef CONFIG_CGROUP_BFQIO
4196 +#define for_each_entity(entity) \
4197 + for (; entity != NULL; entity = entity->parent)
4199 +#define for_each_entity_safe(entity, parent) \
4200 + for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4202 +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4203 + int extract,
4204 + struct bfq_data *bfqd);
4206 +static int bfq_update_next_active(struct bfq_sched_data *sd)
4208 + struct bfq_group *bfqg;
4209 + struct bfq_entity *entity, *next_active;
4211 + if (sd->active_entity != NULL)
4212 + /* will update/requeue at the end of service */
4213 + return 0;
4215 + /*
4216 + * NOTE: this can be improved in many ways, such as returning
4217 + * 1 (and thus propagating upwards the update) only when the
4218 + * budget changes, or caching the bfqq that will be scheduled
4219 + * next from this subtree. By now we worry more about
4220 + * correctness than about performance...
4221 + */
4222 + next_active = bfq_lookup_next_entity(sd, 0, NULL);
4223 + sd->next_active = next_active;
4225 + if (next_active != NULL) {
4226 + bfqg = container_of(sd, struct bfq_group, sched_data);
4227 + entity = bfqg->my_entity;
4228 + if (entity != NULL)
4229 + entity->budget = next_active->budget;
4232 + return 1;
4235 +static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4236 + struct bfq_entity *entity)
4238 + BUG_ON(sd->next_active != entity);
4240 +#else
4241 +#define for_each_entity(entity) \
4242 + for (; entity != NULL; entity = NULL)
4244 +#define for_each_entity_safe(entity, parent) \
4245 + for (parent = NULL; entity != NULL; entity = parent)
4247 +static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4249 + return 0;
4252 +static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4253 + struct bfq_entity *entity)
4256 +#endif
4259 + * Shift for timestamp calculations. This actually limits the maximum
4260 + * service allowed in one timestamp delta (small shift values increase it),
4261 + * the maximum total weight that can be used for the queues in the system
4262 + * (big shift values increase it), and the period of virtual time wraparounds.
4263 + */
4264 +#define WFQ_SERVICE_SHIFT 22
4266 +/**
4267 + * bfq_gt - compare two timestamps.
4268 + * @a: first ts.
4269 + * @b: second ts.
4271 + * Return @a > @b, dealing with wrapping correctly.
4272 + */
4273 +static inline int bfq_gt(u64 a, u64 b)
4275 + return (s64)(a - b) > 0;
4278 +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4280 + struct bfq_queue *bfqq = NULL;
4282 + BUG_ON(entity == NULL);
4284 + if (entity->my_sched_data == NULL)
4285 + bfqq = container_of(entity, struct bfq_queue, entity);
4287 + return bfqq;
4291 +/**
4292 + * bfq_delta - map service into the virtual time domain.
4293 + * @service: amount of service.
4294 + * @weight: scale factor (weight of an entity or weight sum).
4295 + */
4296 +static inline u64 bfq_delta(unsigned long service,
4297 + unsigned long weight)
4299 + u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4301 + do_div(d, weight);
4302 + return d;
4305 +/**
4306 + * bfq_calc_finish - assign the finish time to an entity.
4307 + * @entity: the entity to act upon.
4308 + * @service: the service to be charged to the entity.
4309 + */
4310 +static inline void bfq_calc_finish(struct bfq_entity *entity,
4311 + unsigned long service)
4313 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4315 + BUG_ON(entity->weight == 0);
4317 + entity->finish = entity->start +
4318 + bfq_delta(service, entity->weight);
4320 + if (bfqq != NULL) {
4321 + bfq_log_bfqq(bfqq->bfqd, bfqq,
4322 + "calc_finish: serv %lu, w %d",
4323 + service, entity->weight);
4324 + bfq_log_bfqq(bfqq->bfqd, bfqq,
4325 + "calc_finish: start %llu, finish %llu, delta %llu",
4326 + entity->start, entity->finish,
4327 + bfq_delta(service, entity->weight));
4331 +/**
4332 + * bfq_entity_of - get an entity from a node.
4333 + * @node: the node field of the entity.
4335 + * Convert a node pointer to the relative entity. This is used only
4336 + * to simplify the logic of some functions and not as the generic
4337 + * conversion mechanism because, e.g., in the tree walking functions,
4338 + * the check for a %NULL value would be redundant.
4339 + */
4340 +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4342 + struct bfq_entity *entity = NULL;
4344 + if (node != NULL)
4345 + entity = rb_entry(node, struct bfq_entity, rb_node);
4347 + return entity;
4350 +/**
4351 + * bfq_extract - remove an entity from a tree.
4352 + * @root: the tree root.
4353 + * @entity: the entity to remove.
4354 + */
4355 +static inline void bfq_extract(struct rb_root *root,
4356 + struct bfq_entity *entity)
4358 + BUG_ON(entity->tree != root);
4360 + entity->tree = NULL;
4361 + rb_erase(&entity->rb_node, root);
4364 +/**
4365 + * bfq_idle_extract - extract an entity from the idle tree.
4366 + * @st: the service tree of the owning @entity.
4367 + * @entity: the entity being removed.
4368 + */
4369 +static void bfq_idle_extract(struct bfq_service_tree *st,
4370 + struct bfq_entity *entity)
4372 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4373 + struct rb_node *next;
4375 + BUG_ON(entity->tree != &st->idle);
4377 + if (entity == st->first_idle) {
4378 + next = rb_next(&entity->rb_node);
4379 + st->first_idle = bfq_entity_of(next);
4382 + if (entity == st->last_idle) {
4383 + next = rb_prev(&entity->rb_node);
4384 + st->last_idle = bfq_entity_of(next);
4387 + bfq_extract(&st->idle, entity);
4389 + if (bfqq != NULL)
4390 + list_del(&bfqq->bfqq_list);
4393 +/**
4394 + * bfq_insert - generic tree insertion.
4395 + * @root: tree root.
4396 + * @entity: entity to insert.
4398 + * This is used for the idle and the active tree, since they are both
4399 + * ordered by finish time.
4400 + */
4401 +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4403 + struct bfq_entity *entry;
4404 + struct rb_node **node = &root->rb_node;
4405 + struct rb_node *parent = NULL;
4407 + BUG_ON(entity->tree != NULL);
4409 + while (*node != NULL) {
4410 + parent = *node;
4411 + entry = rb_entry(parent, struct bfq_entity, rb_node);
4413 + if (bfq_gt(entry->finish, entity->finish))
4414 + node = &parent->rb_left;
4415 + else
4416 + node = &parent->rb_right;
4419 + rb_link_node(&entity->rb_node, parent, node);
4420 + rb_insert_color(&entity->rb_node, root);
4422 + entity->tree = root;
4425 +/**
4426 + * bfq_update_min - update the min_start field of an entity.
4427 + * @entity: the entity to update.
4428 + * @node: one of its children.
4430 + * This function is called when @entity may store an invalid value for
4431 + * min_start due to updates to the active tree. The function assumes
4432 + * that the subtree rooted at @node (which may be its left or its right
4433 + * child) has a valid min_start value.
4434 + */
4435 +static inline void bfq_update_min(struct bfq_entity *entity,
4436 + struct rb_node *node)
4438 + struct bfq_entity *child;
4440 + if (node != NULL) {
4441 + child = rb_entry(node, struct bfq_entity, rb_node);
4442 + if (bfq_gt(entity->min_start, child->min_start))
4443 + entity->min_start = child->min_start;
4447 +/**
4448 + * bfq_update_active_node - recalculate min_start.
4449 + * @node: the node to update.
4451 + * @node may have changed position or one of its children may have moved,
4452 + * this function updates its min_start value. The left and right subtrees
4453 + * are assumed to hold a correct min_start value.
4454 + */
4455 +static inline void bfq_update_active_node(struct rb_node *node)
4457 + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4459 + entity->min_start = entity->start;
4460 + bfq_update_min(entity, node->rb_right);
4461 + bfq_update_min(entity, node->rb_left);
4464 +/**
4465 + * bfq_update_active_tree - update min_start for the whole active tree.
4466 + * @node: the starting node.
4468 + * @node must be the deepest modified node after an update. This function
4469 + * updates its min_start using the values held by its children, assuming
4470 + * that they did not change, and then updates all the nodes that may have
4471 + * changed in the path to the root. The only nodes that may have changed
4472 + * are the ones in the path or their siblings.
4473 + */
4474 +static void bfq_update_active_tree(struct rb_node *node)
4476 + struct rb_node *parent;
4478 +up:
4479 + bfq_update_active_node(node);
4481 + parent = rb_parent(node);
4482 + if (parent == NULL)
4483 + return;
4485 + if (node == parent->rb_left && parent->rb_right != NULL)
4486 + bfq_update_active_node(parent->rb_right);
4487 + else if (parent->rb_left != NULL)
4488 + bfq_update_active_node(parent->rb_left);
4490 + node = parent;
4491 + goto up;
4494 +/**
4495 + * bfq_active_insert - insert an entity in the active tree of its group/device.
4496 + * @st: the service tree of the entity.
4497 + * @entity: the entity being inserted.
4499 + * The active tree is ordered by finish time, but an extra key is kept
4500 + * per each node, containing the minimum value for the start times of
4501 + * its children (and the node itself), so it's possible to search for
4502 + * the eligible node with the lowest finish time in logarithmic time.
4503 + */
4504 +static void bfq_active_insert(struct bfq_service_tree *st,
4505 + struct bfq_entity *entity)
4507 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4508 + struct rb_node *node = &entity->rb_node;
4510 + bfq_insert(&st->active, entity);
4512 + if (node->rb_left != NULL)
4513 + node = node->rb_left;
4514 + else if (node->rb_right != NULL)
4515 + node = node->rb_right;
4517 + bfq_update_active_tree(node);
4519 + if (bfqq != NULL)
4520 + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4523 +/**
4524 + * bfq_ioprio_to_weight - calc a weight from an ioprio.
4525 + * @ioprio: the ioprio value to convert.
4526 + */
4527 +static unsigned short bfq_ioprio_to_weight(int ioprio)
4529 + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4530 + return IOPRIO_BE_NR - ioprio;
4533 +/**
4534 + * bfq_weight_to_ioprio - calc an ioprio from a weight.
4535 + * @weight: the weight value to convert.
4537 + * To preserve as much as possible the old only-ioprio user interface,
4538 + * 0 is used as an escape ioprio value for weights (numerically) equal or
4539 + * larger than IOPRIO_BE_NR
4540 + */
4541 +static unsigned short bfq_weight_to_ioprio(int weight)
4543 + WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4544 + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4547 +static inline void bfq_get_entity(struct bfq_entity *entity)
4549 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4550 + struct bfq_sched_data *sd;
4552 + if (bfqq != NULL) {
4553 + sd = entity->sched_data;
4554 + atomic_inc(&bfqq->ref);
4555 + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4556 + bfqq, atomic_read(&bfqq->ref));
4560 +/**
4561 + * bfq_find_deepest - find the deepest node that an extraction can modify.
4562 + * @node: the node being removed.
4564 + * Do the first step of an extraction in an rb tree, looking for the
4565 + * node that will replace @node, and returning the deepest node that
4566 + * the following modifications to the tree can touch. If @node is the
4567 + * last node in the tree return %NULL.
4568 + */
4569 +static struct rb_node *bfq_find_deepest(struct rb_node *node)
4571 + struct rb_node *deepest;
4573 + if (node->rb_right == NULL && node->rb_left == NULL)
4574 + deepest = rb_parent(node);
4575 + else if (node->rb_right == NULL)
4576 + deepest = node->rb_left;
4577 + else if (node->rb_left == NULL)
4578 + deepest = node->rb_right;
4579 + else {
4580 + deepest = rb_next(node);
4581 + if (deepest->rb_right != NULL)
4582 + deepest = deepest->rb_right;
4583 + else if (rb_parent(deepest) != node)
4584 + deepest = rb_parent(deepest);
4587 + return deepest;
4590 +/**
4591 + * bfq_active_extract - remove an entity from the active tree.
4592 + * @st: the service_tree containing the tree.
4593 + * @entity: the entity being removed.
4594 + */
4595 +static void bfq_active_extract(struct bfq_service_tree *st,
4596 + struct bfq_entity *entity)
4598 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4599 + struct rb_node *node;
4601 + node = bfq_find_deepest(&entity->rb_node);
4602 + bfq_extract(&st->active, entity);
4604 + if (node != NULL)
4605 + bfq_update_active_tree(node);
4607 + if (bfqq != NULL)
4608 + list_del(&bfqq->bfqq_list);
4611 +/**
4612 + * bfq_idle_insert - insert an entity into the idle tree.
4613 + * @st: the service tree containing the tree.
4614 + * @entity: the entity to insert.
4615 + */
4616 +static void bfq_idle_insert(struct bfq_service_tree *st,
4617 + struct bfq_entity *entity)
4619 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4620 + struct bfq_entity *first_idle = st->first_idle;
4621 + struct bfq_entity *last_idle = st->last_idle;
4623 + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4624 + st->first_idle = entity;
4625 + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4626 + st->last_idle = entity;
4628 + bfq_insert(&st->idle, entity);
4630 + if (bfqq != NULL)
4631 + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4634 +/**
4635 + * bfq_forget_entity - remove an entity from the wfq trees.
4636 + * @st: the service tree.
4637 + * @entity: the entity being removed.
4639 + * Update the device status and forget everything about @entity, putting
4640 + * the device reference to it, if it is a queue. Entities belonging to
4641 + * groups are not refcounted.
4642 + */
4643 +static void bfq_forget_entity(struct bfq_service_tree *st,
4644 + struct bfq_entity *entity)
4646 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4647 + struct bfq_sched_data *sd;
4649 + BUG_ON(!entity->on_st);
4651 + entity->on_st = 0;
4652 + st->wsum -= entity->weight;
4653 + if (bfqq != NULL) {
4654 + sd = entity->sched_data;
4655 + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4656 + bfqq, atomic_read(&bfqq->ref));
4657 + bfq_put_queue(bfqq);
4661 +/**
4662 + * bfq_put_idle_entity - release the idle tree ref of an entity.
4663 + * @st: service tree for the entity.
4664 + * @entity: the entity being released.
4665 + */
4666 +static void bfq_put_idle_entity(struct bfq_service_tree *st,
4667 + struct bfq_entity *entity)
4669 + bfq_idle_extract(st, entity);
4670 + bfq_forget_entity(st, entity);
4673 +/**
4674 + * bfq_forget_idle - update the idle tree if necessary.
4675 + * @st: the service tree to act upon.
4677 + * To preserve the global O(log N) complexity we only remove one entry here;
4678 + * as the idle tree will not grow indefinitely this can be done safely.
4679 + */
4680 +static void bfq_forget_idle(struct bfq_service_tree *st)
4682 + struct bfq_entity *first_idle = st->first_idle;
4683 + struct bfq_entity *last_idle = st->last_idle;
4685 + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4686 + !bfq_gt(last_idle->finish, st->vtime)) {
4687 + /*
4688 + * Forget the whole idle tree, increasing the vtime past
4689 + * the last finish time of idle entities.
4690 + */
4691 + st->vtime = last_idle->finish;
4694 + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4695 + bfq_put_idle_entity(st, first_idle);
4698 +static struct bfq_service_tree *
4699 +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4700 + struct bfq_entity *entity)
4702 + struct bfq_service_tree *new_st = old_st;
4704 + if (entity->ioprio_changed) {
4705 + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4707 + BUG_ON(old_st->wsum < entity->weight);
4708 + old_st->wsum -= entity->weight;
4710 + if (entity->new_weight != entity->orig_weight) {
4711 + entity->orig_weight = entity->new_weight;
4712 + entity->ioprio =
4713 + bfq_weight_to_ioprio(entity->orig_weight);
4714 + } else if (entity->new_ioprio != entity->ioprio) {
4715 + entity->ioprio = entity->new_ioprio;
4716 + entity->orig_weight =
4717 + bfq_ioprio_to_weight(entity->ioprio);
4718 + } else
4719 + entity->new_weight = entity->orig_weight =
4720 + bfq_ioprio_to_weight(entity->ioprio);
4722 + entity->ioprio_class = entity->new_ioprio_class;
4723 + entity->ioprio_changed = 0;
4725 + /*
4726 + * NOTE: here we may be changing the weight too early,
4727 + * this will cause unfairness. The correct approach
4728 + * would have required additional complexity to defer
4729 + * weight changes to the proper time instants (i.e.,
4730 + * when entity->finish <= old_st->vtime).
4731 + */
4732 + new_st = bfq_entity_service_tree(entity);
4733 + entity->weight = entity->orig_weight *
4734 + (bfqq != NULL ? bfqq->raising_coeff : 1);
4735 + new_st->wsum += entity->weight;
4737 + if (new_st != old_st)
4738 + entity->start = new_st->vtime;
4741 + return new_st;
4744 +/**
4745 + * bfq_bfqq_served - update the scheduler status after selection for service.
4746 + * @bfqq: the queue being served.
4747 + * @served: bytes to transfer.
4749 + * NOTE: this can be optimized, as the timestamps of upper level entities
4750 + * are synchronized every time a new bfqq is selected for service. By now,
4751 + * we keep it to better check consistency.
4752 + */
4753 +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4755 + struct bfq_entity *entity = &bfqq->entity;
4756 + struct bfq_service_tree *st;
4758 + for_each_entity(entity) {
4759 + st = bfq_entity_service_tree(entity);
4761 + entity->service += served;
4762 + WARN_ON_ONCE(entity->service > entity->budget);
4763 + BUG_ON(st->wsum == 0);
4765 + st->vtime += bfq_delta(served, st->wsum);
4766 + bfq_forget_idle(st);
4768 + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4771 +/**
4772 + * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4773 + * @bfqq: the queue that needs a service update.
4775 + * When it's not possible to be fair in the service domain, because
4776 + * a queue is not consuming its budget fast enough (the meaning of
4777 + * fast depends on the timeout parameter), we charge it a full
4778 + * budget. In this way we should obtain a sort of time-domain
4779 + * fairness among all the seeky/slow queues.
4780 + */
4781 +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
4783 + struct bfq_entity *entity = &bfqq->entity;
4785 + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
4787 + bfq_bfqq_served(bfqq, entity->budget - entity->service);
4790 +/**
4791 + * __bfq_activate_entity - activate an entity.
4792 + * @entity: the entity being activated.
4794 + * Called whenever an entity is activated, i.e., it is not active and one
4795 + * of its children receives a new request, or has to be reactivated due to
4796 + * budget exhaustion. It uses the current budget of the entity (and the
4797 + * service received if @entity is active) of the queue to calculate its
4798 + * timestamps.
4799 + */
4800 +static void __bfq_activate_entity(struct bfq_entity *entity)
4802 + struct bfq_sched_data *sd = entity->sched_data;
4803 + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4805 + if (entity == sd->active_entity) {
4806 + BUG_ON(entity->tree != NULL);
4807 + /*
4808 + * If we are requeueing the current entity we have
4809 + * to take care of not charging to it service it has
4810 + * not received.
4811 + */
4812 + bfq_calc_finish(entity, entity->service);
4813 + entity->start = entity->finish;
4814 + sd->active_entity = NULL;
4815 + } else if (entity->tree == &st->active) {
4816 + /*
4817 + * Requeueing an entity due to a change of some
4818 + * next_active entity below it. We reuse the old
4819 + * start time.
4820 + */
4821 + bfq_active_extract(st, entity);
4822 + } else if (entity->tree == &st->idle) {
4823 + /*
4824 + * Must be on the idle tree, bfq_idle_extract() will
4825 + * check for that.
4826 + */
4827 + bfq_idle_extract(st, entity);
4828 + entity->start = bfq_gt(st->vtime, entity->finish) ?
4829 + st->vtime : entity->finish;
4830 + } else {
4831 + /*
4832 + * The finish time of the entity may be invalid, and
4833 + * it is in the past for sure, otherwise the queue
4834 + * would have been on the idle tree.
4835 + */
4836 + entity->start = st->vtime;
4837 + st->wsum += entity->weight;
4838 + bfq_get_entity(entity);
4840 + BUG_ON(entity->on_st);
4841 + entity->on_st = 1;
4844 + st = __bfq_entity_update_weight_prio(st, entity);
4845 + bfq_calc_finish(entity, entity->budget);
4846 + bfq_active_insert(st, entity);
4849 +/**
4850 + * bfq_activate_entity - activate an entity and its ancestors if necessary.
4851 + * @entity: the entity to activate.
4853 + * Activate @entity and all the entities on the path from it to the root.
4854 + */
4855 +static void bfq_activate_entity(struct bfq_entity *entity)
4857 + struct bfq_sched_data *sd;
4859 + for_each_entity(entity) {
4860 + __bfq_activate_entity(entity);
4862 + sd = entity->sched_data;
4863 + if (!bfq_update_next_active(sd))
4864 + /*
4865 + * No need to propagate the activation to the
4866 + * upper entities, as they will be updated when
4867 + * the active entity is rescheduled.
4868 + */
4869 + break;
4873 +/**
4874 + * __bfq_deactivate_entity - deactivate an entity from its service tree.
4875 + * @entity: the entity to deactivate.
4876 + * @requeue: if false, the entity will not be put into the idle tree.
4878 + * Deactivate an entity, independently from its previous state. If the
4879 + * entity was not on a service tree just return, otherwise if it is on
4880 + * any scheduler tree, extract it from that tree, and if necessary
4881 + * and if the caller did not specify @requeue, put it on the idle tree.
4883 + * Return %1 if the caller should update the entity hierarchy, i.e.,
4884 + * if the entity was under service or if it was the next_active for
4885 + * its sched_data; return %0 otherwise.
4886 + */
4887 +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4889 + struct bfq_sched_data *sd = entity->sched_data;
4890 + struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4891 + int was_active = entity == sd->active_entity;
4892 + int ret = 0;
4894 + if (!entity->on_st)
4895 + return 0;
4897 + BUG_ON(was_active && entity->tree != NULL);
4899 + if (was_active) {
4900 + bfq_calc_finish(entity, entity->service);
4901 + sd->active_entity = NULL;
4902 + } else if (entity->tree == &st->active)
4903 + bfq_active_extract(st, entity);
4904 + else if (entity->tree == &st->idle)
4905 + bfq_idle_extract(st, entity);
4906 + else if (entity->tree != NULL)
4907 + BUG();
4909 + if (was_active || sd->next_active == entity)
4910 + ret = bfq_update_next_active(sd);
4912 + if (!requeue || !bfq_gt(entity->finish, st->vtime))
4913 + bfq_forget_entity(st, entity);
4914 + else
4915 + bfq_idle_insert(st, entity);
4917 + BUG_ON(sd->active_entity == entity);
4918 + BUG_ON(sd->next_active == entity);
4920 + return ret;
4923 +/**
4924 + * bfq_deactivate_entity - deactivate an entity.
4925 + * @entity: the entity to deactivate.
4926 + * @requeue: true if the entity can be put on the idle tree
4927 + */
4928 +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
4930 + struct bfq_sched_data *sd;
4931 + struct bfq_entity *parent;
4933 + for_each_entity_safe(entity, parent) {
4934 + sd = entity->sched_data;
4936 + if (!__bfq_deactivate_entity(entity, requeue))
4937 + /*
4938 + * The parent entity is still backlogged, and
4939 + * we don't need to update it as it is still
4940 + * under service.
4941 + */
4942 + break;
4944 + if (sd->next_active != NULL)
4945 + /*
4946 + * The parent entity is still backlogged and
4947 + * the budgets on the path towards the root
4948 + * need to be updated.
4949 + */
4950 + goto update;
4952 + /*
4953 + * If we reach there the parent is no more backlogged and
4954 + * we want to propagate the dequeue upwards.
4955 + */
4956 + requeue = 1;
4959 + return;
4961 +update:
4962 + entity = parent;
4963 + for_each_entity(entity) {
4964 + __bfq_activate_entity(entity);
4966 + sd = entity->sched_data;
4967 + if (!bfq_update_next_active(sd))
4968 + break;
4972 +/**
4973 + * bfq_update_vtime - update vtime if necessary.
4974 + * @st: the service tree to act upon.
4976 + * If necessary update the service tree vtime to have at least one
4977 + * eligible entity, skipping to its start time. Assumes that the
4978 + * active tree of the device is not empty.
4980 + * NOTE: this hierarchical implementation updates vtimes quite often,
4981 + * we may end up with reactivated tasks getting timestamps after a
4982 + * vtime skip done because we needed a ->first_active entity on some
4983 + * intermediate node.
4984 + */
4985 +static void bfq_update_vtime(struct bfq_service_tree *st)
4987 + struct bfq_entity *entry;
4988 + struct rb_node *node = st->active.rb_node;
4990 + entry = rb_entry(node, struct bfq_entity, rb_node);
4991 + if (bfq_gt(entry->min_start, st->vtime)) {
4992 + st->vtime = entry->min_start;
4993 + bfq_forget_idle(st);
4997 +/**
4998 + * bfq_first_active - find the eligible entity with the smallest finish time
4999 + * @st: the service tree to select from.
5001 + * This function searches the first schedulable entity, starting from the
5002 + * root of the tree and going on the left every time on this side there is
5003 + * a subtree with at least one eligible (start >= vtime) entity. The path
5004 + * on the right is followed only if a) the left subtree contains no eligible
5005 + * entities and b) no eligible entity has been found yet.
5006 + */
5007 +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5009 + struct bfq_entity *entry, *first = NULL;
5010 + struct rb_node *node = st->active.rb_node;
5012 + while (node != NULL) {
5013 + entry = rb_entry(node, struct bfq_entity, rb_node);
5014 +left:
5015 + if (!bfq_gt(entry->start, st->vtime))
5016 + first = entry;
5018 + BUG_ON(bfq_gt(entry->min_start, st->vtime));
5020 + if (node->rb_left != NULL) {
5021 + entry = rb_entry(node->rb_left,
5022 + struct bfq_entity, rb_node);
5023 + if (!bfq_gt(entry->min_start, st->vtime)) {
5024 + node = node->rb_left;
5025 + goto left;
5028 + if (first != NULL)
5029 + break;
5030 + node = node->rb_right;
5033 + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5034 + return first;
5037 +/**
5038 + * __bfq_lookup_next_entity - return the first eligible entity in @st.
5039 + * @st: the service tree.
5041 + * Update the virtual time in @st and return the first eligible entity
5042 + * it contains.
5043 + */
5044 +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st)
5046 + struct bfq_entity *entity;
5048 + if (RB_EMPTY_ROOT(&st->active))
5049 + return NULL;
5051 + bfq_update_vtime(st);
5052 + entity = bfq_first_active_entity(st);
5053 + BUG_ON(bfq_gt(entity->start, st->vtime));
5055 + return entity;
5058 +/**
5059 + * bfq_lookup_next_entity - return the first eligible entity in @sd.
5060 + * @sd: the sched_data.
5061 + * @extract: if true the returned entity will be also extracted from @sd.
5063 + * NOTE: since we cache the next_active entity at each level of the
5064 + * hierarchy, the complexity of the lookup can be decreased with
5065 + * absolutely no effort just returning the cached next_active value;
5066 + * we prefer to do full lookups to test the consistency of the data
5067 + * structures.
5068 + */
5069 +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5070 + int extract,
5071 + struct bfq_data *bfqd)
5073 + struct bfq_service_tree *st = sd->service_tree;
5074 + struct bfq_entity *entity;
5075 + int i=0;
5077 + BUG_ON(sd->active_entity != NULL);
5079 + if (bfqd != NULL &&
5080 + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5081 + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1);
5082 + if (entity != NULL) {
5083 + i = BFQ_IOPRIO_CLASSES - 1;
5084 + bfqd->bfq_class_idle_last_service = jiffies;
5085 + sd->next_active = entity;
5088 + for (; i < BFQ_IOPRIO_CLASSES; i++) {
5089 + entity = __bfq_lookup_next_entity(st + i);
5090 + if (entity != NULL) {
5091 + if (extract) {
5092 + bfq_check_next_active(sd, entity);
5093 + bfq_active_extract(st + i, entity);
5094 + sd->active_entity = entity;
5095 + sd->next_active = NULL;
5097 + break;
5101 + return entity;
5105 + * Get next queue for service.
5106 + */
5107 +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5109 + struct bfq_entity *entity = NULL;
5110 + struct bfq_sched_data *sd;
5111 + struct bfq_queue *bfqq;
5113 + BUG_ON(bfqd->active_queue != NULL);
5115 + if (bfqd->busy_queues == 0)
5116 + return NULL;
5118 + sd = &bfqd->root_group->sched_data;
5119 + for (; sd != NULL; sd = entity->my_sched_data) {
5120 + entity = bfq_lookup_next_entity(sd, 1, bfqd);
5121 + BUG_ON(entity == NULL);
5122 + entity->service = 0;
5125 + bfqq = bfq_entity_to_bfqq(entity);
5126 + BUG_ON(bfqq == NULL);
5128 + return bfqq;
5132 + * Forced extraction of the given queue.
5133 + */
5134 +static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5135 + struct bfq_queue *bfqq)
5137 + struct bfq_entity *entity;
5138 + struct bfq_sched_data *sd;
5140 + BUG_ON(bfqd->active_queue != NULL);
5142 + entity = &bfqq->entity;
5143 + /*
5144 + * Bubble up extraction/update from the leaf to the root.
5145 + */
5146 + for_each_entity(entity) {
5147 + sd = entity->sched_data;
5148 + bfq_update_vtime(bfq_entity_service_tree(entity));
5149 + bfq_active_extract(bfq_entity_service_tree(entity), entity);
5150 + sd->active_entity = entity;
5151 + sd->next_active = NULL;
5152 + entity->service = 0;
5155 + return;
5158 +static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
5160 + if (bfqd->active_cic != NULL) {
5161 + put_io_context(bfqd->active_cic->ioc);
5162 + bfqd->active_cic = NULL;
5165 + bfqd->active_queue = NULL;
5166 + del_timer(&bfqd->idle_slice_timer);
5169 +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5170 + int requeue)
5172 + struct bfq_entity *entity = &bfqq->entity;
5174 + if (bfqq == bfqd->active_queue)
5175 + __bfq_bfqd_reset_active(bfqd);
5177 + bfq_deactivate_entity(entity, requeue);
5180 +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5182 + struct bfq_entity *entity = &bfqq->entity;
5184 + bfq_activate_entity(entity);
5188 + * Called when the bfqq no longer has requests pending, remove it from
5189 + * the service tree.
5190 + */
5191 +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5192 + int requeue)
5194 + BUG_ON(!bfq_bfqq_busy(bfqq));
5195 + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5197 + bfq_log_bfqq(bfqd, bfqq, "del from busy");
5199 + bfq_clear_bfqq_busy(bfqq);
5201 + BUG_ON(bfqd->busy_queues == 0);
5202 + bfqd->busy_queues--;
5204 + bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5208 + * Called when an inactive queue receives a new request.
5209 + */
5210 +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5212 + BUG_ON(bfq_bfqq_busy(bfqq));
5213 + BUG_ON(bfqq == bfqd->active_queue);
5215 + bfq_log_bfqq(bfqd, bfqq, "add to busy");
5217 + bfq_activate_bfqq(bfqd, bfqq);
5219 + bfq_mark_bfqq_busy(bfqq);
5220 + bfqd->busy_queues++;
5222 diff --git a/block/bfq.h b/block/bfq.h
5223 new file mode 100644
5224 index 0000000..2a23939
5225 --- /dev/null
5226 +++ b/block/bfq.h
5227 @@ -0,0 +1,585 @@
5229 + * BFQ-v3r1 for 3.1.0: data structures and common functions prototypes.
5231 + * Based on ideas and code from CFQ:
5232 + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
5234 + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
5235 + * Paolo Valente <paolo.valente@unimore.it>
5236 + */
5238 +#ifndef _BFQ_H
5239 +#define _BFQ_H
5241 +#include <linux/blktrace_api.h>
5242 +#include <linux/hrtimer.h>
5243 +#include <linux/ioprio.h>
5244 +#include <linux/rbtree.h>
5246 +#define BFQ_IOPRIO_CLASSES 3
5247 +#define BFQ_CL_IDLE_TIMEOUT HZ/5
5249 +#define BFQ_MIN_WEIGHT 1
5250 +#define BFQ_MAX_WEIGHT 1000
5252 +#define BFQ_DEFAULT_GRP_WEIGHT 10
5253 +#define BFQ_DEFAULT_GRP_IOPRIO 0
5254 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5256 +struct bfq_entity;
5258 +/**
5259 + * struct bfq_service_tree - per ioprio_class service tree.
5260 + * @active: tree for active entities (i.e., those backlogged).
5261 + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5262 + * @first_idle: idle entity with minimum F_i.
5263 + * @last_idle: idle entity with maximum F_i.
5264 + * @vtime: scheduler virtual time.
5265 + * @wsum: scheduler weight sum; active and idle entities contribute to it.
5267 + * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5268 + * ioprio_class has its own independent scheduler, and so its own
5269 + * bfq_service_tree. All the fields are protected by the queue lock
5270 + * of the containing bfqd.
5271 + */
5272 +struct bfq_service_tree {
5273 + struct rb_root active;
5274 + struct rb_root idle;
5276 + struct bfq_entity *first_idle;
5277 + struct bfq_entity *last_idle;
5279 + u64 vtime;
5280 + unsigned long wsum;
5283 +/**
5284 + * struct bfq_sched_data - multi-class scheduler.
5285 + * @active_entity: entity under service.
5286 + * @next_active: head-of-the-line entity in the scheduler.
5287 + * @service_tree: array of service trees, one per ioprio_class.
5289 + * bfq_sched_data is the basic scheduler queue. It supports three
5290 + * ioprio_classes, and can be used either as a toplevel queue or as
5291 + * an intermediate queue on a hierarchical setup.
5292 + * @next_active points to the active entity of the sched_data service
5293 + * trees that will be scheduled next.
5295 + * The supported ioprio_classes are the same as in CFQ, in descending
5296 + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5297 + * Requests from higher priority queues are served before all the
5298 + * requests from lower priority queues; among requests of the same
5299 + * queue requests are served according to B-WF2Q+.
5300 + * All the fields are protected by the queue lock of the containing bfqd.
5301 + */
5302 +struct bfq_sched_data {
5303 + struct bfq_entity *active_entity;
5304 + struct bfq_entity *next_active;
5305 + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5308 +/**
5309 + * struct bfq_entity - schedulable entity.
5310 + * @rb_node: service_tree member.
5311 + * @on_st: flag, true if the entity is on a tree (either the active or
5312 + * the idle one of its service_tree).
5313 + * @finish: B-WF2Q+ finish timestamp (aka F_i).
5314 + * @start: B-WF2Q+ start timestamp (aka S_i).
5315 + * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5316 + * @min_start: minimum start time of the (active) subtree rooted at
5317 + * this entity; used for O(log N) lookups into active trees.
5318 + * @service: service received during the last round of service.
5319 + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5320 + * @weight: weight of the queue
5321 + * @parent: parent entity, for hierarchical scheduling.
5322 + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5323 + * associated scheduler queue, %NULL on leaf nodes.
5324 + * @sched_data: the scheduler queue this entity belongs to.
5325 + * @ioprio: the ioprio in use.
5326 + * @new_weight: when a weight change is requested, the new weight value.
5327 + * @orig_weight: original weight, used to implement weight boosting
5328 + * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5329 + * @ioprio_class: the ioprio_class in use.
5330 + * @new_ioprio_class: when an ioprio_class change is requested, the new
5331 + * ioprio_class value.
5332 + * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5333 + * ioprio_class change.
5335 + * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5336 + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5337 + * entity belongs to the sched_data of the parent group in the cgroup
5338 + * hierarchy. Non-leaf entities have also their own sched_data, stored
5339 + * in @my_sched_data.
5341 + * Each entity stores independently its priority values; this would
5342 + * allow different weights on different devices, but this
5343 + * functionality is not exported to userspace by now. Priorities and
5344 + * weights are updated lazily, first storing the new values into the
5345 + * new_* fields, then setting the @ioprio_changed flag. As soon as
5346 + * there is a transition in the entity state that allows the priority
5347 + * update to take place the effective and the requested priority
5348 + * values are synchronized.
5350 + * Unless cgroups are used, the weight value is calculated from the
5351 + * ioprio to export the same interface as CFQ. When dealing with
5352 + * ``well-behaved'' queues (i.e., queues that do not spend too much
5353 + * time to consume their budget and have true sequential behavior, and
5354 + * when there are no external factors breaking anticipation) the
5355 + * relative weights at each level of the cgroups hierarchy should be
5356 + * guaranteed. All the fields are protected by the queue lock of the
5357 + * containing bfqd.
5358 + */
5359 +struct bfq_entity {
5360 + struct rb_node rb_node;
5362 + int on_st;
5364 + u64 finish;
5365 + u64 start;
5367 + struct rb_root *tree;
5369 + u64 min_start;
5371 + unsigned long service, budget;
5372 + unsigned short weight, new_weight;
5373 + unsigned short orig_weight;
5375 + struct bfq_entity *parent;
5377 + struct bfq_sched_data *my_sched_data;
5378 + struct bfq_sched_data *sched_data;
5380 + unsigned short ioprio, new_ioprio;
5381 + unsigned short ioprio_class, new_ioprio_class;
5383 + int ioprio_changed;
5386 +struct bfq_group;
5388 +/**
5389 + * struct bfq_queue - leaf schedulable entity.
5390 + * @ref: reference counter.
5391 + * @bfqd: parent bfq_data.
5392 + * @new_bfqq: shared bfq_queue if queue is cooperating with
5393 + * one or more other queues.
5394 + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5395 + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5396 + * @sort_list: sorted list of pending requests.
5397 + * @next_rq: if fifo isn't expired, next request to serve.
5398 + * @queued: nr of requests queued in @sort_list.
5399 + * @allocated: currently allocated requests.
5400 + * @meta_pending: pending metadata requests.
5401 + * @fifo: fifo list of requests in sort_list.
5402 + * @entity: entity representing this queue in the scheduler.
5403 + * @max_budget: maximum budget allowed from the feedback mechanism.
5404 + * @budget_timeout: budget expiration (in jiffies).
5405 + * @dispatched: number of requests on the dispatch list or inside driver.
5406 + * @org_ioprio: saved ioprio during boosted periods.
5407 + * @flags: status flags.
5408 + * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5409 + * @seek_samples: number of seeks sampled
5410 + * @seek_total: sum of the distances of the seeks sampled
5411 + * @seek_mean: mean seek distance
5412 + * @last_request_pos: position of the last request enqueued
5413 + * @pid: pid of the process owning the queue, used for logging purposes.
5414 + * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
5415 + * @raising_cur_max_time: current max raising time for this queue
5417 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
5418 + * or more (if it is an async one). @cgroup holds a reference to the
5419 + * cgroup, to be sure that it does not disappear while a bfqq still
5420 + * references it (mostly to avoid races between request issuing and task
5421 + * migration followed by cgroup destruction).
5422 + * All the fields are protected by the queue lock of the containing bfqd.
5423 + */
5424 +struct bfq_queue {
5425 + atomic_t ref;
5426 + struct bfq_data *bfqd;
5428 + /* fields for cooperating queues handling */
5429 + struct bfq_queue *new_bfqq;
5430 + struct rb_node pos_node;
5431 + struct rb_root *pos_root;
5433 + struct rb_root sort_list;
5434 + struct request *next_rq;
5435 + int queued[2];
5436 + int allocated[2];
5437 + int meta_pending;
5438 + struct list_head fifo;
5440 + struct bfq_entity entity;
5442 + unsigned long max_budget;
5443 + unsigned long budget_timeout;
5445 + int dispatched;
5447 + unsigned short org_ioprio;
5449 + unsigned int flags;
5451 + struct list_head bfqq_list;
5453 + unsigned int seek_samples;
5454 + u64 seek_total;
5455 + sector_t seek_mean;
5456 + sector_t last_request_pos;
5458 + pid_t pid;
5460 + /* weight-raising fields */
5461 + unsigned int raising_cur_max_time;
5462 + u64 last_rais_start_finish, soft_rt_next_start;
5463 + unsigned int raising_coeff;
5466 +/**
5467 + * struct bfq_data - per device data structure.
5468 + * @queue: request queue for the managed device.
5469 + * @root_group: root bfq_group for the device.
5470 + * @rq_pos_tree: rbtree sorted by next_request position,
5471 + * used when determining if two or more queues
5472 + * have interleaving requests (see bfq_close_cooperator).
5473 + * @busy_queues: number of bfq_queues containing requests (including the
5474 + * queue under service, even if it is idling).
5475 + * @queued: number of queued requests.
5476 + * @rq_in_driver: number of requests dispatched and waiting for completion.
5477 + * @sync_flight: number of sync requests in the driver.
5478 + * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5479 + *                   completed requests.
5480 + * @hw_tag_samples: nr of samples used to calculate hw_tag.
5481 + * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5482 + * @budgets_assigned: number of budgets assigned.
5483 + * @idle_slice_timer: timer set when idling for the next sequential request
5484 + * from the queue under service.
5485 + * @unplug_work: delayed work to restart dispatching on the request queue.
5486 + * @active_queue: bfq_queue under service.
5487 + * @active_cic: cfq_io_context (cic) associated with the @active_queue.
5488 + * @last_position: on-disk position of the last served request.
5489 + * @last_budget_start: beginning of the last budget.
5490 + * @last_idling_start: beginning of the last idle slice.
5491 + * @peak_rate: peak transfer rate observed for a budget.
5492 + * @peak_rate_samples: number of samples used to calculate @peak_rate.
5493 + * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5494 + * @cic_index: use small consecutive indexes as radix tree keys to reduce depth
5495 + * @cic_list: list of all the cics active on the bfq_data device.
5496 + * @group_list: list of all the bfq_groups active on the device.
5497 + * @active_list: list of all the bfq_queues active on the device.
5498 + * @idle_list: list of all the bfq_queues idle on the device.
5499 + * @bfq_quantum: max number of requests dispatched per dispatch round.
5500 + * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5501 + * requests are served in fifo order.
5502 + * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5503 + * @bfq_back_max: maximum allowed backward seek.
5504 + * @bfq_slice_idle: maximum idling time.
5505 + * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5506 + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5507 + * async queues.
5508 + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5509 + *               prevent seeky queues from imposing long latencies on well
5510 + * behaved ones (this also implies that seeky queues cannot
5511 + * receive guarantees in the service domain; after a timeout
5512 + * they are charged for the whole allocated budget, to try
5513 + * to preserve a behavior reasonably fair among them, but
5514 + * without service-domain guarantees).
5515 + * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5516 + * queue is multiplied
5517 + * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5518 + * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5519 + * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5520 + * may be reactivated for a queue (in jiffies)
5521 + * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5522 + *                               sectors per second
5523 + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5525 + * All the fields are protected by the @queue lock.
5526 + */
5527 +struct bfq_data {
5528 + struct request_queue *queue;
5530 + struct bfq_group *root_group;
5532 + struct rb_root rq_pos_tree;
5534 + int busy_queues;
5535 + int queued;
5536 + int rq_in_driver;
5537 + int sync_flight;
5539 + int max_rq_in_driver;
5540 + int hw_tag_samples;
5541 + int hw_tag;
5543 + int budgets_assigned;
5545 + struct timer_list idle_slice_timer;
5546 + struct work_struct unplug_work;
5548 + struct bfq_queue *active_queue;
5549 + struct cfq_io_context *active_cic;
5551 + sector_t last_position;
5553 + ktime_t last_budget_start;
5554 + ktime_t last_idling_start;
5555 + int peak_rate_samples;
5556 + u64 peak_rate;
5557 + unsigned long bfq_max_budget;
5559 + unsigned int cic_index;
5560 + struct list_head cic_list;
5561 + struct hlist_head group_list;
5562 + struct list_head active_list;
5563 + struct list_head idle_list;
5565 + unsigned int bfq_quantum;
5566 + unsigned int bfq_fifo_expire[2];
5567 + unsigned int bfq_back_penalty;
5568 + unsigned int bfq_back_max;
5569 + unsigned int bfq_slice_idle;
5570 + u64 bfq_class_idle_last_service;
5572 + unsigned int bfq_user_max_budget;
5573 + unsigned int bfq_max_budget_async_rq;
5574 + unsigned int bfq_timeout[2];
5576 + bool low_latency;
5578 + /* parameters of the low_latency heuristics */
5579 + unsigned int bfq_raising_coeff;
5580 + unsigned int bfq_raising_max_time;
5581 + unsigned int bfq_raising_rt_max_time;
5582 + unsigned int bfq_raising_min_idle_time;
5583 + unsigned int bfq_raising_max_softrt_rate;
5585 + struct bfq_queue oom_bfqq;
5588 +enum bfqq_state_flags {
5589 + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5590 + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5591 + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5592 + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5593 + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5594 + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5595 + BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5596 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5597 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5598 +	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
5599 + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
5602 +#define BFQ_BFQQ_FNS(name) \
5603 +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5604 +{ \
5605 + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5606 +} \
5607 +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5608 +{ \
5609 + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5610 +} \
5611 +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5612 +{ \
5613 + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5616 +BFQ_BFQQ_FNS(busy);
5617 +BFQ_BFQQ_FNS(wait_request);
5618 +BFQ_BFQQ_FNS(must_alloc);
5619 +BFQ_BFQQ_FNS(fifo_expire);
5620 +BFQ_BFQQ_FNS(idle_window);
5621 +BFQ_BFQQ_FNS(prio_changed);
5622 +BFQ_BFQQ_FNS(sync);
5623 +BFQ_BFQQ_FNS(budget_new);
5624 +BFQ_BFQQ_FNS(coop);
5625 +BFQ_BFQQ_FNS(split_coop);
5626 +BFQ_BFQQ_FNS(some_coop_idle);
5627 +#undef BFQ_BFQQ_FNS
5629 +/* Logging facilities. */
5630 +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5631 + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5633 +#define bfq_log(bfqd, fmt, args...) \
5634 + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5636 +/* Expiration reasons. */
5637 +enum bfqq_expiration {
5638 + BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5639 + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5640 + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5641 + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5644 +#ifdef CONFIG_CGROUP_BFQIO
5645 +/**
5646 + * struct bfq_group - per (device, cgroup) data structure.
5647 + * @entity: schedulable entity to insert into the parent group sched_data.
5648 + * @sched_data: own sched_data, to contain child entities (they may be
5649 + * both bfq_queues and bfq_groups).
5650 + * @group_node: node to be inserted into the bfqio_cgroup->group_data
5651 + * list of the containing cgroup's bfqio_cgroup.
5652 + * @bfqd_node: node to be inserted into the @bfqd->group_list list
5653 + * of the groups active on the same device; used for cleanup.
5654 + * @bfqd: the bfq_data for the device this group acts upon.
5655 + * @async_bfqq: array of async queues for all the tasks belonging to
5656 + * the group, one queue per ioprio value per ioprio_class,
5657 + * except for the idle class that has only one queue.
5658 + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5659 + * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5660 + * to avoid too many special cases during group creation/migration.
5662 + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5663 + * there is a set of bfq_groups, each one collecting the lower-level
5664 + * entities belonging to the group that are acting on the same device.
5666 + * Locking works as follows:
5667 + * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5668 + * via RCU from its readers.
5669 + * o @bfqd is protected by the queue lock, RCU is used to access it
5670 + * from the readers.
5671 + * o All the other fields are protected by the @bfqd queue lock.
5672 + */
5673 +struct bfq_group {
5674 + struct bfq_entity entity;
5675 + struct bfq_sched_data sched_data;
5677 + struct hlist_node group_node;
5678 + struct hlist_node bfqd_node;
5680 + void *bfqd;
5682 + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5683 + struct bfq_queue *async_idle_bfqq;
5685 + struct bfq_entity *my_entity;
5688 +/**
5689 + * struct bfqio_cgroup - bfq cgroup data structure.
5690 + * @css: subsystem state for bfq in the containing cgroup.
5691 + * @weight: cgroup weight.
5692 + * @ioprio: cgroup ioprio.
5693 + * @ioprio_class: cgroup ioprio_class.
5694 + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5695 + * @group_data: list containing the bfq_group belonging to this cgroup.
5697 + * @group_data is accessed using RCU, with @lock protecting the updates,
5698 + * @ioprio and @ioprio_class are protected by @lock.
5699 + */
5700 +struct bfqio_cgroup {
5701 + struct cgroup_subsys_state css;
5703 + unsigned short weight, ioprio, ioprio_class;
5705 + spinlock_t lock;
5706 + struct hlist_head group_data;
5708 +#else
5709 +struct bfq_group {
5710 + struct bfq_sched_data sched_data;
5712 + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5713 + struct bfq_queue *async_idle_bfqq;
5715 +#endif
5717 +static inline struct bfq_service_tree *
5718 +bfq_entity_service_tree(struct bfq_entity *entity)
5720 + struct bfq_sched_data *sched_data = entity->sched_data;
5721 + unsigned int idx = entity->ioprio_class - 1;
5723 + BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
5724 + BUG_ON(sched_data == NULL);
5726 + return sched_data->service_tree + idx;
5729 +static inline struct bfq_queue *cic_to_bfqq(struct cfq_io_context *cic,
5730 + int is_sync)
5732 + return cic->cfqq[!!is_sync];
5735 +static inline void cic_set_bfqq(struct cfq_io_context *cic,
5736 + struct bfq_queue *bfqq, int is_sync)
5738 + cic->cfqq[!!is_sync] = bfqq;
5741 +static inline void call_for_each_cic(struct io_context *ioc,
5742 + void (*func)(struct io_context *,
5743 + struct cfq_io_context *))
5745 + struct cfq_io_context *cic;
5746 + struct hlist_node *n;
5748 + rcu_read_lock();
5749 + hlist_for_each_entry_rcu(cic, n, &ioc->bfq_cic_list, cic_list)
5750 + func(ioc, cic);
5751 + rcu_read_unlock();
5754 +#define CIC_DEAD_KEY 1ul
5755 +#define CIC_DEAD_INDEX_SHIFT 1
5757 +static inline void *bfqd_dead_key(struct bfq_data *bfqd)
5759 + return (void *)(bfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
5762 +/**
5763 + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
5764 + * @ptr: a pointer to a bfqd.
5765 + * @flags: storage for the flags to be saved.
5767 + * This function allows cic->key and bfqg->bfqd to be protected by the
5768 + * queue lock of the bfqd they reference; the pointer is dereferenced
5769 + * under RCU, so the storage for bfqd is assured to be safe as long
5770 + * as the RCU read side critical section does not end. After the
5771 + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
5772 + * sure that no other writer accessed it. If we raced with a writer,
5773 + * the function returns NULL, with the queue unlocked, otherwise it
5774 + * returns the dereferenced pointer, with the queue locked.
5775 + */
5776 +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
5777 + unsigned long *flags)
5779 + struct bfq_data *bfqd;
5781 + rcu_read_lock();
5782 + bfqd = rcu_dereference(*(struct bfq_data **)ptr);
5784 + if (bfqd != NULL && !((unsigned long) bfqd & CIC_DEAD_KEY)) {
5785 + spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
5786 + if (*ptr == bfqd)
5787 + goto out;
5788 + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5791 + bfqd = NULL;
5792 +out:
5793 + rcu_read_unlock();
5794 + return bfqd;
5797 +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
5798 + unsigned long *flags)
5800 + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5803 +static void bfq_changed_ioprio(struct io_context *ioc,
5804 + struct cfq_io_context *cic);
5805 +static void bfq_put_queue(struct bfq_queue *bfqq);
5806 +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
5807 +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
5808 + struct bfq_group *bfqg, int is_sync,
5809 + struct io_context *ioc, gfp_t gfp_mask);
5810 +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
5811 +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
5812 +#endif
5814 1.7.2.5