[zen-stable.git] / block / bfq-iosched.c
1 /*
2 * BFQ, or Budget Fair Queueing, disk scheduler.
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
10 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
12 * BFQ is a proportional share disk scheduling algorithm based on the
13 * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
14 * measured in number of sectors, to tasks instead of time slices.
15 * The disk is not granted to the active task for a given time slice,
16 * but until it has exhausted its assigned budget. This change from
17 * the time to the service domain allows BFQ to distribute the disk
18 * bandwidth among tasks as desired, without any distortion due to
19 * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
20 * internal scheduler, called B-WF2Q+, to schedule tasks according to
21 * their budgets. Thanks to this accurate scheduler, BFQ can afford
22 * to assign high budgets to disk-bound non-seeky tasks (to boost the
23 * throughput), and yet guarantee low latencies to interactive and
24 * soft real-time applications.
26 * BFQ has been introduced in [1], where the interested reader can
27 * find an accurate description of the algorithm, the bandwidth
28 * distribution and latency guarantees it provides, plus formal proofs
29 * of all the properties. With respect to the algorithm presented in
30 * the paper, this implementation adds several little heuristics, and
31 * a hierarchical extension, based on H-WF2Q+.
33 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
34 * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
35 * complexity derives from the one introduced with EEVDF in [3].
37 * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
38 * with Deterministic Guarantees on Bandwidth Distribution,''
39 * IEEE Transactions on Computers, May 2010.
41 * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
43 * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
44 * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
45 * Oct 1997.
47 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
49 * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
50 * First: A Flexible and Accurate Mechanism for Proportional Share
51 * Resource Allocation,'' technical report.
53 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
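/*
 * Illustrative sketch, not part of the original BFQ code: a minimal
 * model of the service-domain idea described above. A queue keeps the
 * device until the sectors served to it exhaust its assigned budget,
 * instead of until a time slice expires. All names below are
 * hypothetical.
 */
struct toy_queue {
	unsigned long budget;	/* assigned budget, in sectors */
	unsigned long service;	/* sectors served in the current round */
};

/* Charge @sectors of service; return 1 if the budget is now exhausted. */
static int toy_charge_service(struct toy_queue *q, unsigned long sectors)
{
	q->service += sectors;
	return q->service >= q->budget;	/* 1 -> expire and reschedule */
}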
55 #include <linux/module.h>
56 #include <linux/slab.h>
57 #include <linux/blkdev.h>
58 #include <linux/cgroup.h>
59 #include <linux/elevator.h>
60 #include <linux/jiffies.h>
61 #include <linux/rbtree.h>
62 #include <linux/ioprio.h>
63 #include "bfq.h"
64 #include "blk.h"
66 /* Max number of dispatches in one round of service. */
67 static const int bfq_quantum = 4;
69 /* Expiration time of sync (0) and async (1) requests, in jiffies. */
70 static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
72 /* Maximum backwards seek, in KiB. */
73 static const int bfq_back_max = 16 * 1024;
75 /* Penalty of a backwards seek, in number of sectors. */
76 static const int bfq_back_penalty = 2;
78 /* Idling period duration, in jiffies. */
79 static int bfq_slice_idle = HZ / 125;
81 /* Default maximum budget values, in sectors and number of requests. */
82 static const int bfq_default_max_budget = 16 * 1024;
83 static const int bfq_max_budget_async_rq = 4;
86 * Async to sync throughput distribution is controlled as follows:
87 * when an async request is served, the entity is charged the number
88 * of sectors of the request, multiplied by the factor below.
90 static const int bfq_async_charge_factor = 10;
92 /* Default timeout values, in jiffies, approximating CFQ defaults. */
93 static const int bfq_timeout_sync = HZ / 8;
94 static int bfq_timeout_async = HZ / 25;
96 struct kmem_cache *bfq_pool;
98 /* Below this threshold (in ms), we consider thinktime immediate. */
99 #define BFQ_MIN_TT 2
101 /* hw_tag detection: parallel requests threshold and min samples needed. */
102 #define BFQ_HW_QUEUE_THRESHOLD 4
103 #define BFQ_HW_QUEUE_SAMPLES 32
105 #define BFQQ_SEEK_THR (sector_t)(8 * 1024)
106 #define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
108 /* Min samples used for peak rate estimation (for autotuning). */
109 #define BFQ_PEAK_RATE_SAMPLES 32
111 /* Shift used for peak rate fixed precision calculations. */
112 #define BFQ_RATE_SHIFT 16
114 #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
115 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
117 #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
118 #define RQ_BFQQ(rq) ((rq)->elv.priv[1])
120 #include "bfq-ioc.c"
121 #include "bfq-sched.c"
122 #include "bfq-cgroup.c"
124 #define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
125 IOPRIO_CLASS_IDLE)
126 #define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
127 IOPRIO_CLASS_RT)
129 #define bfq_sample_valid(samples) ((samples) > 80)
132 * We regard a request as SYNC if it is either a read or has the SYNC bit
133 * set (in which case it could also be a direct WRITE).
135 static inline int bfq_bio_sync(struct bio *bio)
137 if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
138 return 1;
140 return 0;
144 * Scheduler run of queue, if there are requests pending and no one in the
145 * driver that will restart queueing.
147 static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
149 if (bfqd->queued != 0) {
150 bfq_log(bfqd, "schedule dispatch");
151 kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
156 * Lifted from AS - choose which of rq1 and rq2 is best served now.
157 * We choose the request that is closest to the head right now. Distance
158 * behind the head is penalized and only allowed to a certain extent.
160 static struct request *bfq_choose_req(struct bfq_data *bfqd,
161 struct request *rq1,
162 struct request *rq2,
163 sector_t last)
165 sector_t s1, s2, d1 = 0, d2 = 0;
166 unsigned long back_max;
167 #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
168 #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
169 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
171 if (rq1 == NULL || rq1 == rq2)
172 return rq2;
173 if (rq2 == NULL)
174 return rq1;
176 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
177 return rq1;
178 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
179 return rq2;
180 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
181 return rq1;
182 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
183 return rq2;
185 s1 = blk_rq_pos(rq1);
186 s2 = blk_rq_pos(rq2);
189 * By definition, 1KiB is 2 sectors.
191 back_max = bfqd->bfq_back_max * 2;
194 * Strict one way elevator _except_ in the case where we allow
195 * short backward seeks which are biased as twice the cost of a
196 * similar forward seek.
198 if (s1 >= last)
199 d1 = s1 - last;
200 else if (s1 + back_max >= last)
201 d1 = (last - s1) * bfqd->bfq_back_penalty;
202 else
203 wrap |= BFQ_RQ1_WRAP;
205 if (s2 >= last)
206 d2 = s2 - last;
207 else if (s2 + back_max >= last)
208 d2 = (last - s2) * bfqd->bfq_back_penalty;
209 else
210 wrap |= BFQ_RQ2_WRAP;
212 /* Found required data */
215 * By doing switch() on the bit mask "wrap" we avoid having to
216 * check two variables for all permutations: --> faster!
218 switch (wrap) {
219 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
220 if (d1 < d2)
221 return rq1;
222 else if (d2 < d1)
223 return rq2;
224 else {
225 if (s1 >= s2)
226 return rq1;
227 else
228 return rq2;
231 case BFQ_RQ2_WRAP:
232 return rq1;
233 case BFQ_RQ1_WRAP:
234 return rq2;
235 case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
236 default:
238 * Since both rqs are wrapped,
239 * start with the one that's further behind head
240 * (--> only *one* back seek required),
241 * since back seek takes more time than forward.
243 if (s1 <= s2)
244 return rq1;
245 else
246 return rq2;
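/*
 * Illustrative sketch, not part of the original BFQ code: the distance
 * metric used by bfq_choose_req() above, in isolation. Forward seeks
 * cost their plain distance, short backward seeks (within back_max
 * sectors) are charged distance * penalty, and anything farther behind
 * the head "wraps". All names are hypothetical.
 */
static int toy_seek_distance(unsigned long long pos, unsigned long long head,
			     unsigned long back_max, unsigned int back_penalty,
			     unsigned long long *dist)
{
	if (pos >= head) {			/* forward seek */
		*dist = pos - head;
		return 0;
	}
	if (pos + back_max >= head) {		/* short backward seek */
		*dist = (head - pos) * back_penalty;
		return 0;
	}
	return 1;				/* wraps behind the head */
}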
250 static struct bfq_queue *
251 bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
252 sector_t sector, struct rb_node **ret_parent,
253 struct rb_node ***rb_link)
255 struct rb_node **p, *parent;
256 struct bfq_queue *bfqq = NULL;
258 parent = NULL;
259 p = &root->rb_node;
260 while (*p) {
261 struct rb_node **n;
263 parent = *p;
264 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
267 * Sort strictly based on sector. Smallest to the left,
268 * largest to the right.
270 if (sector > blk_rq_pos(bfqq->next_rq))
271 n = &(*p)->rb_right;
272 else if (sector < blk_rq_pos(bfqq->next_rq))
273 n = &(*p)->rb_left;
274 else
275 break;
276 p = n;
277 bfqq = NULL;
280 *ret_parent = parent;
281 if (rb_link)
282 *rb_link = p;
284 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
285 (long long unsigned)sector,
286 bfqq != NULL ? bfqq->pid : 0);
288 return bfqq;
291 static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
293 struct rb_node **p, *parent;
294 struct bfq_queue *__bfqq;
296 if (bfqq->pos_root != NULL) {
297 rb_erase(&bfqq->pos_node, bfqq->pos_root);
298 bfqq->pos_root = NULL;
301 if (bfq_class_idle(bfqq))
302 return;
303 if (!bfqq->next_rq)
304 return;
306 bfqq->pos_root = &bfqd->rq_pos_tree;
307 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
308 blk_rq_pos(bfqq->next_rq), &parent, &p);
309 if (__bfqq == NULL) {
310 rb_link_node(&bfqq->pos_node, parent, p);
311 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
312 } else
313 bfqq->pos_root = NULL;
316 static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
317 struct bfq_queue *bfqq,
318 struct request *last)
320 struct rb_node *rbnext = rb_next(&last->rb_node);
321 struct rb_node *rbprev = rb_prev(&last->rb_node);
322 struct request *next = NULL, *prev = NULL;
324 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
326 if (rbprev != NULL)
327 prev = rb_entry_rq(rbprev);
329 if (rbnext != NULL)
330 next = rb_entry_rq(rbnext);
331 else {
332 rbnext = rb_first(&bfqq->sort_list);
333 if (rbnext && rbnext != &last->rb_node)
334 next = rb_entry_rq(rbnext);
337 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
340 static void bfq_del_rq_rb(struct request *rq)
342 struct bfq_queue *bfqq = RQ_BFQQ(rq);
343 struct bfq_data *bfqd = bfqq->bfqd;
344 const int sync = rq_is_sync(rq);
346 BUG_ON(bfqq->queued[sync] == 0);
347 bfqq->queued[sync]--;
348 bfqd->queued--;
350 elv_rb_del(&bfqq->sort_list, rq);
352 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
353 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
354 bfq_del_bfqq_busy(bfqd, bfqq, 1);
356 * Remove queue from request-position tree as it is empty.
358 if (bfqq->pos_root != NULL) {
359 rb_erase(&bfqq->pos_node, bfqq->pos_root);
360 bfqq->pos_root = NULL;
365 /* see the definition of bfq_async_charge_factor for details */
366 static inline unsigned long bfq_serv_to_charge(struct request *rq,
367 struct bfq_queue *bfqq)
369 return blk_rq_sectors(rq) *
370 (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
371 bfq_async_charge_factor));
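/*
 * Illustrative sketch, not part of the original BFQ code: the charging
 * rule above with a worked example. A 64-sector sync request (or any
 * request of a weight-raised queue) is charged 64 sectors, while a
 * 64-sector async request of a non-raised queue is charged
 * 64 * (1 + 10) = 704 sectors with the default bfq_async_charge_factor,
 * which is how async throughput is kept down in favor of sync I/O.
 * Names are hypothetical.
 */
static unsigned long toy_serv_to_charge(unsigned long sectors, int sync,
					int raising_coeff, int charge_factor)
{
	if (sync || raising_coeff > 1)
		return sectors;
	return sectors * (1 + charge_factor);
}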
375 * bfq_updated_next_req - update the queue after a new next_rq selection.
376 * @bfqd: the device data the queue belongs to.
377 * @bfqq: the queue to update.
379 * If the first request of a queue changes we make sure that the queue
380 * has enough budget to serve at least its first request (if the
381 * request has grown). We do this because if the queue does not have enough
382 * budget for its first request, it has to go through two dispatch
383 * rounds to actually get it dispatched.
385 static void bfq_updated_next_req(struct bfq_data *bfqd,
386 struct bfq_queue *bfqq)
388 struct bfq_entity *entity = &bfqq->entity;
389 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
390 struct request *next_rq = bfqq->next_rq;
391 unsigned long new_budget;
393 if (next_rq == NULL)
394 return;
396 if (bfqq == bfqd->active_queue)
398 * In order not to break guarantees, budgets cannot be
399 * changed after an entity has been selected.
401 return;
403 BUG_ON(entity->tree != &st->active);
404 BUG_ON(entity == entity->sched_data->active_entity);
406 new_budget = max_t(unsigned long, bfqq->max_budget,
407 bfq_serv_to_charge(next_rq, bfqq));
408 entity->budget = new_budget;
409 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
410 bfq_activate_bfqq(bfqd, bfqq);
413 static void bfq_add_rq_rb(struct request *rq)
415 struct bfq_queue *bfqq = RQ_BFQQ(rq);
416 struct bfq_entity *entity = &bfqq->entity;
417 struct bfq_data *bfqd = bfqq->bfqd;
418 struct request *next_rq, *prev;
419 unsigned long old_raising_coeff = bfqq->raising_coeff;
420 int idle_for_long_time = bfqq->budget_timeout +
421 bfqd->bfq_raising_min_idle_time < jiffies;
423 bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
424 bfqq->queued[rq_is_sync(rq)]++;
425 bfqd->queued++;
427 elv_rb_add(&bfqq->sort_list, rq);
430 * Check if this request is a better next-serve candidate.
432 prev = bfqq->next_rq;
433 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
434 BUG_ON(next_rq == NULL);
435 bfqq->next_rq = next_rq;
438 * Adjust priority tree position, if next_rq changes.
440 if (prev != bfqq->next_rq)
441 bfq_rq_pos_tree_add(bfqd, bfqq);
443 if (!bfq_bfqq_busy(bfqq)) {
444 int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
445 bfqq->soft_rt_next_start < jiffies;
446 entity->budget = max_t(unsigned long, bfqq->max_budget,
447 bfq_serv_to_charge(next_rq, bfqq));
449 if (!bfqd->low_latency)
450 goto add_bfqq_busy;
453 * If the queue is not being boosted and has been idle
454 * for enough time, start a weight-raising period
456 if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
457 bfqq->raising_coeff = bfqd->bfq_raising_coeff;
458 bfqq->raising_cur_max_time = idle_for_long_time ?
459 bfqd->bfq_raising_max_time :
460 bfqd->bfq_raising_rt_max_time;
461 bfq_log_bfqq(bfqd, bfqq,
462 "wrais starting at %llu msec,"
463 "rais_max_time %u",
464 bfqq->last_rais_start_finish,
465 jiffies_to_msecs(bfqq->
466 raising_cur_max_time));
467 } else if (old_raising_coeff > 1) {
468 if (idle_for_long_time)
469 bfqq->raising_cur_max_time =
470 bfqd->bfq_raising_max_time;
471 else if (bfqq->raising_cur_max_time ==
472 bfqd->bfq_raising_rt_max_time &&
473 !soft_rt) {
474 bfqq->raising_coeff = 1;
475 bfq_log_bfqq(bfqd, bfqq,
476 "wrais ending at %llu msec,"
477 "rais_max_time %u",
478 bfqq->last_rais_start_finish,
479 jiffies_to_msecs(bfqq->
480 raising_cur_max_time));
483 if (old_raising_coeff != bfqq->raising_coeff)
484 entity->ioprio_changed = 1;
485 add_bfqq_busy:
486 bfq_add_bfqq_busy(bfqd, bfqq);
487 } else {
488 if (bfqd->low_latency && old_raising_coeff == 1 &&
489 !rq_is_sync(rq) &&
490 bfqq->last_rais_start_finish +
491 bfqd->bfq_raising_min_inter_arr_async < jiffies) {
492 bfqq->raising_coeff = bfqd->bfq_raising_coeff;
493 bfqq->raising_cur_max_time = bfqd->bfq_raising_max_time;
495 entity->ioprio_changed = 1;
496 bfq_log_bfqq(bfqd, bfqq,
497 "non-idle wrais starting at %llu msec,"
498 "rais_max_time %u",
499 bfqq->last_rais_start_finish,
500 jiffies_to_msecs(bfqq->
501 raising_cur_max_time));
503 bfq_updated_next_req(bfqd, bfqq);
506 if (bfqd->low_latency &&
507 (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
508 idle_for_long_time))
509 bfqq->last_rais_start_finish = jiffies;
512 static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
514 elv_rb_del(&bfqq->sort_list, rq);
515 bfqq->queued[rq_is_sync(rq)]--;
516 bfqq->bfqd->queued--;
517 bfq_add_rq_rb(rq);
520 static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
521 struct bio *bio)
523 struct task_struct *tsk = current;
524 struct bfq_io_cq *bic;
525 struct bfq_queue *bfqq;
527 bic = bfq_bic_lookup(bfqd, tsk->io_context);
528 if (bic == NULL)
529 return NULL;
531 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
532 if (bfqq != NULL) {
533 sector_t sector = bio->bi_sector + bio_sectors(bio);
535 return elv_rb_find(&bfqq->sort_list, sector);
538 return NULL;
541 static void bfq_activate_request(struct request_queue *q, struct request *rq)
543 struct bfq_data *bfqd = q->elevator->elevator_data;
545 bfqd->rq_in_driver++;
546 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
547 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
548 (long long unsigned)bfqd->last_position);
551 static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
553 struct bfq_data *bfqd = q->elevator->elevator_data;
555 WARN_ON(bfqd->rq_in_driver == 0);
556 bfqd->rq_in_driver--;
559 static void bfq_remove_request(struct request *rq)
561 struct bfq_queue *bfqq = RQ_BFQQ(rq);
562 struct bfq_data *bfqd = bfqq->bfqd;
564 if (bfqq->next_rq == rq) {
565 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
566 bfq_updated_next_req(bfqd, bfqq);
569 list_del_init(&rq->queuelist);
570 bfq_del_rq_rb(rq);
572 if (rq->cmd_flags & REQ_META) {
573 WARN_ON(bfqq->meta_pending == 0);
574 bfqq->meta_pending--;
578 static int bfq_merge(struct request_queue *q, struct request **req,
579 struct bio *bio)
581 struct bfq_data *bfqd = q->elevator->elevator_data;
582 struct request *__rq;
584 __rq = bfq_find_rq_fmerge(bfqd, bio);
585 if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
586 *req = __rq;
587 return ELEVATOR_FRONT_MERGE;
590 return ELEVATOR_NO_MERGE;
593 static void bfq_merged_request(struct request_queue *q, struct request *req,
594 int type)
596 if (type == ELEVATOR_FRONT_MERGE) {
597 struct bfq_queue *bfqq = RQ_BFQQ(req);
599 bfq_reposition_rq_rb(bfqq, req);
603 static void bfq_merged_requests(struct request_queue *q, struct request *rq,
604 struct request *next)
606 struct bfq_queue *bfqq = RQ_BFQQ(rq);
609 * Reposition in fifo if next is older than rq.
611 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
612 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
613 list_move(&rq->queuelist, &next->queuelist);
614 rq_set_fifo_time(rq, rq_fifo_time(next));
617 if (bfqq->next_rq == next)
618 bfqq->next_rq = rq;
620 bfq_remove_request(next);
623 static int bfq_allow_merge(struct request_queue *q, struct request *rq,
624 struct bio *bio)
626 struct bfq_data *bfqd = q->elevator->elevator_data;
627 struct bfq_io_cq *bic;
628 struct bfq_queue *bfqq;
631 * Disallow merge of a sync bio into an async request.
633 if (bfq_bio_sync(bio) && !rq_is_sync(rq))
634 return 0;
637 * Lookup the bfqq that this bio will be queued with. Allow
638 * merge only if rq is queued there.
639 * Queue lock is held here.
641 bic = bfq_bic_lookup(bfqd, current->io_context);
642 if (bic == NULL)
643 return 0;
645 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
646 return bfqq == RQ_BFQQ(rq);
649 static void __bfq_set_active_queue(struct bfq_data *bfqd,
650 struct bfq_queue *bfqq)
652 if (bfqq != NULL) {
653 bfq_mark_bfqq_must_alloc(bfqq);
654 bfq_mark_bfqq_budget_new(bfqq);
655 bfq_clear_bfqq_fifo_expire(bfqq);
657 bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
659 bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
660 bfqq->entity.budget);
663 bfqd->active_queue = bfqq;
667 * Get and set a new active queue for service.
669 static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
670 struct bfq_queue *bfqq)
672 if (!bfqq)
673 bfqq = bfq_get_next_queue(bfqd);
674 else
675 bfq_get_next_queue_forced(bfqd, bfqq);
677 __bfq_set_active_queue(bfqd, bfqq);
678 return bfqq;
681 static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
682 struct request *rq)
684 if (blk_rq_pos(rq) >= bfqd->last_position)
685 return blk_rq_pos(rq) - bfqd->last_position;
686 else
687 return bfqd->last_position - blk_rq_pos(rq);
691 * Return true if bfqq has no request pending and rq is close enough to
692 * bfqd->last_position, or if rq is closer to bfqd->last_position than
693 * bfqq->next_rq
695 static inline int bfq_rq_close(struct bfq_data *bfqd, struct bfq_queue *bfqq,
696 struct request *rq)
698 sector_t sdist = bfqq->seek_mean;
700 if (!bfq_sample_valid(bfqq->seek_samples))
701 sdist = BFQQ_SEEK_THR;
703 /* If seek_mean is large, using it as a closeness criterion is meaningless */
704 if (sdist > BFQQ_SEEK_THR)
705 sdist = BFQQ_SEEK_THR;
707 return bfq_dist_from_last(bfqd, rq) <= sdist;
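/*
 * Illustrative sketch, not part of the original BFQ code: the closeness
 * test above boils down to comparing the distance from the last
 * completed position against the queue's mean seek distance, capped at
 * BFQQ_SEEK_THR (the same cap is used when too few seek samples are
 * available). Names are hypothetical.
 */
static int toy_rq_close(unsigned long long rq_pos, unsigned long long last_pos,
			unsigned long long seek_mean, unsigned long long seek_thr)
{
	unsigned long long dist = rq_pos >= last_pos ? rq_pos - last_pos :
						       last_pos - rq_pos;
	unsigned long long sdist = seek_mean > seek_thr ? seek_thr : seek_mean;

	return dist <= sdist;
}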
710 static struct bfq_queue *bfqq_close(struct bfq_data *bfqd,
711 struct bfq_queue *cur_bfqq)
713 struct rb_root *root = &bfqd->rq_pos_tree;
714 struct rb_node *parent, *node;
715 struct bfq_queue *__bfqq;
716 sector_t sector = bfqd->last_position;
718 if (RB_EMPTY_ROOT(root))
719 return NULL;
722 * First, if we find a request starting at the end of the last
723 * request, choose it.
725 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
726 if (__bfqq != NULL)
727 return __bfqq;
730 * If the exact sector wasn't found, the parent of the NULL leaf
731 * will contain the closest sector (rq_pos_tree sorted by next_request
732 * position).
734 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
735 if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
736 return __bfqq;
738 if (blk_rq_pos(__bfqq->next_rq) < sector)
739 node = rb_next(&__bfqq->pos_node);
740 else
741 node = rb_prev(&__bfqq->pos_node);
742 if (node == NULL)
743 return NULL;
745 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
746 if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
747 return __bfqq;
749 return NULL;
753 * bfqd - obvious
754 * cur_bfqq - passed in so that we don't decide that the current queue
755 * is closely cooperating with itself.
757 * We are assuming that cur_bfqq has dispatched at least one request,
758 * and that bfqd->last_position reflects a position on the disk associated
759 * with the I/O issued by cur_bfqq.
761 static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
762 struct bfq_queue *cur_bfqq)
764 struct bfq_queue *bfqq;
766 if (bfq_class_idle(cur_bfqq))
767 return NULL;
768 if (!bfq_bfqq_sync(cur_bfqq))
769 return NULL;
770 if (BFQQ_SEEKY(cur_bfqq))
771 return NULL;
773 /* If device has only one backlogged bfq_queue, don't search. */
774 if (bfqd->busy_queues == 1)
775 return NULL;
778 * We should notice if some of the queues are cooperating, e.g.
779 * working closely on the same area of the disk. In that case,
780 * we can group them together so that we don't waste time idling.
782 bfqq = bfqq_close(bfqd, cur_bfqq);
783 if (bfqq == NULL || bfqq == cur_bfqq)
784 return NULL;
787 * Do not merge queues from different bfq_groups.
789 if (bfqq->entity.parent != cur_bfqq->entity.parent)
790 return NULL;
793 * It only makes sense to merge sync queues.
795 if (!bfq_bfqq_sync(bfqq))
796 return NULL;
797 if (BFQQ_SEEKY(bfqq))
798 return NULL;
801 * Do not merge queues of different priority classes.
803 if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
804 return NULL;
806 return bfqq;
810 * If enough samples have been computed, return the current max budget
811 * stored in bfqd, which is dynamically updated according to the
812 * estimated disk peak rate; otherwise return the default max budget
814 static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
816 return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
817 bfqd->bfq_max_budget;
821 * Return min budget, which is a fraction of the current or default
822 * max budget (trying with 1/32)
824 static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
826 return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
827 bfqd->bfq_max_budget / 32;
830 static void bfq_arm_slice_timer(struct bfq_data *bfqd)
832 struct bfq_queue *bfqq = bfqd->active_queue;
833 struct bfq_io_cq *bic;
834 unsigned long sl;
836 WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
838 /* Idling is disabled, either manually or by past process history. */
839 if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
840 return;
842 /* Tasks have exited, don't wait. */
843 bic = bfqd->active_bic;
844 if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
845 return;
847 bfq_mark_bfqq_wait_request(bfqq);
850 * We don't want to idle for seeks, but we do want to allow
851 * fair distribution of slice time for a process doing back-to-back
852 * seeks. So allow a little bit of time for it to submit a new request.
854 * To prevent processes with (partly) seeky workloads from
855 * being too ill-treated, grant them a small fraction of the
856 * assigned budget before reducing the waiting time to
857 * BFQ_MIN_TT. In practice this helps reduce latency.
859 sl = bfqd->bfq_slice_idle;
860 if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
861 bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
862 bfqq->raising_coeff == 1)
863 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
864 else if (bfqq->raising_coeff > 1)
865 sl = sl * 3;
866 bfqd->last_idling_start = ktime_get();
867 mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
868 bfq_log(bfqd, "arm idle: %u/%u ms",
869 jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
873 * Set the maximum time for the active queue to consume its
874 * budget. This prevents seeky processes from lowering the disk
875 * throughput (always guaranteed with a time slice scheme as in CFQ).
877 static void bfq_set_budget_timeout(struct bfq_data *bfqd)
879 struct bfq_queue *bfqq = bfqd->active_queue;
880 unsigned int timeout_coeff =
881 bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time ?
882 1 : (bfqq->entity.weight / bfqq->entity.orig_weight);
884 bfqd->last_budget_start = ktime_get();
886 bfq_clear_bfqq_budget_new(bfqq);
887 bfqq->budget_timeout = jiffies +
888 bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
890 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
891 jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
892 timeout_coeff));
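/*
 * Illustrative sketch, not part of the original BFQ code: how the
 * budget timeout above scales. A weight-raised queue (weight ==
 * orig_weight * raising_coeff) gets a proportionally longer budget
 * timeout, except during the short soft real-time raising period,
 * where the plain timeout is used. Names are hypothetical.
 */
static unsigned long toy_budget_timeout(unsigned long now,
					unsigned long base_timeout,
					int weight, int orig_weight,
					int in_soft_rt_period)
{
	unsigned int coeff = in_soft_rt_period ? 1 : weight / orig_weight;

	return now + base_timeout * coeff;
}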
896 * Move request from internal lists to the request queue dispatch list.
898 static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
900 struct bfq_data *bfqd = q->elevator->elevator_data;
901 struct bfq_queue *bfqq = RQ_BFQQ(rq);
903 bfq_remove_request(rq);
904 bfqq->dispatched++;
905 elv_dispatch_sort(q, rq);
907 if (bfq_bfqq_sync(bfqq))
908 bfqd->sync_flight++;
912 * Return expired entry, or NULL to just start from scratch in rbtree.
914 static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
916 struct request *rq = NULL;
918 if (bfq_bfqq_fifo_expire(bfqq))
919 return NULL;
921 bfq_mark_bfqq_fifo_expire(bfqq);
923 if (list_empty(&bfqq->fifo))
924 return NULL;
926 rq = rq_entry_fifo(bfqq->fifo.next);
928 if (time_before(jiffies, rq_fifo_time(rq)))
929 return NULL;
931 return rq;
935 * Must be called with the queue_lock held.
937 static int bfqq_process_refs(struct bfq_queue *bfqq)
939 int process_refs, io_refs;
941 io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
942 process_refs = atomic_read(&bfqq->ref) - io_refs;
943 BUG_ON(process_refs < 0);
944 return process_refs;
947 static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
949 int process_refs, new_process_refs;
950 struct bfq_queue *__bfqq;
953 * If there are no process references on the new_bfqq, then it is
954 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
955 * may have dropped their last reference (not just their last process
956 * reference).
958 if (!bfqq_process_refs(new_bfqq))
959 return;
961 /* Avoid a circular list and skip interim queue merges. */
962 while ((__bfqq = new_bfqq->new_bfqq)) {
963 if (__bfqq == bfqq)
964 return;
965 new_bfqq = __bfqq;
968 process_refs = bfqq_process_refs(bfqq);
969 new_process_refs = bfqq_process_refs(new_bfqq);
971 * If the process for the bfqq has gone away, there is no
972 * sense in merging the queues.
974 if (process_refs == 0 || new_process_refs == 0)
975 return;
978 * Merge in the direction of the lesser amount of work.
980 if (new_process_refs >= process_refs) {
981 bfqq->new_bfqq = new_bfqq;
982 atomic_add(process_refs, &new_bfqq->ref);
983 } else {
984 new_bfqq->new_bfqq = bfqq;
985 atomic_add(new_process_refs, &bfqq->ref);
987 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
988 new_bfqq->pid);
991 static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
993 struct bfq_entity *entity = &bfqq->entity;
994 return entity->budget - entity->service;
997 static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
999 BUG_ON(bfqq != bfqd->active_queue);
1001 __bfq_bfqd_reset_active(bfqd);
1003 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1004 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1006 * overloading the budget_timeout field to store the time
1007 * at which the queue was left with no backlog; used by
1008 * the weight-raising mechanism
1010 bfqq->budget_timeout = jiffies;
1011 } else {
1012 bfq_activate_bfqq(bfqd, bfqq);
1014 * Resort priority tree of potential close cooperators.
1016 bfq_rq_pos_tree_add(bfqd, bfqq);
1020 * If this bfqq is shared between multiple processes, check
1021 * to make sure that those processes are still issuing I/Os
1022 * within the mean seek distance. If not, it may be time to
1023 * break the queues apart again.
1025 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
1026 bfq_mark_bfqq_split_coop(bfqq);
1030 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
1031 * @bfqd: device data.
1032 * @bfqq: queue to update.
1033 * @reason: reason for expiration.
1035 * Handle the feedback on @bfqq budget. See the body for detailed
1036 * comments.
1038 static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
1039 struct bfq_queue *bfqq,
1040 enum bfqq_expiration reason)
1042 struct request *next_rq;
1043 unsigned long budget, min_budget;
1045 budget = bfqq->max_budget;
1046 min_budget = bfq_min_budget(bfqd);
1048 BUG_ON(bfqq != bfqd->active_queue);
1050 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
1051 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
1052 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
1053 budget, bfq_min_budget(bfqd));
1054 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
1055 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
1057 if (bfq_bfqq_sync(bfqq)) {
1058 switch (reason) {
1060 * Caveat: in all the following cases we trade latency
1061 * for throughput.
1063 case BFQ_BFQQ_TOO_IDLE:
1065 * This is the only case where we may reduce
1066 * the budget: if there are no requests of the
1067 * process still waiting for completion, then
1068 * we assume (tentatively) that the timer has
1069 * expired because the batch of requests of
1070 * the process could have been served with a
1071 * smaller budget. Hence, betting that
1072 * process will behave in the same way when it
1073 * becomes backlogged again, we reduce its
1074 * next budget. As long as we guess right,
1075 * this budget cut reduces the latency
1076 * experienced by the process.
1078 * However, if there are still outstanding
1079 * requests, then the process may have not yet
1080 * issued its next request just because it is
1081 * still waiting for the completion of some of
1082 * the still outstanding ones. So in this
1083 * subcase we do not reduce its budget, on the
1084 * contrary we increase it to possibly boost
1085 * the throughput, as discussed in the
1086 * comments to the BUDGET_TIMEOUT case.
1088 if (bfqq->dispatched > 0) /* still outstanding reqs */
1089 budget = min(budget * 2, bfqd->bfq_max_budget);
1090 else {
1091 if (budget > 5 * min_budget)
1092 budget -= 4 * min_budget;
1093 else
1094 budget = min_budget;
1096 break;
1097 case BFQ_BFQQ_BUDGET_TIMEOUT:
1099 * We double the budget here because: 1) it
1100 * gives the chance to boost the throughput if
1101 * this is not a seeky process (which may have
1102 * bumped into this timeout because of, e.g.,
1103 * ZBR), 2) together with charge_full_budget
1104 * it helps give seeky processes higher
1105 * timestamps, and hence be served less
1106 * frequently.
1108 budget = min(budget * 2, bfqd->bfq_max_budget);
1109 break;
1110 case BFQ_BFQQ_BUDGET_EXHAUSTED:
1112 * The process still has backlog, and did not
1113 * let either the budget timeout or the disk
1114 * idling timeout expire. Hence it is not
1115 * seeky, has a short thinktime and may be
1116 * happy with a higher budget too. So
1117 * definitely increase the budget of this good
1118 * candidate to boost the disk throughput.
1120 budget = min(budget * 4, bfqd->bfq_max_budget);
1121 break;
1122 case BFQ_BFQQ_NO_MORE_REQUESTS:
1124 * Leave the budget unchanged.
1126 default:
1127 return;
1129 } else /* async queue */
1130 /* async queues always get the maximum possible budget
1131 * (their ability to dispatch is limited by
1132 * @bfqd->bfq_max_budget_async_rq).
1134 budget = bfqd->bfq_max_budget;
1136 bfqq->max_budget = budget;
1138 if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
1139 bfqq->max_budget > bfqd->bfq_max_budget)
1140 bfqq->max_budget = bfqd->bfq_max_budget;
1143 * Make sure that we have enough budget for the next request.
1144 * Since the finish time of the bfqq must be kept in sync with
1145 * the budget, be sure to call __bfq_bfqq_expire() after the
1146 * update.
1148 next_rq = bfqq->next_rq;
1149 if (next_rq != NULL)
1150 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
1151 bfq_serv_to_charge(next_rq, bfqq));
1152 else
1153 bfqq->entity.budget = bfqq->max_budget;
1155 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
1156 next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
1157 bfqq->entity.budget);
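/*
 * Illustrative sketch, not part of the original BFQ code: the budget
 * feedback above in compact form. TOO_IDLE with no outstanding
 * requests shrinks the budget, the other sync cases grow it, and the
 * result is clamped to the device max budget (NO_MORE_REQUESTS, which
 * leaves the budget unchanged, is omitted). Names and enum values are
 * hypothetical.
 */
enum toy_reason { TOY_TOO_IDLE, TOY_BUDGET_TIMEOUT, TOY_BUDGET_EXHAUSTED };

static unsigned long toy_recalc_budget(unsigned long budget,
				       unsigned long min_budget,
				       unsigned long max_budget,
				       int outstanding, enum toy_reason reason)
{
	switch (reason) {
	case TOY_TOO_IDLE:
		if (outstanding)		/* still waiting for completions */
			budget *= 2;
		else if (budget > 5 * min_budget)
			budget -= 4 * min_budget;
		else
			budget = min_budget;
		break;
	case TOY_BUDGET_TIMEOUT:
		budget *= 2;			/* possibly just unlucky (e.g., ZBR) */
		break;
	case TOY_BUDGET_EXHAUSTED:
		budget *= 4;			/* well-behaved queue, boost it */
		break;
	}
	return budget > max_budget ? max_budget : budget;
}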
1160 static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
1162 unsigned long max_budget;
1165 * The max_budget calculated when autotuning is equal to the
1166 * number of sectors transferred in timeout_sync at the
1167 * estimated peak rate.
1169 max_budget = (unsigned long)(peak_rate * 1000 *
1170 timeout >> BFQ_RATE_SHIFT);
1172 return max_budget;
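/*
 * Illustrative note, not part of the original BFQ code: peak_rate is
 * kept as sectors per usec in BFQ_RATE_SHIFT fixed point and timeout
 * is in milliseconds, so the product above comes out in sectors. For
 * example, a device sustaining roughly 100 MiB/s (~0.2 sectors/usec)
 * with the default 125 ms sync timeout yields a max_budget of about
 * 0.2 * 1000 * 125 = 25000 sectors, i.e. around 12 MiB.
 */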
1176 * In addition to updating the peak rate, checks whether the process
1177 * is "slow", and returns 1 if so. This slow flag is used, in addition
1178 * to the budget timeout, to reduce the amount of service provided to
1179 * seeky processes, and hence reduce their chances to lower the
1180 * throughput. See the code for more details.
1182 static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1183 int compensate, enum bfqq_expiration reason)
1185 u64 bw, usecs, expected, timeout;
1186 ktime_t delta;
1187 int update = 0;
1189 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
1190 return 0;
1192 delta = compensate ? bfqd->last_idling_start : ktime_get();
1193 delta = ktime_sub(delta, bfqd->last_budget_start);
1194 usecs = ktime_to_us(delta);
1196 /* Don't trust short/unrealistic values. */
1197 if (usecs < 100 || usecs >= LONG_MAX)
1198 return 0;
1201 * Calculate the bandwidth for the last slice. We use a 64 bit
1202 * value to store the peak rate, in sectors per usec in fixed
1203 * point math. We do so to have enough precision in the estimate
1204 * and to avoid overflows.
1206 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
1207 do_div(bw, (unsigned long)usecs);
1209 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
1212 * Use only long (> 20ms) intervals to filter out spikes for
1213 * the peak rate estimation.
1215 if (usecs > 20000) {
1216 if (bw > bfqd->peak_rate ||
1217 (!BFQQ_SEEKY(bfqq) &&
1218 reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
1219 bfq_log(bfqd, "measured bw =%llu", bw);
1221 * To smooth oscillations use a low-pass filter with
1222 * alpha=7/8, i.e.,
1223 * new_rate = (7/8) * old_rate + (1/8) * bw
1225 do_div(bw, 8);
1226 bfqd->peak_rate *= 7;
1227 do_div(bfqd->peak_rate, 8);
1228 bfqd->peak_rate += bw;
1229 update = 1;
1230 bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
1233 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
1235 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
1236 bfqd->peak_rate_samples++;
1238 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
1239 update && bfqd->bfq_user_max_budget == 0) {
1240 bfqd->bfq_max_budget =
1241 bfq_calc_max_budget(bfqd->peak_rate, timeout);
1242 bfq_log(bfqd, "new max_budget=%lu",
1243 bfqd->bfq_max_budget);
1248 * If the process has been served for too short a time
1249 * interval to let its possible sequential accesses prevail over
1250 * the initial seek time needed to move the disk head to the
1251 * first sector it requested, then give the process a chance
1252 * and for the moment return false.
1254 if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
1255 return 0;
1258 * A process is considered ``slow'' (i.e., seeky, so that we
1259 * cannot treat it fairly in the service domain, as it would
1260 * slow down the other processes too much) if, when a slice
1261 * ends for whatever reason, it has received service at a
1262 * rate that would not be high enough to complete the budget
1263 * before the budget timeout expiration.
1265 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
1268 * Caveat: processes doing IO in the slower disk zones will
1269 * tend to be slow(er) even if not seeky. And the estimated
1270 * peak rate will actually be an average over the disk
1271 * surface. Hence, to not be too harsh with unlucky processes,
1272 * we keep a budget/3 margin of safety before declaring a
1273 * process slow.
1275 return expected > (4 * bfqq->entity.budget) / 3;
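/*
 * Illustrative sketch, not part of the original BFQ code: the
 * low-pass filter used above for the peak rate, new_rate =
 * (7/8) * old_rate + (1/8) * sample, written out on plain integers.
 * Names are hypothetical.
 */
static unsigned long long toy_filter_peak_rate(unsigned long long old_rate,
					       unsigned long long sample)
{
	return 7 * old_rate / 8 + sample / 8;	/* EWMA with alpha = 7/8 */
}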
1279 * bfq_bfqq_expire - expire a queue.
1280 * @bfqd: device owning the queue.
1281 * @bfqq: the queue to expire.
1282 * @compensate: if true, compensate for the time spent idling.
1283 * @reason: the reason causing the expiration.
1286 * If the process associated to the queue is slow (i.e., seeky), or in
1287 * case of budget timeout, or, finally, if it is async, we
1288 * artificially charge it an entire budget (independently of the
1289 * actual service it received). As a consequence, the queue will get
1290 * higher timestamps than the correct ones upon reactivation, and
1291 * hence it will be rescheduled as if it had received more service
1292 * than what it actually received. In the end, this class of processes
1293 * will receive less service in proportion to how slowly they consume
1294 * their budgets (and hence how seriously they tend to lower the
1295 * throughput).
1297 * In contrast, when a queue expires because it has been idling for
1298 * too much or because it exhausted its budget, we do not touch the
1299 * amount of service it has received. Hence when the queue will be
1300 * reactivated and its timestamps updated, the latter will be in sync
1301 * with the actual service received by the queue until expiration.
1303 * Charging a full budget to the first type of queues and the exact
1304 * service to the others has the effect of using the WF2Q+ policy to
1305 * schedule the former on a timeslice basis, without violating the
1306 * service domain guarantees of the latter.
1308 static void bfq_bfqq_expire(struct bfq_data *bfqd,
1309 struct bfq_queue *bfqq,
1310 int compensate,
1311 enum bfqq_expiration reason)
1313 int slow;
1314 BUG_ON(bfqq != bfqd->active_queue);
1316 /* Update disk peak rate for autotuning and check whether the
1317 * process is slow (see bfq_update_peak_rate).
1319 slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
1322 * As explained above, 'punish' slow (i.e., seeky), timed-out
1323 * and async queues, to favor sequential sync workloads.
1325 * Processes doing IO in the slower disk zones will tend to be
1326 * slow(er) even if not seeky. Hence, since the estimated peak
1327 * rate is actually an average over the disk surface, these
1328 * processes may timeout just for bad luck. To avoid punishing
1329 * them we do not charge a full budget to a process that
1330 * succeeded in consuming at least 2/3 of its budget.
1332 if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
1333 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
1334 bfq_bfqq_charge_full_budget(bfqq);
1336 if (bfqd->low_latency && bfqq->raising_coeff == 1)
1337 bfqq->last_rais_start_finish = jiffies;
1339 if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
1340 if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
1341 bfqq->soft_rt_next_start =
1342 jiffies +
1343 HZ * bfqq->entity.service /
1344 bfqd->bfq_raising_max_softrt_rate;
1345 else
1346 bfqq->soft_rt_next_start = -1; /* infinity */
1348 bfq_log_bfqq(bfqd, bfqq,
1349 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
1350 bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
1352 /* Increase, decrease or leave budget unchanged according to reason */
1353 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
1354 __bfq_bfqq_expire(bfqd, bfqq);
1358 * Budget timeout is not implemented through a dedicated timer, but
1359 * just checked on request arrivals and completions, as well as on
1360 * idle timer expirations.
1362 static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
1364 if (bfq_bfqq_budget_new(bfqq))
1365 return 0;
1367 if (time_before(jiffies, bfqq->budget_timeout))
1368 return 0;
1370 return 1;
1374 * If we expire a queue that is waiting for the arrival of a new
1375 * request, we may prevent the fictitious timestamp backshifting that
1376 * allows the guarantees of the queue to be preserved (see [1] for
1377 * this tricky aspect). Hence we return true only if this condition
1378 * does not hold, or if the queue is slow enough to deserve only to be
1379 * kicked off for preserving a high throughput.
1381 static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
1383 bfq_log_bfqq(bfqq->bfqd, bfqq,
1384 "may_budget_timeout: wr %d left %d timeout %d",
1385 bfq_bfqq_wait_request(bfqq),
1386 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
1387 bfq_bfqq_budget_timeout(bfqq));
1389 return (!bfq_bfqq_wait_request(bfqq) ||
1390 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
1392 bfq_bfqq_budget_timeout(bfqq);
1396 * Select a queue for service. If we have a current active queue,
1397 * check whether to continue servicing it, or retrieve and set a new one.
1399 static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
1401 struct bfq_queue *bfqq, *new_bfqq = NULL;
1402 struct request *next_rq;
1403 enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
1405 bfqq = bfqd->active_queue;
1406 if (bfqq == NULL)
1407 goto new_queue;
1409 bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
1412 * If another queue has a request waiting within our mean seek
1413 * distance, let it run. The expire code will check for close
1414 * cooperators and put the close queue at the front of the
1415 * service tree. If possible, merge the expiring queue with the
1416 * new bfqq.
1418 new_bfqq = bfq_close_cooperator(bfqd, bfqq);
1419 if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
1420 bfq_setup_merge(bfqq, new_bfqq);
1422 if (bfq_may_expire_for_budg_timeout(bfqq))
1423 goto expire;
1425 next_rq = bfqq->next_rq;
1427 * If bfqq has requests queued and it has enough budget left to
1428 * serve them, keep the queue, otherwise expire it.
1430 if (next_rq != NULL) {
1431 if (bfq_serv_to_charge(next_rq, bfqq) >
1432 bfq_bfqq_budget_left(bfqq)) {
1433 reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
1434 goto expire;
1435 } else {
1437 * The idle timer may be pending because we may not
1438 * disable disk idling even when a new request arrives
1440 if (timer_pending(&bfqd->idle_slice_timer)) {
1442 * If we get here: 1) at least a new request
1443 * has arrived but we have not disabled the
1444 * timer because the request was too small,
1445 * 2) then the block layer has unplugged the
1446 * device, causing the dispatch to be invoked.
1448 * Since the device is unplugged, now the
1449 * requests are probably large enough to
1450 * provide a reasonable throughput.
1451 * So we disable idling.
1453 bfq_clear_bfqq_wait_request(bfqq);
1454 del_timer(&bfqd->idle_slice_timer);
1456 if (new_bfqq == NULL)
1457 goto keep_queue;
1458 else
1459 goto expire;
1464 * No requests pending. If there is no cooperator, and the active
1465 * queue still has requests in flight or is idling for a new request,
1466 * then keep it.
1468 if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
1469 (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq)))) {
1470 bfqq = NULL;
1471 goto keep_queue;
1472 } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
1474 * Expiring the queue because there is a close cooperator,
1475 * cancel timer.
1477 bfq_clear_bfqq_wait_request(bfqq);
1478 del_timer(&bfqd->idle_slice_timer);
1481 reason = BFQ_BFQQ_NO_MORE_REQUESTS;
1482 expire:
1483 bfq_bfqq_expire(bfqd, bfqq, 0, reason);
1484 new_queue:
1485 bfqq = bfq_set_active_queue(bfqd, new_bfqq);
1486 bfq_log(bfqd, "select_queue: new queue %d returned",
1487 bfqq != NULL ? bfqq->pid : 0);
1488 keep_queue:
1489 return bfqq;
1492 static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1494 if (bfqq->raising_coeff > 1) { /* queue is being boosted */
1495 struct bfq_entity *entity = &bfqq->entity;
1497 bfq_log_bfqq(bfqd, bfqq,
1498 "raising period dur %u/%u msec, "
1499 "old raising coeff %u, w %d(%d)",
1500 jiffies_to_msecs(jiffies -
1501 bfqq->last_rais_start_finish),
1502 jiffies_to_msecs(bfqq->raising_cur_max_time),
1503 bfqq->raising_coeff,
1504 bfqq->entity.weight, bfqq->entity.orig_weight);
1506 BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
1507 entity->orig_weight * bfqq->raising_coeff);
1508 if (entity->ioprio_changed)
1509 bfq_log_bfqq(bfqd, bfqq,
1510 "WARN: pending prio change");
1512 * If too much time has elapsed from the beginning
1513 * of this weight-raising period and the process is not soft
1514 * real-time, stop it.
1516 if (jiffies - bfqq->last_rais_start_finish >
1517 bfqq->raising_cur_max_time) {
1518 int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1519 bfqq->soft_rt_next_start < jiffies;
1521 bfqq->last_rais_start_finish = jiffies;
1522 if (soft_rt)
1523 bfqq->raising_cur_max_time =
1524 bfqd->bfq_raising_rt_max_time;
1525 else {
1526 bfq_log_bfqq(bfqd, bfqq,
1527 "wrais ending at %llu msec,"
1528 "rais_max_time %u",
1529 bfqq->last_rais_start_finish,
1530 jiffies_to_msecs(bfqq->
1531 raising_cur_max_time));
1532 bfqq->raising_coeff = 1;
1533 entity->ioprio_changed = 1;
1534 __bfq_entity_update_weight_prio(
1535 bfq_entity_service_tree(entity),
1536 entity);
1544 * Dispatch one request from bfqq, moving it to the request queue
1545 * dispatch list.
1547 static int bfq_dispatch_request(struct bfq_data *bfqd,
1548 struct bfq_queue *bfqq)
1550 int dispatched = 0;
1551 struct request *rq;
1552 unsigned long service_to_charge;
1554 BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
1556 /* Follow expired path, else get first next available. */
1557 rq = bfq_check_fifo(bfqq);
1558 if (rq == NULL)
1559 rq = bfqq->next_rq;
1560 service_to_charge = bfq_serv_to_charge(rq, bfqq);
1562 if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
1564 * This may happen if the next rq is chosen
1565 * in fifo order instead of sector order.
1566 * The budget is properly dimensioned
1567 * to be always sufficient to serve the next request
1568 * only if it is chosen in sector order. The reason is
1569 * that it would be quite inefficient and of little use
1570 * to always make sure that the budget is large enough
1571 * to serve even the possible next rq in fifo order.
1572 * In fact, requests are seldom served in fifo order.
1574 * Expire the queue for budget exhaustion, and
1575 * make sure that the next act_budget is enough
1576 * to serve the next request, even if it comes
1577 * from the fifo expired path.
1579 bfqq->next_rq = rq;
1581 * Since this dispatch failed, make sure that
1582 * a new one will be performed
1584 if (!bfqd->rq_in_driver)
1585 bfq_schedule_dispatch(bfqd);
1586 goto expire;
1589 /* Finally, insert request into driver dispatch list. */
1590 bfq_bfqq_served(bfqq, service_to_charge);
1591 bfq_dispatch_insert(bfqd->queue, rq);
1593 update_raising_data(bfqd, bfqq);
1595 bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
1596 "budg left %lu",
1597 blk_rq_sectors(rq),
1598 (long long unsigned)blk_rq_pos(rq),
1599 bfq_bfqq_budget_left(bfqq));
1601 dispatched++;
1603 if (bfqd->active_bic == NULL) {
1604 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
1605 bfqd->active_bic = RQ_BIC(rq);
1608 if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
1609 dispatched >= bfqd->bfq_max_budget_async_rq) ||
1610 bfq_class_idle(bfqq)))
1611 goto expire;
1613 return dispatched;
1615 expire:
1616 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
1617 return dispatched;
1620 static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
1622 int dispatched = 0;
1624 while (bfqq->next_rq != NULL) {
1625 bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
1626 dispatched++;
1629 BUG_ON(!list_empty(&bfqq->fifo));
1630 return dispatched;
1634 * Drain our current requests. Used for barriers and when switching
1635 * io schedulers on-the-fly.
1637 static int bfq_forced_dispatch(struct bfq_data *bfqd)
1639 struct bfq_queue *bfqq, *n;
1640 struct bfq_service_tree *st;
1641 int dispatched = 0;
1643 bfqq = bfqd->active_queue;
1644 if (bfqq != NULL)
1645 __bfq_bfqq_expire(bfqd, bfqq);
1648 * Loop through classes, and be careful to leave the scheduler
1649 * in a consistent state, as feedback mechanisms and vtime
1650 * updates cannot be disabled during the process.
1652 list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
1653 st = bfq_entity_service_tree(&bfqq->entity);
1655 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
1656 bfqq->max_budget = bfq_max_budget(bfqd);
1658 bfq_forget_idle(st);
1661 BUG_ON(bfqd->busy_queues != 0);
1663 return dispatched;
1666 static int bfq_dispatch_requests(struct request_queue *q, int force)
1668 struct bfq_data *bfqd = q->elevator->elevator_data;
1669 struct bfq_queue *bfqq;
1670 int max_dispatch;
1672 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
1673 if (bfqd->busy_queues == 0)
1674 return 0;
1676 if (unlikely(force))
1677 return bfq_forced_dispatch(bfqd);
1679 if ((bfqq = bfq_select_queue(bfqd)) == NULL)
1680 return 0;
1682 max_dispatch = bfqd->bfq_quantum;
1683 if (bfq_class_idle(bfqq))
1684 max_dispatch = 1;
1686 if (!bfq_bfqq_sync(bfqq))
1687 max_dispatch = bfqd->bfq_max_budget_async_rq;
1689 if (bfqq->dispatched >= max_dispatch) {
1690 if (bfqd->busy_queues > 1)
1691 return 0;
1692 if (bfqq->dispatched >= 4 * max_dispatch)
1693 return 0;
1696 if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
1697 return 0;
1699 bfq_clear_bfqq_wait_request(bfqq);
1700 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
1702 if (!bfq_dispatch_request(bfqd, bfqq))
1703 return 0;
1705 bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
1706 "(max_disp %d)", bfqq->pid, max_dispatch);
1708 return 1;
1712 * Task holds one reference to the queue, dropped when task exits. Each rq
1713 * in-flight on this queue also holds a reference, dropped when rq is freed.
1715 * Queue lock must be held here.
1717 static void bfq_put_queue(struct bfq_queue *bfqq)
1719 struct bfq_data *bfqd = bfqq->bfqd;
1721 BUG_ON(atomic_read(&bfqq->ref) <= 0);
1723 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
1724 atomic_read(&bfqq->ref));
1725 if (!atomic_dec_and_test(&bfqq->ref))
1726 return;
1728 BUG_ON(rb_first(&bfqq->sort_list) != NULL);
1729 BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
1730 BUG_ON(bfqq->entity.tree != NULL);
1731 BUG_ON(bfq_bfqq_busy(bfqq));
1732 BUG_ON(bfqd->active_queue == bfqq);
1734 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
1736 kmem_cache_free(bfq_pool, bfqq);
1739 static void bfq_put_cooperator(struct bfq_queue *bfqq)
1741 struct bfq_queue *__bfqq, *next;
1744 * If this queue was scheduled to merge with another queue, be
1745 * sure to drop the reference taken on that queue (and others in
1746 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
1748 __bfqq = bfqq->new_bfqq;
1749 while (__bfqq) {
1750 if (__bfqq == bfqq) {
1751 WARN(1, "bfqq->new_bfqq loop detected.\n");
1752 break;
1754 next = __bfqq->new_bfqq;
1755 bfq_put_queue(__bfqq);
1756 __bfqq = next;
1760 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1762 if (bfqq == bfqd->active_queue) {
1763 __bfq_bfqq_expire(bfqd, bfqq);
1764 bfq_schedule_dispatch(bfqd);
1767 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
1768 atomic_read(&bfqq->ref));
1770 bfq_put_cooperator(bfqq);
1772 bfq_put_queue(bfqq);
1775 static void bfq_init_icq(struct io_cq *icq)
1777 struct bfq_io_cq *bic = icq_to_bic(icq);
1779 bic->ttime.last_end_request = jiffies;
1782 static void bfq_exit_icq(struct io_cq *icq)
1784 struct bfq_io_cq *bic = icq_to_bic(icq);
1785 struct bfq_data *bfqd = bic_to_bfqd(bic);
1787 if (bic->bfqq[BLK_RW_ASYNC]) {
1788 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
1789 bic->bfqq[BLK_RW_ASYNC] = NULL;
1792 if (bic->bfqq[BLK_RW_SYNC]) {
1793 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
1794 bic->bfqq[BLK_RW_SYNC] = NULL;
1799 * Update the entity prio values; note that the new values will not
1800 * be used until the next (re)activation.
1802 static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc)
1804 struct task_struct *tsk = current;
1805 int ioprio_class;
1807 if (!bfq_bfqq_prio_changed(bfqq))
1808 return;
1810 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
1811 switch (ioprio_class) {
1812 default:
1813 printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
1814 case IOPRIO_CLASS_NONE:
1816 * No prio set, inherit CPU scheduling settings.
1818 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
1819 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
1820 break;
1821 case IOPRIO_CLASS_RT:
1822 bfqq->entity.new_ioprio = task_ioprio(ioc);
1823 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
1824 break;
1825 case IOPRIO_CLASS_BE:
1826 bfqq->entity.new_ioprio = task_ioprio(ioc);
1827 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
1828 break;
1829 case IOPRIO_CLASS_IDLE:
1830 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
1831 bfqq->entity.new_ioprio = 7;
1832 bfq_clear_bfqq_idle_window(bfqq);
1833 break;
1836 bfqq->entity.ioprio_changed = 1;
1839 * Keep track of original prio settings in case we have to temporarily
1840 * elevate the priority of this queue.
1842 bfqq->org_ioprio = bfqq->entity.new_ioprio;
1843 bfq_clear_bfqq_prio_changed(bfqq);
1846 static void bfq_changed_ioprio(struct io_context *ioc,
1847 struct bfq_io_cq *bic)
1849 struct bfq_data *bfqd;
1850 struct bfq_queue *bfqq, *new_bfqq;
1851 struct bfq_group *bfqg;
1852 unsigned long uninitialized_var(flags);
1854 bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
1855 if (unlikely(bfqd == NULL))
1856 return;
1858 bfqq = bic->bfqq[BLK_RW_ASYNC];
1859 if (bfqq != NULL) {
1860 bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
1861 sched_data);
1862 new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
1863 GFP_ATOMIC);
1864 if (new_bfqq != NULL) {
1865 bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
1866 bfq_log_bfqq(bfqd, bfqq,
1867 "changed_ioprio: bfqq %p %d",
1868 bfqq, atomic_read(&bfqq->ref));
1869 bfq_put_queue(bfqq);
1873 bfqq = bic->bfqq[BLK_RW_SYNC];
1874 if (bfqq != NULL)
1875 bfq_mark_bfqq_prio_changed(bfqq);
1877 bfq_put_bfqd_unlock(bfqd, &flags);
1880 static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1881 pid_t pid, int is_sync)
1883 RB_CLEAR_NODE(&bfqq->entity.rb_node);
1884 INIT_LIST_HEAD(&bfqq->fifo);
1886 atomic_set(&bfqq->ref, 0);
1887 bfqq->bfqd = bfqd;
1889 bfq_mark_bfqq_prio_changed(bfqq);
1891 if (is_sync) {
1892 if (!bfq_class_idle(bfqq))
1893 bfq_mark_bfqq_idle_window(bfqq);
1894 bfq_mark_bfqq_sync(bfqq);
1897 /* Tentative initial value to trade off between throughput and latency */
1898 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
1899 bfqq->pid = pid;
1901 bfqq->raising_coeff = 1;
1902 bfqq->last_rais_start_finish = 0;
1903 bfqq->soft_rt_next_start = -1;
1906 static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
1907 struct bfq_group *bfqg,
1908 int is_sync,
1909 struct io_context *ioc,
1910 gfp_t gfp_mask)
1912 struct bfq_queue *bfqq, *new_bfqq = NULL;
1913 struct bfq_io_cq *bic;
1915 retry:
1916 bic = bfq_bic_lookup(bfqd, ioc);
1917 /* bic always exists here */
1918 bfqq = bic_to_bfqq(bic, is_sync);
1921 * Always try a new alloc if we fall back to the OOM bfqq
1922 * originally, since it should just be a temporary situation.
1924 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
1925 bfqq = NULL;
1926 if (new_bfqq != NULL) {
1927 bfqq = new_bfqq;
1928 new_bfqq = NULL;
1929 } else if (gfp_mask & __GFP_WAIT) {
1930 spin_unlock_irq(bfqd->queue->queue_lock);
1931 new_bfqq = kmem_cache_alloc_node(bfq_pool,
1932 gfp_mask | __GFP_ZERO,
1933 bfqd->queue->node);
1934 spin_lock_irq(bfqd->queue->queue_lock);
1935 if (new_bfqq != NULL)
1936 goto retry;
1937 } else {
1938 bfqq = kmem_cache_alloc_node(bfq_pool,
1939 gfp_mask | __GFP_ZERO,
1940 bfqd->queue->node);
1943 if (bfqq != NULL) {
1944 bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
1945 bfq_log_bfqq(bfqd, bfqq, "allocated");
1946 } else {
1947 bfqq = &bfqd->oom_bfqq;
1948 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
1951 bfq_init_prio_data(bfqq, ioc);
1952 bfq_init_entity(&bfqq->entity, bfqg);
1955 if (new_bfqq != NULL)
1956 kmem_cache_free(bfq_pool, new_bfqq);
1958 return bfqq;
1961 static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
1962 struct bfq_group *bfqg,
1963 int ioprio_class, int ioprio)
1965 switch (ioprio_class) {
1966 case IOPRIO_CLASS_RT:
1967 return &bfqg->async_bfqq[0][ioprio];
1968 case IOPRIO_CLASS_BE:
1969 return &bfqg->async_bfqq[1][ioprio];
1970 case IOPRIO_CLASS_IDLE:
1971 return &bfqg->async_idle_bfqq;
1972 default:
1973 BUG();
1977 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
1978 struct bfq_group *bfqg, int is_sync,
1979 struct io_context *ioc, gfp_t gfp_mask)
1981 const int ioprio = task_ioprio(ioc);
1982 const int ioprio_class = task_ioprio_class(ioc);
1983 struct bfq_queue **async_bfqq = NULL;
1984 struct bfq_queue *bfqq = NULL;
1986 if (!is_sync) {
1987 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
1988 ioprio);
1989 bfqq = *async_bfqq;
1992 if (bfqq == NULL)
1993 bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
1995 /*
1996 * Pin the queue now that it's allocated; scheduler exit will prune it.
1997 */
1998 if (!is_sync && *async_bfqq == NULL) {
1999 atomic_inc(&bfqq->ref);
2000 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
2001 bfqq, atomic_read(&bfqq->ref));
2002 *async_bfqq = bfqq;
2005 atomic_inc(&bfqq->ref);
2006 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
2007 atomic_read(&bfqq->ref));
2008 return bfqq;
2011 static void bfq_update_io_thinktime(struct bfq_data *bfqd,
2012 struct bfq_io_cq *bic)
2014 unsigned long elapsed = jiffies - bic->ttime.last_end_request;
2015 unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
2017 bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
2018 bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
2019 bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
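/*
 * The three updates above form a fixed-point exponential moving average:
 * each new sample gets weight 1/8 and 256 is the scale factor.
 * ttime_samples converges to 256 (the fixed point of x = (7x + 256) / 8),
 * so ttime_mean ends up as the decayed mean think time in jiffies, with
 * the +128 term providing rounding.
 */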
2022 static void bfq_update_io_seektime(struct bfq_data *bfqd,
2023 struct bfq_queue *bfqq,
2024 struct request *rq)
2026 sector_t sdist;
2027 u64 total;
2029 if (bfqq->last_request_pos < blk_rq_pos(rq))
2030 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
2031 else
2032 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
2034 /*
2035 * Don't allow the seek distance to get too large from the
2036 * odd fragment, pagein, etc.
2037 */
2038 if (bfqq->seek_samples == 0) /* first request, not really a seek */
2039 sdist = 0;
2040 else if (bfqq->seek_samples <= 60) /* second & third seek */
2041 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
2042 else
2043 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
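/*
 * seek_samples is 32 after the first update and 60 after the second, so
 * the "<= 60" test above indeed covers the second and third requests;
 * the clamp bounds the contribution of any single outlier to about four
 * times the current mean plus a fixed slack, which is larger while only
 * a couple of samples have been collected. The same 7/8 decaying
 * average used for think times is then applied below.
 */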
2045 bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
2046 bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
2047 total = bfqq->seek_total + (bfqq->seek_samples/2);
2048 do_div(total, bfqq->seek_samples);
2049 if (bfq_bfqq_coop(bfqq)) {
2050 /*
2051 * If the mean seek time increases for a (non-seeky) shared
2052 * queue, some cooperator is likely to be idling too much.
2053 * Conversely, if it decreases, some cooperator has
2054 * probably woken up.
2055 */
2057 if ((sector_t)total < bfqq->seek_mean)
2058 bfq_mark_bfqq_some_coop_idle(bfqq);
2059 else if ((sector_t)total > bfqq->seek_mean)
2060 bfq_clear_bfqq_some_coop_idle(bfqq);
2062 bfqq->seek_mean = (sector_t)total;
2064 bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
2065 (u64)bfqq->seek_mean);
2068 /*
2069 * Disable idle window if the process thinks too long or seeks so much that
2070 * it doesn't matter.
2071 */
2072 static void bfq_update_idle_window(struct bfq_data *bfqd,
2073 struct bfq_queue *bfqq,
2074 struct bfq_io_cq *bic)
2076 int enable_idle;
2078 /* Don't idle for async or idle io prio class. */
2079 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
2080 return;
2082 enable_idle = bfq_bfqq_idle_window(bfqq);
2084 if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
2085 bfqd->bfq_slice_idle == 0 ||
2086 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
2087 bfqq->raising_coeff == 1))
2088 enable_idle = 0;
2089 else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
2090 if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
2091 bfqq->raising_coeff == 1)
2092 enable_idle = 0;
2093 else
2094 enable_idle = 1;
2096 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
2097 enable_idle);
2099 if (enable_idle)
2100 bfq_mark_bfqq_idle_window(bfqq);
2101 else
2102 bfq_clear_bfqq_idle_window(bfqq);
2105 /*
2106 * Called when a new fs request (rq) is added to bfqq. Check if there's
2107 * something we should do about it.
2108 */
2109 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2110 struct request *rq)
2112 struct bfq_io_cq *bic = RQ_BIC(rq);
2114 if (rq->cmd_flags & REQ_META)
2115 bfqq->meta_pending++;
2117 bfq_update_io_thinktime(bfqd, bic);
2118 bfq_update_io_seektime(bfqd, bfqq, rq);
2119 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
2120 !BFQQ_SEEKY(bfqq))
2121 bfq_update_idle_window(bfqd, bfqq, bic);
2123 bfq_log_bfqq(bfqd, bfqq,
2124 "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
2125 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
2126 (long long unsigned)bfqq->seek_mean);
2128 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2130 if (bfqq == bfqd->active_queue) {
2131 /*
2132 * If there is just this request queued and the request
2133 * is small, just exit.
2134 * In this way, if the disk is being idled to wait for a new
2135 * request from the active queue, we avoid unplugging the
2136 * device now.
2137 *
2138 * By doing so, we avoid committing the device to serving just
2139 * a small request. Instead, we wait for
2140 * the block layer to decide when to unplug the device:
2141 * hopefully, new requests will be merged to this
2142 * one quickly, then the device will be unplugged
2143 * and larger requests will be dispatched.
2144 */
2145 if (bfqq->queued[rq_is_sync(rq)] == 1 &&
2146 blk_rq_sectors(rq) < 32) {
2147 return;
2149 if (bfq_bfqq_wait_request(bfqq)) {
2150 /*
2151 * If we are waiting for a request for this queue, let
2152 * it rip immediately and flag that we must not expire
2153 * this queue just now.
2154 */
2155 bfq_clear_bfqq_wait_request(bfqq);
2156 del_timer(&bfqd->idle_slice_timer);
2157 /*
2158 * Here we can safely expire the queue, in
2159 * case of budget timeout, without wasting
2160 * guarantees.
2161 */
2162 if (bfq_bfqq_budget_timeout(bfqq))
2163 bfq_bfqq_expire(bfqd, bfqq, 0,
2164 BFQ_BFQQ_BUDGET_TIMEOUT);
2165 __blk_run_queue(bfqd->queue);
2170 static void bfq_insert_request(struct request_queue *q, struct request *rq)
2172 struct bfq_data *bfqd = q->elevator->elevator_data;
2173 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2175 assert_spin_locked(bfqd->queue->queue_lock);
2176 bfq_init_prio_data(bfqq, RQ_BIC(rq)->icq.ioc);
2178 bfq_add_rq_rb(rq);
2180 rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
2181 list_add_tail(&rq->queuelist, &bfqq->fifo);
2183 bfq_rq_enqueued(bfqd, bfqq, rq);
2186 static void bfq_update_hw_tag(struct bfq_data *bfqd)
2188 bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
2189 bfqd->rq_in_driver);
2191 /*
2192 * This sample is valid if the number of outstanding requests
2193 * is large enough to allow a queueing behavior. Note that the
2194 * sum is not exact, as it's not taking into account deactivated
2195 * requests.
2196 */
2197 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
2198 return;
2200 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
2201 return;
2203 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
2204 bfqd->max_rq_in_driver = 0;
2205 bfqd->hw_tag_samples = 0;
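/*
 * Net effect: hw_tag gets set only if, within a window of
 * BFQ_HW_QUEUE_SAMPLES valid samples, the driver was seen holding more
 * than BFQ_HW_QUEUE_THRESHOLD requests at once, i.e. the device appears
 * to perform internal command queueing.
 */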
2208 static void bfq_completed_request(struct request_queue *q, struct request *rq)
2210 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2211 struct bfq_data *bfqd = bfqq->bfqd;
2212 const int sync = rq_is_sync(rq);
2214 bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
2215 blk_rq_sectors(rq), sync);
2217 bfq_update_hw_tag(bfqd);
2219 WARN_ON(!bfqd->rq_in_driver);
2220 WARN_ON(!bfqq->dispatched);
2221 bfqd->rq_in_driver--;
2222 bfqq->dispatched--;
2224 if (bfq_bfqq_sync(bfqq))
2225 bfqd->sync_flight--;
2227 if (sync)
2228 RQ_BIC(rq)->ttime.last_end_request = jiffies;
2230 /*
2231 * If this is the active queue, check if it needs to be expired,
2232 * or if we want to idle in case it has no pending requests.
2233 */
2234 if (bfqd->active_queue == bfqq) {
2235 if (bfq_bfqq_budget_new(bfqq))
2236 bfq_set_budget_timeout(bfqd);
2238 /* Idling is disabled also for cooperation issues:
2239 * 1) there is a close cooperator for the queue, or
2240 * 2) the queue is shared and some cooperator is likely
2241 * to be idle (in this case, by not arming the idle timer,
2242 * we try to slow down the queue, to prevent the zones
2243 * of the disk accessed by the active cooperators from
2244 * drifting too far from the zone that will be accessed by the
2245 * currently idle cooperators).
2246 */
2247 if (bfq_may_expire_for_budg_timeout(bfqq))
2248 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
2249 else if (sync &&
2250 (bfqd->rq_in_driver == 0 ||
2251 bfqq->raising_coeff > 1)
2252 && RB_EMPTY_ROOT(&bfqq->sort_list)
2253 && !bfq_close_cooperator(bfqd, bfqq)
2254 && (!bfq_bfqq_coop(bfqq) ||
2255 !bfq_bfqq_some_coop_idle(bfqq)))
2256 bfq_arm_slice_timer(bfqd);
2259 if (!bfqd->rq_in_driver)
2260 bfq_schedule_dispatch(bfqd);
2263 static inline int __bfq_may_queue(struct bfq_queue *bfqq)
2265 if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
2266 bfq_clear_bfqq_must_alloc(bfqq);
2267 return ELV_MQUEUE_MUST;
2270 return ELV_MQUEUE_MAY;
2273 static int bfq_may_queue(struct request_queue *q, int rw)
2275 struct bfq_data *bfqd = q->elevator->elevator_data;
2276 struct task_struct *tsk = current;
2277 struct bfq_io_cq *bic;
2278 struct bfq_queue *bfqq;
2280 /*
2281 * Don't force setup of a queue from here, as a call to may_queue
2282 * does not necessarily imply that a request actually will be queued.
2283 * So just look up a possibly existing queue, or return 'may queue'
2284 * if that fails.
2285 */
2286 bic = bfq_bic_lookup(bfqd, tsk->io_context);
2287 if (bic == NULL)
2288 return ELV_MQUEUE_MAY;
2290 bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
2291 if (bfqq != NULL) {
2292 bfq_init_prio_data(bfqq, bic->icq.ioc);
2294 return __bfq_may_queue(bfqq);
2297 return ELV_MQUEUE_MAY;
2300 /*
2301 * Queue lock held here.
2302 */
2303 static void bfq_put_request(struct request *rq)
2305 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2307 if (bfqq != NULL) {
2308 const int rw = rq_data_dir(rq);
2310 BUG_ON(!bfqq->allocated[rw]);
2311 bfqq->allocated[rw]--;
2313 rq->elv.priv[0] = NULL;
2314 rq->elv.priv[1] = NULL;
2316 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
2317 bfqq, atomic_read(&bfqq->ref));
2318 bfq_put_queue(bfqq);
2322 static struct bfq_queue *
2323 bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
2324 struct bfq_queue *bfqq)
2326 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
2327 (long unsigned)bfqq->new_bfqq->pid);
2328 bic_set_bfqq(bic, bfqq->new_bfqq, 1);
2329 bfq_mark_bfqq_coop(bfqq->new_bfqq);
2330 bfq_put_queue(bfqq);
2331 return bic_to_bfqq(bic, 1);
2334 /*
2335 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
2336 * was the last process referring to said bfqq.
2337 */
2338 static struct bfq_queue *
2339 bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
2341 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
2342 if (bfqq_process_refs(bfqq) == 1) {
2343 bfqq->pid = current->pid;
2344 bfq_clear_bfqq_some_coop_idle(bfqq);
2345 bfq_clear_bfqq_coop(bfqq);
2346 bfq_clear_bfqq_split_coop(bfqq);
2347 return bfqq;
2350 bic_set_bfqq(bic, NULL, 1);
2352 bfq_put_cooperator(bfqq);
2354 bfq_put_queue(bfqq);
2355 return NULL;
2358 /*
2359 * Allocate bfq data structures associated with this request.
2360 */
2361 static int bfq_set_request(struct request_queue *q, struct request *rq,
2362 gfp_t gfp_mask)
2364 struct bfq_data *bfqd = q->elevator->elevator_data;
2365 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
2366 const int rw = rq_data_dir(rq);
2367 const int is_sync = rq_is_sync(rq);
2368 struct bfq_queue *bfqq;
2369 struct bfq_group *bfqg;
2370 unsigned long flags;
2372 /* handle changed prio notifications; cgroup change is handled separately */
2373 if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
2374 bfq_changed_ioprio(bic->icq.ioc, bic);
2376 might_sleep_if(gfp_mask & __GFP_WAIT);
2378 spin_lock_irqsave(q->queue_lock, flags);
2380 if (bic == NULL)
2381 goto queue_fail;
2383 bfqg = bfq_bic_update_cgroup(bic);
2385 new_queue:
2386 bfqq = bic_to_bfqq(bic, is_sync);
2387 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
2388 bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
2389 bic_set_bfqq(bic, bfqq, is_sync);
2390 } else {
2391 /*
2392 * If the queue was seeky for too long, break it apart.
2393 */
2394 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
2395 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
2396 bfqq = bfq_split_bfqq(bic, bfqq);
2397 if (!bfqq)
2398 goto new_queue;
2401 /*
2402 * Check to see if this queue is scheduled to merge with
2403 * another closely cooperating queue. The merging of queues
2404 * happens here as it must be done in process context.
2405 * The reference on new_bfqq was taken in merge_bfqqs.
2406 */
2407 if (bfqq->new_bfqq != NULL)
2408 bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
2411 bfqq->allocated[rw]++;
2412 atomic_inc(&bfqq->ref);
2413 bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
2414 atomic_read(&bfqq->ref));
2416 rq->elv.priv[0] = bic;
2417 rq->elv.priv[1] = bfqq;
2419 spin_unlock_irqrestore(q->queue_lock, flags);
2421 return 0;
2423 queue_fail:
2424 bfq_schedule_dispatch(bfqd);
2425 spin_unlock_irqrestore(q->queue_lock, flags);
2427 return 1;
2430 static void bfq_kick_queue(struct work_struct *work)
2432 struct bfq_data *bfqd =
2433 container_of(work, struct bfq_data, unplug_work);
2434 struct request_queue *q = bfqd->queue;
2436 spin_lock_irq(q->queue_lock);
2437 __blk_run_queue(q);
2438 spin_unlock_irq(q->queue_lock);
2441 /*
2442 * Handler of the expiration of the timer running if the active_queue
2443 * is idling inside its time slice.
2444 */
2445 static void bfq_idle_slice_timer(unsigned long data)
2447 struct bfq_data *bfqd = (struct bfq_data *)data;
2448 struct bfq_queue *bfqq;
2449 unsigned long flags;
2450 enum bfqq_expiration reason;
2452 spin_lock_irqsave(bfqd->queue->queue_lock, flags);
2454 bfqq = bfqd->active_queue;
2455 /*
2456 * Theoretical race here: active_queue can be NULL or different
2457 * from the queue that was idling if the timer handler spins on
2458 * the queue_lock and a new request arrives for the current
2459 * queue and there is a full dispatch cycle that changes the
2460 * active_queue. This is very unlikely to happen, but in the worst
2461 * case we just expire a queue too early.
2462 */
2463 if (bfqq != NULL) {
2464 bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
2465 if (bfq_bfqq_budget_timeout(bfqq))
2466 /*
2467 * Also here the queue can be safely expired
2468 * for budget timeout without wasting
2469 * guarantees.
2470 */
2471 reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2472 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
2473 /*
2474 * The queue may not be empty upon timer expiration,
2475 * because we may not disable the timer when the first
2476 * request of the active queue arrives during
2477 * disk idling.
2478 */
2479 reason = BFQ_BFQQ_TOO_IDLE;
2480 else
2481 goto schedule_dispatch;
2483 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
2486 schedule_dispatch:
2487 bfq_schedule_dispatch(bfqd);
2489 spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
2492 static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
2494 del_timer_sync(&bfqd->idle_slice_timer);
2495 cancel_work_sync(&bfqd->unplug_work);
2498 static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
2499 struct bfq_queue **bfqq_ptr)
2501 struct bfq_group *root_group = bfqd->root_group;
2502 struct bfq_queue *bfqq = *bfqq_ptr;
2504 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
2505 if (bfqq != NULL) {
2506 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
2507 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
2508 bfqq, atomic_read(&bfqq->ref));
2509 bfq_put_queue(bfqq);
2510 *bfqq_ptr = NULL;
2514 /*
2515 * Release all the bfqg references to its async queues. If we are
2516 * deallocating the group these queues may still contain requests, so
2517 * we reparent them to the root cgroup (i.e., the only one that will
2518 * exist for sure until all the requests on a device are gone).
2519 */
2520 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
2522 int i, j;
2524 for (i = 0; i < 2; i++)
2525 for (j = 0; j < IOPRIO_BE_NR; j++)
2526 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
2528 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
2531 static void bfq_exit_queue(struct elevator_queue *e)
2533 struct bfq_data *bfqd = e->elevator_data;
2534 struct request_queue *q = bfqd->queue;
2535 struct bfq_queue *bfqq, *n;
2537 bfq_shutdown_timer_wq(bfqd);
2539 spin_lock_irq(q->queue_lock);
2541 BUG_ON(bfqd->active_queue != NULL);
2542 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
2543 bfq_deactivate_bfqq(bfqd, bfqq, 0);
2545 bfq_disconnect_groups(bfqd);
2546 spin_unlock_irq(q->queue_lock);
2548 bfq_shutdown_timer_wq(bfqd);
2550 synchronize_rcu();
2552 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2554 bfq_free_root_group(bfqd);
2555 kfree(bfqd);
2558 static void *bfq_init_queue(struct request_queue *q)
2560 struct bfq_group *bfqg;
2561 struct bfq_data *bfqd;
2563 bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2564 if (bfqd == NULL)
2565 return NULL;
2567 /*
2568 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
2569 * Grab a permanent reference to it, so that the normal code flow
2570 * will not attempt to free it.
2571 */
2572 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
2573 atomic_inc(&bfqd->oom_bfqq.ref);
2575 bfqd->queue = q;
2577 bfqg = bfq_alloc_root_group(bfqd, q->node);
2578 if (bfqg == NULL) {
2579 kfree(bfqd);
2580 return NULL;
2583 bfqd->root_group = bfqg;
2585 init_timer(&bfqd->idle_slice_timer);
2586 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
2587 bfqd->idle_slice_timer.data = (unsigned long)bfqd;
2589 bfqd->rq_pos_tree = RB_ROOT;
2591 INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
2593 INIT_LIST_HEAD(&bfqd->active_list);
2594 INIT_LIST_HEAD(&bfqd->idle_list);
2596 bfqd->hw_tag = 1;
2598 bfqd->bfq_max_budget = bfq_default_max_budget;
2600 bfqd->bfq_quantum = bfq_quantum;
2601 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
2602 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
2603 bfqd->bfq_back_max = bfq_back_max;
2604 bfqd->bfq_back_penalty = bfq_back_penalty;
2605 bfqd->bfq_slice_idle = bfq_slice_idle;
2606 bfqd->bfq_class_idle_last_service = 0;
2607 bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
2608 bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
2609 bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
2611 bfqd->low_latency = true;
2613 bfqd->bfq_raising_coeff = 20;
2614 bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
2615 bfqd->bfq_raising_max_time = msecs_to_jiffies(7500);
2616 bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
2617 bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
2618 bfqd->bfq_raising_max_softrt_rate = 7000;
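/*
 * Rough meaning of the weight-raising defaults above (the details live
 * in the raising logic elsewhere in this file, so take this only as a
 * summary): bfq_raising_coeff is the factor applied to the weight of a
 * raised queue, bfq_raising_max_time and bfq_raising_rt_max_time bound
 * how long raising lasts for interactive and soft real-time queues
 * respectively, and bfq_raising_max_softrt_rate appears to be the
 * maximum request rate compatible with treating a queue as soft
 * real-time. All of them are exported as sysfs tunables below.
 */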
2620 return bfqd;
2623 static void bfq_slab_kill(void)
2625 if (bfq_pool != NULL)
2626 kmem_cache_destroy(bfq_pool);
2629 static int __init bfq_slab_setup(void)
2631 bfq_pool = KMEM_CACHE(bfq_queue, 0);
2632 if (bfq_pool == NULL)
2633 return -ENOMEM;
2634 return 0;
2637 static ssize_t bfq_var_show(unsigned int var, char *page)
2639 return sprintf(page, "%d\n", var);
2642 static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
2644 unsigned long new_val;
2645 int ret = strict_strtoul(page, 10, &new_val);
2647 if (ret == 0)
2648 *var = new_val;
2650 return count;
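/*
 * Note that bfq_var_store() always returns count, so callers cannot
 * distinguish a parse failure from success; on failure *var is simply
 * left untouched. In the STORE_FUNCTION() macros below this means an
 * unparsable string leaves __data indeterminate before it is clamped
 * and stored, so sysfs writes are expected to carry a plain decimal
 * number.
 */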
2653 static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
2655 struct bfq_queue *bfqq;
2656 struct bfq_data *bfqd = e->elevator_data;
2657 ssize_t num_char = 0;
2659 num_char += sprintf(page + num_char, "Active:\n");
2660 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
2661 num_char += sprintf(page + num_char,
2662 "pid%d: weight %hu, dur %d/%u\n",
2663 bfqq->pid,
2664 bfqq->entity.weight,
2665 jiffies_to_msecs(jiffies -
2666 bfqq->last_rais_start_finish),
2667 jiffies_to_msecs(bfqq->raising_cur_max_time));
2669 num_char += sprintf(page + num_char, "Idle:\n");
2670 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
2671 num_char += sprintf(page + num_char,
2672 "pid%d: weight %hu, dur %d/%u\n",
2673 bfqq->pid,
2674 bfqq->entity.weight,
2675 jiffies_to_msecs(jiffies -
2676 bfqq->last_rais_start_finish),
2677 jiffies_to_msecs(bfqq->raising_cur_max_time));
2679 return num_char;
2682 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
2683 static ssize_t __FUNC(struct elevator_queue *e, char *page) \
2685 struct bfq_data *bfqd = e->elevator_data; \
2686 unsigned int __data = __VAR; \
2687 if (__CONV) \
2688 __data = jiffies_to_msecs(__data); \
2689 return bfq_var_show(__data, (page)); \
2691 SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
2692 SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
2693 SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
2694 SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
2695 SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
2696 SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
2697 SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
2698 SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
2699 SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
2700 SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
2701 SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
2702 SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
2703 SHOW_FUNCTION(bfq_raising_max_time_show, bfqd->bfq_raising_max_time, 1);
2704 SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
2705 SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
2706 1);
2707 SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
2708 bfqd->bfq_raising_min_inter_arr_async,
2709 1);
2710 SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
2711 bfqd->bfq_raising_max_softrt_rate, 0);
2712 #undef SHOW_FUNCTION
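/*
 * For reference, SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0)
 * above expands to roughly:
 *
 * static ssize_t bfq_quantum_show(struct elevator_queue *e, char *page)
 * {
 *     struct bfq_data *bfqd = e->elevator_data;
 *     unsigned int __data = bfqd->bfq_quantum;
 *     if (0)
 *         __data = jiffies_to_msecs(__data);
 *     return bfq_var_show(__data, (page));
 * }
 *
 * i.e. one read-side sysfs handler per tunable, with __CONV selecting a
 * jiffies-to-milliseconds conversion.
 */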
2714 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
2715 static ssize_t \
2716 __FUNC(struct elevator_queue *e, const char *page, size_t count) \
2718 struct bfq_data *bfqd = e->elevator_data; \
2719 unsigned long __data; \
2720 int ret = bfq_var_store(&__data, (page), count); \
2721 if (__data < (MIN)) \
2722 __data = (MIN); \
2723 else if (__data > (MAX)) \
2724 __data = (MAX); \
2725 if (__CONV) \
2726 *(__PTR) = msecs_to_jiffies(__data); \
2727 else \
2728 *(__PTR) = __data; \
2729 return ret; \
2731 STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
2732 STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
2733 INT_MAX, 1);
2734 STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
2735 INT_MAX, 1);
2736 STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
2737 STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
2738 INT_MAX, 0);
2739 STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
2740 STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
2741 1, INT_MAX, 0);
2742 STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
2743 INT_MAX, 1);
2744 STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
2745 INT_MAX, 0);
2746 STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
2747 INT_MAX, 1);
2748 STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
2749 INT_MAX, 1);
2750 STORE_FUNCTION(bfq_raising_min_idle_time_store,
2751 &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
2752 STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
2753 &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
2754 STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
2755 &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
2756 #undef STORE_FUNCTION
2758 /* do nothing for the moment */
2759 static ssize_t bfq_weights_store(struct elevator_queue *e,
2760 const char *page, size_t count)
2762 return count;
2765 static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
2767 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2769 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
2770 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
2771 else
2772 return bfq_default_max_budget;
2775 static ssize_t bfq_max_budget_store(struct elevator_queue *e,
2776 const char *page, size_t count)
2778 struct bfq_data *bfqd = e->elevator_data;
2779 unsigned long __data;
2780 int ret = bfq_var_store(&__data, (page), count);
2782 if (__data == 0)
2783 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
2784 else {
2785 if (__data > INT_MAX)
2786 __data = INT_MAX;
2787 bfqd->bfq_max_budget = __data;
2790 bfqd->bfq_user_max_budget = __data;
2792 return ret;
2795 static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
2796 const char *page, size_t count)
2798 struct bfq_data *bfqd = e->elevator_data;
2799 unsigned long __data;
2800 int ret = bfq_var_store(&__data, (page), count);
2802 if (__data < 1)
2803 __data = 1;
2804 else if (__data > INT_MAX)
2805 __data = INT_MAX;
2807 bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
2808 if (bfqd->bfq_user_max_budget == 0)
2809 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
2811 return ret;
2814 static ssize_t bfq_low_latency_store(struct elevator_queue *e,
2815 const char *page, size_t count)
2817 struct bfq_data *bfqd = e->elevator_data;
2818 unsigned long __data;
2819 int ret = bfq_var_store(&__data, (page), count);
2821 if (__data > 1)
2822 __data = 1;
2823 bfqd->low_latency = __data;
2825 return ret;
2828 #define BFQ_ATTR(name) \
2829 __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
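/*
 * Each attribute in the table below is exposed by the elevator core,
 * typically under /sys/block/<disk>/queue/iosched/<name> (e.g.
 * .../iosched/low_latency), while bfq is the active scheduler for that
 * disk.
 */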
2831 static struct elv_fs_entry bfq_attrs[] = {
2832 BFQ_ATTR(quantum),
2833 BFQ_ATTR(fifo_expire_sync),
2834 BFQ_ATTR(fifo_expire_async),
2835 BFQ_ATTR(back_seek_max),
2836 BFQ_ATTR(back_seek_penalty),
2837 BFQ_ATTR(slice_idle),
2838 BFQ_ATTR(max_budget),
2839 BFQ_ATTR(max_budget_async_rq),
2840 BFQ_ATTR(timeout_sync),
2841 BFQ_ATTR(timeout_async),
2842 BFQ_ATTR(low_latency),
2843 BFQ_ATTR(raising_coeff),
2844 BFQ_ATTR(raising_max_time),
2845 BFQ_ATTR(raising_rt_max_time),
2846 BFQ_ATTR(raising_min_idle_time),
2847 BFQ_ATTR(raising_min_inter_arr_async),
2848 BFQ_ATTR(raising_max_softrt_rate),
2849 BFQ_ATTR(weights),
2850 __ATTR_NULL
2853 static struct elevator_type iosched_bfq = {
2854 .ops = {
2855 .elevator_merge_fn = bfq_merge,
2856 .elevator_merged_fn = bfq_merged_request,
2857 .elevator_merge_req_fn = bfq_merged_requests,
2858 .elevator_allow_merge_fn = bfq_allow_merge,
2859 .elevator_dispatch_fn = bfq_dispatch_requests,
2860 .elevator_add_req_fn = bfq_insert_request,
2861 .elevator_activate_req_fn = bfq_activate_request,
2862 .elevator_deactivate_req_fn = bfq_deactivate_request,
2863 .elevator_completed_req_fn = bfq_completed_request,
2864 .elevator_former_req_fn = elv_rb_former_request,
2865 .elevator_latter_req_fn = elv_rb_latter_request,
2866 .elevator_init_icq_fn = bfq_init_icq,
2867 .elevator_exit_icq_fn = bfq_exit_icq,
2868 .elevator_set_req_fn = bfq_set_request,
2869 .elevator_put_req_fn = bfq_put_request,
2870 .elevator_may_queue_fn = bfq_may_queue,
2871 .elevator_init_fn = bfq_init_queue,
2872 .elevator_exit_fn = bfq_exit_queue,
2874 .icq_size = sizeof(struct bfq_io_cq),
2875 .icq_align = __alignof__(struct bfq_io_cq),
2876 .elevator_attrs = bfq_attrs,
2877 .elevator_name = "bfq",
2878 .elevator_owner = THIS_MODULE,
2881 static int __init bfq_init(void)
2883 /*
2884 * Can be 0 on HZ < 1000 setups.
2885 */
2886 if (bfq_slice_idle == 0)
2887 bfq_slice_idle = 1;
2889 if (bfq_timeout_async == 0)
2890 bfq_timeout_async = 1;
2892 if (bfq_slab_setup())
2893 return -ENOMEM;
2895 elv_register(&iosched_bfq);
2897 return 0;
2900 static void __exit bfq_exit(void)
2902 elv_unregister(&iosched_bfq);
2903 bfq_slab_kill();
2906 module_init(bfq_init);
2907 module_exit(bfq_exit);
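/*
 * Typical usage once the module is available: select the scheduler per
 * device with something like
 *   echo bfq > /sys/block/<disk>/queue/scheduler
 * or system-wide at boot via the elevator=bfq kernel parameter, then
 * tune it through the iosched attributes defined above.
 */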
2909 MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
2910 MODULE_LICENSE("GPL");
2911 MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");