/*
 * BFQ, or Budget Fair Queueing, disk scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
 *
 * BFQ is a proportional-share disk scheduling algorithm based on the
 * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
 * measured in number of sectors, to tasks instead of time slices.
 * The disk is not granted to the active task for a given time slice,
 * but until it has exhausted its assigned budget. This change from
 * the time to the service domain allows BFQ to distribute the disk
 * bandwidth among tasks as desired, without any distortion due to
 * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
 * internal scheduler, called B-WF2Q+, to schedule tasks according to
 * their budgets. Thanks to this accurate scheduler, BFQ can afford
 * to assign high budgets to disk-bound non-seeky tasks (to boost the
 * throughput), and yet guarantee low latencies to interactive and
 * soft real-time applications.
 *
 * BFQ was introduced in [1], where the interested reader can find an
 * accurate description of the algorithm, of the bandwidth
 * distribution and latency guarantees it provides, and formal proofs
 * of all its properties. With respect to the algorithm presented in
 * the paper, this implementation adds several small heuristics and a
 * hierarchical extension, based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
 * complexity derives from the one introduced with EEVDF in [3].
 *
 * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
 *     with Deterministic Guarantees on Bandwidth Distribution,''
 *     IEEE Transactions on Computers, May 2010.
 *
 *     http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
 *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation,'' technical report.
 *
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>

#include "bfq.h"

/* Max number of dispatches in one round of service. */
static const int bfq_quantum = 4;

/* Expiration time of sync (0) and async (1) requests, in jiffies. */
static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

/* Maximum backwards seek, in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in jiffies. */
static int bfq_slice_idle = HZ / 125;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;
static const int bfq_max_budget_async_rq = 4;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below.
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout_sync = HZ / 8;
static int bfq_timeout_async = HZ / 25;

struct kmem_cache *bfq_pool;

/* Below this threshold (in ms), we consider thinktime immediate. */
#define BFQ_MIN_TT              2

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD  4
#define BFQ_HW_QUEUE_SAMPLES    32

#define BFQQ_SEEK_THR    (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)

/* Min samples used for peak rate estimation (for autotuning). */
#define BFQ_PEAK_RATE_SAMPLES   32

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT          16

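/*
 * Example: with BFQ_RATE_SHIFT == 16 the peak rate is kept as sectors per
 * usec in 16-bit fixed point; a device sustaining about 100 MB/s (roughly
 * 0.2 sectors/usec with 512-byte sectors) is stored as ~0.2 * 65536 ~= 13100.
 */
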
#define BFQ_SERVICE_TREE_INIT   ((struct bfq_service_tree)              \
                                { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)              ((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)             ((rq)->elv.priv[1])

#include "bfq-sched.c"
#include "bfq-cgroup.c"

#define bfq_class_idle(bfqq)    ((bfqq)->entity.ioprio_class ==\
                                 IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)      ((bfqq)->entity.ioprio_class ==\
                                 IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)       ((samples) > 80)

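/*
 * The sample counters fed to bfq_sample_valid() (seek_samples,
 * ttime_samples) are exponentially-weighted counters updated as
 * s = (7*s + 256) / 8 and thus saturate at 256; requiring more than 80
 * means a non-negligible history must exist before the corresponding
 * mean value is trusted.
 */
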
/*
 * We regard a request as SYNC, if either it's a read or has the SYNC bit
 * set (in which case it could also be a direct WRITE).
 */
static inline int bfq_bio_sync(struct bio *bio)
{
        if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
                return 1;

        return 0;
}

/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
        if (bfqd->queued != 0) {
                bfq_log(bfqd, "schedule dispatch");
                kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
        }
}

/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
                                      struct request *rq1,
                                      struct request *rq2,
                                      sector_t last)
{
        sector_t s1, s2, d1 = 0, d2 = 0;
        unsigned long back_max;
#define BFQ_RQ1_WRAP    0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP    0x02 /* request 2 wraps */
        unsigned wrap = 0; /* bit mask: requests behind the disk head? */

        if (rq1 == NULL || rq1 == rq2)
                return rq2;
        if (rq2 == NULL)
                return rq1;

        if (rq_is_sync(rq1) && !rq_is_sync(rq2))
                return rq1;
        else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
                return rq2;
        if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
                return rq1;
        else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
                return rq2;

        s1 = blk_rq_pos(rq1);
        s2 = blk_rq_pos(rq2);

        /*
         * By definition, 1KiB is 2 sectors.
         */
        back_max = bfqd->bfq_back_max * 2;

        /*
         * Strict one way elevator _except_ in the case where we allow
         * short backward seeks which are biased as twice the cost of a
         * similar forward seek.
         */
        if (s1 >= last)
                d1 = s1 - last;
        else if (s1 + back_max >= last)
                d1 = (last - s1) * bfqd->bfq_back_penalty;
        else
                wrap |= BFQ_RQ1_WRAP;

        if (s2 >= last)
                d2 = s2 - last;
        else if (s2 + back_max >= last)
                d2 = (last - s2) * bfqd->bfq_back_penalty;
        else
                wrap |= BFQ_RQ2_WRAP;

        /* Found required data */

        /*
         * By doing switch() on the bit mask "wrap" we avoid having to
         * check two variables for all permutations: --> faster!
         */
        switch (wrap) {
        case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
                if (d1 < d2)
                        return rq1;
                else if (d2 < d1)
                        return rq2;
                else {
                        if (s1 >= s2)
                                return rq1;
                        else
                                return rq2;
                }

        case BFQ_RQ2_WRAP:
                return rq1;
        case BFQ_RQ1_WRAP:
                return rq2;
        case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
        default:
                /*
                 * Since both rqs are wrapped,
                 * start with the one that's further behind head
                 * (--> only *one* back seek required),
                 * since back seek takes more time than forward.
                 */
                if (s1 <= s2)
                        return rq1;
                else
                        return rq2;
        }
}

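/*
 * Note on back_max above: with the default bfq_back_max of 16384 KiB it
 * amounts to 32768 sectors (16 MiB), i.e. a request up to that far behind
 * the head is still considered, but at bfq_back_penalty (2x) times the
 * equivalent forward distance.
 */
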
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
                       sector_t sector, struct rb_node **ret_parent,
                       struct rb_node ***rb_link)
{
        struct rb_node **p, *parent;
        struct bfq_queue *bfqq = NULL;

        parent = NULL;
        p = &root->rb_node;
        while (*p) {
                struct rb_node **n;

                parent = *p;
                bfqq = rb_entry(parent, struct bfq_queue, pos_node);

                /*
                 * Sort strictly based on sector. Smallest to the left,
                 * largest to the right.
                 */
                if (sector > blk_rq_pos(bfqq->next_rq))
                        n = &(*p)->rb_right;
                else if (sector < blk_rq_pos(bfqq->next_rq))
                        n = &(*p)->rb_left;
                else
                        break;
                p = n;
                bfqq = NULL;
        }

        *ret_parent = parent;
        if (rb_link)
                *rb_link = p;

        bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
                (long long unsigned)sector,
                bfqq != NULL ? bfqq->pid : 0);

        return bfqq;
}

static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
        struct rb_node **p, *parent;
        struct bfq_queue *__bfqq;

        if (bfqq->pos_root != NULL) {
                rb_erase(&bfqq->pos_node, bfqq->pos_root);
                bfqq->pos_root = NULL;
        }

        if (bfq_class_idle(bfqq))
                return;
        if (!bfqq->next_rq)
                return;

        bfqq->pos_root = &bfqd->rq_pos_tree;
        __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
                        blk_rq_pos(bfqq->next_rq), &parent, &p);
        if (__bfqq == NULL) {
                rb_link_node(&bfqq->pos_node, parent, p);
                rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
        } else
                bfqq->pos_root = NULL;
}

static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
                                        struct bfq_queue *bfqq,
                                        struct request *last)
{
        struct rb_node *rbnext = rb_next(&last->rb_node);
        struct rb_node *rbprev = rb_prev(&last->rb_node);
        struct request *next = NULL, *prev = NULL;

        BUG_ON(RB_EMPTY_NODE(&last->rb_node));

        if (rbprev != NULL)
                prev = rb_entry_rq(rbprev);

        if (rbnext != NULL)
                next = rb_entry_rq(rbnext);
        else {
                rbnext = rb_first(&bfqq->sort_list);
                if (rbnext && rbnext != &last->rb_node)
                        next = rb_entry_rq(rbnext);
        }

        return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}

static void bfq_del_rq_rb(struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
        struct bfq_data *bfqd = bfqq->bfqd;
        const int sync = rq_is_sync(rq);

        BUG_ON(bfqq->queued[sync] == 0);
        bfqq->queued[sync]--;
        bfqd->queued--;

        elv_rb_del(&bfqq->sort_list, rq);

        if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
                if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
                        bfq_del_bfqq_busy(bfqd, bfqq, 1);
                /*
                 * Remove queue from request-position tree as it is empty.
                 */
                if (bfqq->pos_root != NULL) {
                        rb_erase(&bfqq->pos_node, bfqq->pos_root);
                        bfqq->pos_root = NULL;
                }
        }
}

/* see the definition of bfq_async_charge_factor for details */
static inline unsigned long bfq_serv_to_charge(struct request *rq,
                                               struct bfq_queue *bfqq)
{
        return blk_rq_sectors(rq) *
                (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
                bfq_async_charge_factor));
}

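/*
 * Example: a 64-sector sync request is charged exactly 64 sectors of
 * budget, while the same request from an async, non-weight-raised queue
 * is charged 64 * (1 + bfq_async_charge_factor) = 704 sectors, which is
 * how async queues are throttled with respect to sync ones.
 */
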
/*
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
                                 struct bfq_queue *bfqq)
{
        struct bfq_entity *entity = &bfqq->entity;
        struct bfq_service_tree *st = bfq_entity_service_tree(entity);
        struct request *next_rq = bfqq->next_rq;
        unsigned long new_budget;

        if (next_rq == NULL)
                return;

        if (bfqq == bfqd->active_queue)
                /*
                 * In order not to break guarantees, budgets cannot be
                 * changed after an entity has been selected.
                 */
                return;

        BUG_ON(entity->tree != &st->active);
        BUG_ON(entity == entity->sched_data->active_entity);

        new_budget = max_t(unsigned long, bfqq->max_budget,
                           bfq_serv_to_charge(next_rq, bfqq));
        entity->budget = new_budget;
        bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
        bfq_activate_bfqq(bfqd, bfqq);
}

static void bfq_add_rq_rb(struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
        struct bfq_entity *entity = &bfqq->entity;
        struct bfq_data *bfqd = bfqq->bfqd;
        struct request *next_rq, *prev;
        unsigned long old_raising_coeff = bfqq->raising_coeff;
        int idle_for_long_time = bfqq->budget_timeout +
                bfqd->bfq_raising_min_idle_time < jiffies;

        bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
        bfqq->queued[rq_is_sync(rq)]++;
        bfqd->queued++;

        elv_rb_add(&bfqq->sort_list, rq);

        /*
         * Check if this request is a better next-serve candidate.
         */
        prev = bfqq->next_rq;
        next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
        BUG_ON(next_rq == NULL);
        bfqq->next_rq = next_rq;

        /*
         * Adjust priority tree position, if next_rq changes.
         */
        if (prev != bfqq->next_rq)
                bfq_rq_pos_tree_add(bfqd, bfqq);

        if (!bfq_bfqq_busy(bfqq)) {
                int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
                        bfqq->soft_rt_next_start < jiffies;
                entity->budget = max_t(unsigned long, bfqq->max_budget,
                                       bfq_serv_to_charge(next_rq, bfqq));

                if (!bfqd->low_latency)
                        goto add_bfqq_busy;

                /*
                 * If the queue is not being boosted and has been idle
                 * for enough time, start a weight-raising period.
                 */
                if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
                        bfqq->raising_coeff = bfqd->bfq_raising_coeff;
                        bfqq->raising_cur_max_time = idle_for_long_time ?
                                bfqd->bfq_raising_max_time :
                                bfqd->bfq_raising_rt_max_time;
                        bfq_log_bfqq(bfqd, bfqq,
                                     "wrais starting at %llu msec,"
                                     "rais_max_time %u",
                                     bfqq->last_rais_start_finish,
                                     jiffies_to_msecs(bfqq->
                                        raising_cur_max_time));
                } else if (old_raising_coeff > 1) {
                        if (idle_for_long_time)
                                bfqq->raising_cur_max_time =
                                        bfqd->bfq_raising_max_time;
                        else if (bfqq->raising_cur_max_time ==
                                 bfqd->bfq_raising_rt_max_time &&
                                 !soft_rt) {
                                bfqq->raising_coeff = 1;
                                bfq_log_bfqq(bfqd, bfqq,
                                             "wrais ending at %llu msec,"
                                             "rais_max_time %u",
                                             bfqq->last_rais_start_finish,
                                             jiffies_to_msecs(bfqq->
                                                raising_cur_max_time));
                        }
                }
                if (old_raising_coeff != bfqq->raising_coeff)
                        entity->ioprio_changed = 1;
add_bfqq_busy:
                bfq_add_bfqq_busy(bfqd, bfqq);
        } else {
                if (bfqd->low_latency && old_raising_coeff == 1 &&
                    !rq_is_sync(rq) &&
                    bfqq->last_rais_start_finish +
                    bfqd->bfq_raising_min_inter_arr_async < jiffies) {
                        bfqq->raising_coeff = bfqd->bfq_raising_coeff;
                        bfqq->raising_cur_max_time = bfqd->bfq_raising_max_time;

                        entity->ioprio_changed = 1;
                        bfq_log_bfqq(bfqd, bfqq,
                                     "non-idle wrais starting at %llu msec,"
                                     "rais_max_time %u",
                                     bfqq->last_rais_start_finish,
                                     jiffies_to_msecs(bfqq->
                                        raising_cur_max_time));
                }
                bfq_updated_next_req(bfqd, bfqq);
        }

        if (bfqd->low_latency &&
            (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
             idle_for_long_time))
                bfqq->last_rais_start_finish = jiffies;
}

static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
{
        elv_rb_del(&bfqq->sort_list, rq);
        bfqq->queued[rq_is_sync(rq)]--;
        bfqq->bfqd->queued--;
        bfq_add_rq_rb(rq);
}

static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
                                          struct bio *bio)
{
        struct task_struct *tsk = current;
        struct bfq_io_cq *bic;
        struct bfq_queue *bfqq;

        bic = bfq_bic_lookup(bfqd, tsk->io_context);
        if (bic == NULL)
                return NULL;

        bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
        if (bfqq != NULL) {
                sector_t sector = bio->bi_sector + bio_sectors(bio);

                return elv_rb_find(&bfqq->sort_list, sector);
        }

        return NULL;
}

static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;

        bfqd->rq_in_driver++;
        bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
        bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
                (long long unsigned)bfqd->last_position);
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;

        WARN_ON(bfqd->rq_in_driver == 0);
        bfqd->rq_in_driver--;
}

static void bfq_remove_request(struct request *rq)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);
        struct bfq_data *bfqd = bfqq->bfqd;

        if (bfqq->next_rq == rq) {
                bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
                bfq_updated_next_req(bfqd, bfqq);
        }

        list_del_init(&rq->queuelist);
        bfq_del_rq_rb(rq);

        if (rq->cmd_flags & REQ_META) {
                WARN_ON(bfqq->meta_pending == 0);
                bfqq->meta_pending--;
        }
}

static int bfq_merge(struct request_queue *q, struct request **req,
                     struct bio *bio)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct request *__rq;

        __rq = bfq_find_rq_fmerge(bfqd, bio);
        if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
                *req = __rq;
                return ELEVATOR_FRONT_MERGE;
        }

        return ELEVATOR_NO_MERGE;
}

static void bfq_merged_request(struct request_queue *q, struct request *req,
                               int type)
{
        if (type == ELEVATOR_FRONT_MERGE) {
                struct bfq_queue *bfqq = RQ_BFQQ(req);

                bfq_reposition_rq_rb(bfqq, req);
        }
}

static void bfq_merged_requests(struct request_queue *q, struct request *rq,
                                struct request *next)
{
        struct bfq_queue *bfqq = RQ_BFQQ(rq);

        /*
         * Reposition in fifo if next is older than rq.
         */
        if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
            time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
                list_move(&rq->queuelist, &next->queuelist);
                rq_set_fifo_time(rq, rq_fifo_time(next));
        }

        if (bfqq->next_rq == next)
                bfqq->next_rq = rq;

        bfq_remove_request(next);
}

static int bfq_allow_merge(struct request_queue *q, struct request *rq,
                           struct bio *bio)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct bfq_io_cq *bic;
        struct bfq_queue *bfqq;

        /*
         * Disallow merge of a sync bio into an async request.
         */
        if (bfq_bio_sync(bio) && !rq_is_sync(rq))
                return 0;

        /*
         * Lookup the bfqq that this bio will be queued with. Allow
         * merge only if rq is queued there.
         * Queue lock is held here.
         */
        bic = bfq_bic_lookup(bfqd, current->io_context);
        if (bic == NULL)
                return 0;

        bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
        return bfqq == RQ_BFQQ(rq);
}

static void __bfq_set_active_queue(struct bfq_data *bfqd,
                                   struct bfq_queue *bfqq)
{
        if (bfqq != NULL) {
                bfq_mark_bfqq_must_alloc(bfqq);
                bfq_mark_bfqq_budget_new(bfqq);
                bfq_clear_bfqq_fifo_expire(bfqq);

                bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;

                bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
                             bfqq->entity.budget);
        }

        bfqd->active_queue = bfqq;
}

/*
 * Get and set a new active queue for service.
 */
static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
                                              struct bfq_queue *bfqq)
{
        if (!bfqq)
                bfqq = bfq_get_next_queue(bfqd);
        else
                bfq_get_next_queue_forced(bfqd, bfqq);

        __bfq_set_active_queue(bfqd, bfqq);
        return bfqq;
}

static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
                                          struct request *rq)
{
        if (blk_rq_pos(rq) >= bfqd->last_position)
                return blk_rq_pos(rq) - bfqd->last_position;
        else
                return bfqd->last_position - blk_rq_pos(rq);
}

/*
 * Return true if bfqq has no request pending and rq is close enough to
 * bfqd->last_position, or if rq is closer to bfqd->last_position than
 * bfqq->next_rq.
 */
static inline int bfq_rq_close(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                               struct request *rq)
{
        sector_t sdist = bfqq->seek_mean;

        if (!bfq_sample_valid(bfqq->seek_samples))
                sdist = BFQQ_SEEK_THR;

        /* If seek_mean is large, using it as close criteria is meaningless */
        if (sdist > BFQQ_SEEK_THR)
                sdist = BFQQ_SEEK_THR;

        return bfq_dist_from_last(bfqd, rq) <= sdist;
}

static struct bfq_queue *bfqq_close(struct bfq_data *bfqd,
                                    struct bfq_queue *cur_bfqq)
{
        struct rb_root *root = &bfqd->rq_pos_tree;
        struct rb_node *parent, *node;
        struct bfq_queue *__bfqq;
        sector_t sector = bfqd->last_position;

        if (RB_EMPTY_ROOT(root))
                return NULL;

        /*
         * First, if we find a request starting at the end of the last
         * request, choose it.
         */
        __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
        if (__bfqq != NULL)
                return __bfqq;

        /*
         * If the exact sector wasn't found, the parent of the NULL leaf
         * will contain the closest sector (rq_pos_tree sorted by next_request
         * position).
         */
        __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
        if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
                return __bfqq;

        if (blk_rq_pos(__bfqq->next_rq) < sector)
                node = rb_next(&__bfqq->pos_node);
        else
                node = rb_prev(&__bfqq->pos_node);
        if (node == NULL)
                return NULL;

        __bfqq = rb_entry(node, struct bfq_queue, pos_node);
        if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
                return __bfqq;

        return NULL;
}

/*
 * cur_bfqq - passed in so that we don't decide that the current queue
 * is closely cooperating with itself.
 *
 * We are assuming that cur_bfqq has dispatched at least one request,
 * and that bfqd->last_position reflects a position on the disk associated
 * with the I/O issued by cur_bfqq.
 */
static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
                                              struct bfq_queue *cur_bfqq)
{
        struct bfq_queue *bfqq;

        if (bfq_class_idle(cur_bfqq))
                return NULL;
        if (!bfq_bfqq_sync(cur_bfqq))
                return NULL;
        if (BFQQ_SEEKY(cur_bfqq))
                return NULL;

        /* If device has only one backlogged bfq_queue, don't search. */
        if (bfqd->busy_queues == 1)
                return NULL;

        /*
         * We should notice if some of the queues are cooperating, e.g.
         * working closely on the same area of the disk. In that case,
         * we can group them together and don't waste time idling.
         */
        bfqq = bfqq_close(bfqd, cur_bfqq);
        if (bfqq == NULL || bfqq == cur_bfqq)
                return NULL;

        /*
         * Do not merge queues from different bfq_groups.
         */
        if (bfqq->entity.parent != cur_bfqq->entity.parent)
                return NULL;

        /*
         * It only makes sense to merge sync queues.
         */
        if (!bfq_bfqq_sync(bfqq))
                return NULL;
        if (BFQQ_SEEKY(bfqq))
                return NULL;

        /*
         * Do not merge queues of different priority classes.
         */
        if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
                return NULL;

        return bfqq;
}

/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget.
 */
static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
{
        return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
                bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32).
 */
static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
{
        return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
                bfqd->bfq_max_budget / 32;
}

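/*
 * The 194 threshold used above refers to bfqd->budgets_assigned, which
 * __bfq_set_active_queue() updates as b = (7*b + 256) / 8 and which thus
 * saturates at 256; starting from zero it crosses 194 after roughly a
 * dozen budget assignments, i.e., once the peak-rate machinery has had a
 * chance to produce a meaningful bfq_max_budget estimate.
 */
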
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq = bfqd->active_queue;
        struct bfq_io_cq *bic;
        unsigned long sl;

        WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));

        /* Idling is disabled, either manually or by past process history. */
        if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
                return;

        /* Tasks have exited, don't wait. */
        bic = bfqd->active_bic;
        if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
                return;

        bfq_mark_bfqq_wait_request(bfqq);

        /*
         * We don't want to idle for seeks, but we do want to allow
         * fair distribution of slice time for a process doing back-to-back
         * seeks. So allow a little bit of time for him to submit a new rq.
         *
         * To prevent processes with (partly) seeky workloads from
         * being too ill-treated, grant them a small fraction of the
         * assigned budget before reducing the waiting time to
         * BFQ_MIN_TT. This happened to help reduce latency.
         */
        sl = bfqd->bfq_slice_idle;
        if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
            bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
            bfqq->raising_coeff == 1)
                sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
        else if (bfqq->raising_coeff > 1)
                sl = sl * 3;
        bfqd->last_idling_start = ktime_get();
        mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
        bfq_log(bfqd, "arm idle: %u/%u ms",
                jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
}

/*
 * Set the maximum time for the active queue to consume its
 * budget. This prevents seeky processes from lowering the disk
 * throughput (always guaranteed with a time slice scheme as in CFQ).
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq = bfqd->active_queue;
        unsigned int timeout_coeff =
                bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time ?
                1 : (bfqq->entity.weight / bfqq->entity.orig_weight);

        bfqd->last_budget_start = ktime_get();

        bfq_clear_bfqq_budget_new(bfqq);
        bfqq->budget_timeout = jiffies +
                bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;

        bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
                jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
                timeout_coeff));
}

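/*
 * Note on timeout_coeff above: a queue being weight-raised for
 * interactiveness has weight == orig_weight * raising_coeff, so its
 * budget timeout is stretched by the same factor; soft real-time
 * raising (raising_cur_max_time == bfq_raising_rt_max_time) keeps the
 * plain timeout instead.
 */
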
/*
 * Move request from internal lists to the request queue dispatch list.
 */
static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct bfq_queue *bfqq = RQ_BFQQ(rq);

        bfq_remove_request(rq);
        bfqq->dispatched++;
        elv_dispatch_sort(q, rq);

        if (bfq_bfqq_sync(bfqq))
                bfqd->sync_flight++;
}

/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
{
        struct request *rq = NULL;

        if (bfq_bfqq_fifo_expire(bfqq))
                return NULL;

        bfq_mark_bfqq_fifo_expire(bfqq);

        if (list_empty(&bfqq->fifo))
                return NULL;

        rq = rq_entry_fifo(bfqq->fifo.next);

        if (time_before(jiffies, rq_fifo_time(rq)))
                return NULL;

        return rq;
}

/*
 * Must be called with the queue_lock held.
 */
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
        int process_refs, io_refs;

        io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
        process_refs = atomic_read(&bfqq->ref) - io_refs;
        BUG_ON(process_refs < 0);
        return process_refs;
}

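/*
 * Example: a bfqq referenced by one process that currently has three
 * requests allocated on it has ref == 4, io_refs == 3 and hence
 * process_refs == 1.
 */
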
static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
        int process_refs, new_process_refs;
        struct bfq_queue *__bfqq;

        /*
         * If there are no process references on the new_bfqq, then it is
         * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
         * may have dropped their last reference (not just their last process
         * reference).
         */
        if (!bfqq_process_refs(new_bfqq))
                return;

        /* Avoid a circular list and skip interim queue merges. */
        while ((__bfqq = new_bfqq->new_bfqq)) {
                if (__bfqq == bfqq)
                        return;
                new_bfqq = __bfqq;
        }

        process_refs = bfqq_process_refs(bfqq);
        new_process_refs = bfqq_process_refs(new_bfqq);
        /*
         * If the process for the bfqq has gone away, there is no
         * sense in merging the queues.
         */
        if (process_refs == 0 || new_process_refs == 0)
                return;

        /*
         * Merge in the direction of the lesser amount of work.
         */
        if (new_process_refs >= process_refs) {
                bfqq->new_bfqq = new_bfqq;
                atomic_add(process_refs, &new_bfqq->ref);
        } else {
                new_bfqq->new_bfqq = bfqq;
                atomic_add(new_process_refs, &bfqq->ref);
        }
        bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
                     new_bfqq->pid);
}

static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
        struct bfq_entity *entity = &bfqq->entity;
        return entity->budget - entity->service;
}

static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
        BUG_ON(bfqq != bfqd->active_queue);

        __bfq_bfqd_reset_active(bfqd);

        if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
                bfq_del_bfqq_busy(bfqd, bfqq, 1);
                /*
                 * Overloading budget_timeout field to store when
                 * the queue remains with no backlog, used by
                 * the weight-raising mechanism.
                 */
                bfqq->budget_timeout = jiffies;
        } else {
                bfq_activate_bfqq(bfqd, bfqq);
                /*
                 * Resort priority tree of potential close cooperators.
                 */
                bfq_rq_pos_tree_add(bfqd, bfqq);
        }

        /*
         * If this bfqq is shared between multiple processes, check
         * to make sure that those processes are still issuing I/Os
         * within the mean seek distance. If not, it may be time to
         * break the queues apart again.
         */
        if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
                bfq_mark_bfqq_split_coop(bfqq);
}

/*
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget. See the body for detailed
 * comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
                                     struct bfq_queue *bfqq,
                                     enum bfqq_expiration reason)
{
        struct request *next_rq;
        unsigned long budget, min_budget;

        budget = bfqq->max_budget;
        min_budget = bfq_min_budget(bfqd);

        BUG_ON(bfqq != bfqd->active_queue);

        bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
                bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
        bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
                budget, bfq_min_budget(bfqd));
        bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
                bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));

        if (bfq_bfqq_sync(bfqq)) {
                switch (reason) {
                /*
                 * Caveat: in all the following cases we trade latency
                 * for throughput.
                 */
                case BFQ_BFQQ_TOO_IDLE:
                        /*
                         * This is the only case where we may reduce
                         * the budget: if there is no request of the
                         * process still waiting for completion, then
                         * we assume (tentatively) that the timer has
                         * expired because the batch of requests of
                         * the process could have been served with a
                         * smaller budget. Hence, betting that
                         * process will behave in the same way when it
                         * becomes backlogged again, we reduce its
                         * next budget. As long as we guess right,
                         * this budget cut reduces the latency
                         * experienced by the process.
                         *
                         * However, if there are still outstanding
                         * requests, then the process may have not yet
                         * issued its next request just because it is
                         * still waiting for the completion of some of
                         * the still outstanding ones. So in this
                         * subcase we do not reduce its budget, on the
                         * contrary we increase it to possibly boost
                         * the throughput, as discussed in the
                         * comments to the BUDGET_TIMEOUT case.
                         */
                        if (bfqq->dispatched > 0) /* still outstanding reqs */
                                budget = min(budget * 2, bfqd->bfq_max_budget);
                        else {
                                if (budget > 5 * min_budget)
                                        budget -= 4 * min_budget;
                                else
                                        budget = min_budget;
                        }
                        break;
                case BFQ_BFQQ_BUDGET_TIMEOUT:
                        /*
                         * We double the budget here because: 1) it
                         * gives the chance to boost the throughput if
                         * this is not a seeky process (which may have
                         * bumped into this timeout because of, e.g.,
                         * ZBR), 2) together with charge_full_budget
                         * it helps give seeky processes higher
                         * timestamps, and hence be served less
                         * frequently.
                         */
                        budget = min(budget * 2, bfqd->bfq_max_budget);
                        break;
                case BFQ_BFQQ_BUDGET_EXHAUSTED:
                        /*
                         * The process still has backlog, and did not
                         * let either the budget timeout or the disk
                         * idling timeout expire. Hence it is not
                         * seeky, has a short thinktime and may be
                         * happy with a higher budget too. So
                         * definitely increase the budget of this good
                         * candidate to boost the disk throughput.
                         */
                        budget = min(budget * 4, bfqd->bfq_max_budget);
                        break;
                case BFQ_BFQQ_NO_MORE_REQUESTS:
                        /*
                         * Leave the budget unchanged.
                         */
                default:
                        return;
                }
        } else /* async queue */
                /* async queues get always the maximum possible budget
                 * (their ability to dispatch is limited by
                 * @bfqd->bfq_max_budget_async_rq).
                 */
                budget = bfqd->bfq_max_budget;

        bfqq->max_budget = budget;

        if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
            bfqq->max_budget > bfqd->bfq_max_budget)
                bfqq->max_budget = bfqd->bfq_max_budget;

        /*
         * Make sure that we have enough budget for the next request.
         * Since the finish time of the bfqq must be kept in sync with
         * the budget, be sure to call __bfq_bfqq_expire() after the
         * update.
         */
        next_rq = bfqq->next_rq;
        if (next_rq != NULL)
                bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
                                            bfq_serv_to_charge(next_rq, bfqq));
        else
                bfqq->entity.budget = bfqq->max_budget;

        bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
                next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
                bfqq->entity.budget);
}

static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
{
        unsigned long max_budget;

        /*
         * The max_budget calculated when autotuning is equal to the
         * amount of sectors transferred in timeout_sync at the
         * estimated peak rate.
         */
        max_budget = (unsigned long)(peak_rate * 1000 *
                                     timeout >> BFQ_RATE_SHIFT);

        return max_budget;
}

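/*
 * Example: with a peak rate of ~0.2 sectors/usec (~100 MB/s, stored as
 * ~13100 in fixed point) and the default 125 ms sync timeout, the autotuned
 * max_budget is about 13100 * 1000 * 125 >> 16 ~= 25000 sectors (~12 MiB).
 */
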
/*
 * In addition to updating the peak rate, checks whether the process
 * is "slow", and returns 1 if so. This slow flag is used, in addition
 * to the budget timeout, to reduce the amount of service provided to
 * seeky processes, and hence reduce their chances to lower the
 * throughput. See the code for more details.
 */
static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                                int compensate, enum bfqq_expiration reason)
{
        u64 bw, usecs, expected, timeout;
        ktime_t delta;
        int update = 0;

        if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
                return 0;

        delta = compensate ? bfqd->last_idling_start : ktime_get();
        delta = ktime_sub(delta, bfqd->last_budget_start);
        usecs = ktime_to_us(delta);

        /* Don't trust short/unrealistic values. */
        if (usecs < 100 || usecs >= LONG_MAX)
                return 0;

        /*
         * Calculate the bandwidth for the last slice. We use a 64 bit
         * value to store the peak rate, in sectors per usec in fixed
         * point math. We do so to have enough precision in the estimate
         * and to avoid overflows.
         */
        bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
        do_div(bw, (unsigned long)usecs);

        timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

        /*
         * Use only long (> 20ms) intervals to filter out spikes for
         * the peak rate estimation.
         */
        if (usecs > 20000) {
                if (bw > bfqd->peak_rate ||
                    (!BFQQ_SEEKY(bfqq) &&
                     reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
                        bfq_log(bfqd, "measured bw =%llu", bw);
                        /*
                         * To smooth oscillations use a low-pass filter with
                         * alpha=7/8, i.e.,
                         * new_rate = (7/8) * old_rate + (1/8) * bw
                         */
                        do_div(bw, 8);
                        bfqd->peak_rate *= 7;
                        do_div(bfqd->peak_rate, 8);
                        bfqd->peak_rate += bw;
                        update = 1;
                        bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
                }

                update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;

                if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
                        bfqd->peak_rate_samples++;

                if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
                    update && bfqd->bfq_user_max_budget == 0) {
                        bfqd->bfq_max_budget =
                                bfq_calc_max_budget(bfqd->peak_rate, timeout);
                        bfq_log(bfqd, "new max_budget=%lu",
                                bfqd->bfq_max_budget);
                }
        }

        /*
         * If the process has been served for a too short time
         * interval to let its possible sequential accesses prevail on
         * the initial seek time needed to move the disk head on the
         * first sector it requested, then give the process a chance
         * and for the moment return false.
         */
        if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
                return 0;

        /*
         * A process is considered ``slow'' (i.e., seeky, so that we
         * cannot treat it fairly in the service domain, as it would
         * slow down too much the other processes) if, when a slice
         * ends for whatever reason, it has received service at a
         * rate that would not be high enough to complete the budget
         * before the budget timeout expiration.
         */
        expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;

        /*
         * Caveat: processes doing IO in the slower disk zones will
         * tend to be slow(er) even if not seeky. And the estimated
         * peak rate will actually be an average over the disk
         * surface. Hence, to not be too harsh with unlucky processes,
         * we keep a budget/3 margin of safety before declaring a
         * process slow.
         */
        return expected > (4 * bfqq->entity.budget) / 3;
}

/*
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated to the queue is slow (i.e., seeky), or in
 * case of budget timeout, or, finally, if it is async, we
 * artificially charge it an entire budget (independently of the
 * actual service it received). As a consequence, the queue will get
 * higher timestamps than the correct ones upon reactivation, and
 * hence it will be rescheduled as if it had received more service
 * than what it actually received. In the end, this class of processes
 * will receive less service in proportion to how slowly they consume
 * their budgets (and hence how seriously they tend to lower the
 * throughput).
 *
 * In contrast, when a queue expires because it has been idling for
 * too much or because it exhausted its budget, we do not touch the
 * amount of service it has received. Hence when the queue will be
 * reactivated and its timestamps updated, the latter will be in sync
 * with the actual service received by the queue until expiration.
 *
 * Charging a full budget to the first type of queues and the exact
 * service to the others has the effect of using the WF2Q+ policy to
 * schedule the former on a timeslice basis, without violating the
 * service domain guarantees of the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
                            struct bfq_queue *bfqq,
                            int compensate,
                            enum bfqq_expiration reason)
{
        int slow;

        BUG_ON(bfqq != bfqd->active_queue);

        /* Update disk peak rate for autotuning and check whether the
         * process is slow (see bfq_update_peak_rate).
         */
        slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);

        /*
         * As above explained, 'punish' slow (i.e., seeky), timed-out
         * and async queues, to favor sequential sync workloads.
         *
         * Processes doing IO in the slower disk zones will tend to be
         * slow(er) even if not seeky. Hence, since the estimated peak
         * rate is actually an average over the disk surface, these
         * processes may timeout just for bad luck. To avoid punishing
         * them we do not charge a full budget to a process that
         * succeeded in consuming at least 2/3 of its budget.
         */
        if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
                     bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
                bfq_bfqq_charge_full_budget(bfqq);

        if (bfqd->low_latency && bfqq->raising_coeff == 1)
                bfqq->last_rais_start_finish = jiffies;

        if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
                if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
                        bfqq->soft_rt_next_start =
                                jiffies +
                                HZ * bfqq->entity.service /
                                bfqd->bfq_raising_max_softrt_rate;
                else
                        bfqq->soft_rt_next_start = -1; /* infinity */
        }
        bfq_log_bfqq(bfqd, bfqq,
                "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
                bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

        /* Increase, decrease or leave budget unchanged according to reason */
        __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
        __bfq_bfqq_expire(bfqd, bfqq);
}

/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
        if (bfq_bfqq_budget_new(bfqq))
                return 0;

        if (time_before(jiffies, bfqq->budget_timeout))
                return 0;

        return 1;
}

/*
 * If we expire a queue that is waiting for the arrival of a new
 * request, we may prevent the fictitious timestamp backshifting that
 * allows the guarantees of the queue to be preserved (see [1] for
 * this tricky aspect). Hence we return true only if this condition
 * does not hold, or if the queue is slow enough to deserve only to be
 * kicked off for preserving a high throughput.
 */
static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
        bfq_log_bfqq(bfqq->bfqd, bfqq,
                "may_budget_timeout: wr %d left %d timeout %d",
                bfq_bfqq_wait_request(bfqq),
                        bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
                bfq_bfqq_budget_timeout(bfqq));

        return (!bfq_bfqq_wait_request(bfqq) ||
                bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
                &&
                bfq_bfqq_budget_timeout(bfqq);
}

/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq, *new_bfqq = NULL;
        struct request *next_rq;
        enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;

        bfqq = bfqd->active_queue;
        if (bfqq == NULL)
                goto new_queue;

        bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");

        /*
         * If another queue has a request waiting within our mean seek
         * distance, let it run. The expire code will check for close
         * cooperators and put the close queue at the front of the
         * service tree. If possible, merge the expiring queue with the
         * new bfqq.
         */
        new_bfqq = bfq_close_cooperator(bfqd, bfqq);
        if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
                bfq_setup_merge(bfqq, new_bfqq);

        if (bfq_may_expire_for_budg_timeout(bfqq))
                goto expire;

        next_rq = bfqq->next_rq;
        /*
         * If bfqq has requests queued and it has enough budget left to
         * serve them, keep the queue, otherwise expire it.
         */
        if (next_rq != NULL) {
                if (bfq_serv_to_charge(next_rq, bfqq) >
                        bfq_bfqq_budget_left(bfqq)) {
                        reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
                        goto expire;
                } else {
                        /*
                         * The idle timer may be pending because we may not
                         * disable disk idling even when a new request arrives.
                         */
                        if (timer_pending(&bfqd->idle_slice_timer)) {
                                /*
                                 * If we get here: 1) at least a new request
                                 * has arrived but we have not disabled the
                                 * timer because the request was too small,
                                 * 2) then the block layer has unplugged the
                                 * device, causing the dispatch to be invoked.
                                 *
                                 * Since the device is unplugged, now the
                                 * requests are probably large enough to
                                 * provide a reasonable throughput.
                                 * So we disable idling.
                                 */
                                bfq_clear_bfqq_wait_request(bfqq);
                                del_timer(&bfqd->idle_slice_timer);
                        }
                        if (new_bfqq == NULL)
                                goto keep_queue;
                        else
                                goto expire;
                }
        }

        /*
         * No requests pending. If there is no cooperator, and the active
         * queue still has requests in flight or is idling for a new request,
         * then keep it.
         */
        if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
            (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq)))) {
                bfqq = NULL;
                goto keep_queue;
        } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
                /*
                 * Expiring the queue because there is a close cooperator,
                 * cancel timer.
                 */
                bfq_clear_bfqq_wait_request(bfqq);
                del_timer(&bfqd->idle_slice_timer);
        }

        reason = BFQ_BFQQ_NO_MORE_REQUESTS;
expire:
        bfq_bfqq_expire(bfqd, bfqq, 0, reason);
new_queue:
        bfqq = bfq_set_active_queue(bfqd, new_bfqq);
        bfq_log(bfqd, "select_queue: new queue %d returned",
                bfqq != NULL ? bfqq->pid : 0);
keep_queue:
        return bfqq;
}

static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
        if (bfqq->raising_coeff > 1) { /* queue is being boosted */
                struct bfq_entity *entity = &bfqq->entity;

                bfq_log_bfqq(bfqd, bfqq,
                        "raising period dur %u/%u msec, "
                        "old raising coeff %u, w %d(%d)",
                        jiffies_to_msecs(jiffies -
                                bfqq->last_rais_start_finish),
                        jiffies_to_msecs(bfqq->raising_cur_max_time),
                        bfqq->raising_coeff,
                        bfqq->entity.weight, bfqq->entity.orig_weight);

                BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
                        entity->orig_weight * bfqq->raising_coeff);
                if (entity->ioprio_changed)
                        bfq_log_bfqq(bfqd, bfqq,
                                "WARN: pending prio change");
                /*
                 * If too much time has elapsed from the beginning
                 * of this weight-raising period and the process is not
                 * soft real-time, stop it.
                 */
                if (jiffies - bfqq->last_rais_start_finish >
                         bfqq->raising_cur_max_time) {
                        int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
                                bfqq->soft_rt_next_start < jiffies;

                        bfqq->last_rais_start_finish = jiffies;
                        if (soft_rt)
                                bfqq->raising_cur_max_time =
                                        bfqd->bfq_raising_rt_max_time;
                        else {
                                bfq_log_bfqq(bfqd, bfqq,
                                        "wrais ending at %llu msec,"
                                        "rais_max_time %u",
                                        bfqq->last_rais_start_finish,
                                        jiffies_to_msecs(bfqq->
                                                raising_cur_max_time));
                                bfqq->raising_coeff = 1;
                                entity->ioprio_changed = 1;
                                __bfq_entity_update_weight_prio(
                                        bfq_entity_service_tree(entity),
                                        entity);
                        }
                }
        }
}

/*
 * Dispatch one request from bfqq, moving it to the request queue
 * dispatch list.
 */
static int bfq_dispatch_request(struct bfq_data *bfqd,
                                struct bfq_queue *bfqq)
{
        int dispatched = 0;
        struct request *rq;
        unsigned long service_to_charge;

        BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));

        /* Follow expired path, else get first next available. */
        rq = bfq_check_fifo(bfqq);
        if (rq == NULL)
                rq = bfqq->next_rq;
        service_to_charge = bfq_serv_to_charge(rq, bfqq);

        if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
                /*
                 * This may happen if the next rq is chosen
                 * in fifo order instead of sector order.
                 * The budget is properly dimensioned
                 * to be always sufficient to serve the next request
                 * only if it is chosen in sector order. The reason is
                 * that it would be quite inefficient and little useful
                 * to always make sure that the budget is large enough
                 * to serve even the possible next rq in fifo order.
                 * In fact, requests are seldom served in fifo order.
                 *
                 * Expire the queue for budget exhaustion, and
                 * make sure that the next act_budget is enough
                 * to serve the next request, even if it comes
                 * from the fifo expired path.
                 */
                bfqq->next_rq = rq;
                /*
                 * Since this dispatch is failed, make sure that
                 * a new one will be performed.
                 */
                if (!bfqd->rq_in_driver)
                        bfq_schedule_dispatch(bfqd);
                goto expire;
        }

        /* Finally, insert request into driver dispatch list. */
        bfq_bfqq_served(bfqq, service_to_charge);
        bfq_dispatch_insert(bfqd->queue, rq);

        update_raising_data(bfqd, bfqq);

        bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
                        "budg left %lu",
                        blk_rq_sectors(rq),
                        (long long unsigned)blk_rq_pos(rq),
                        bfq_bfqq_budget_left(bfqq));

        dispatched++;

        if (bfqd->active_bic == NULL) {
                atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
                bfqd->active_bic = RQ_BIC(rq);
        }

        if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
            dispatched >= bfqd->bfq_max_budget_async_rq) ||
            bfq_class_idle(bfqq)))
                goto expire;

        return dispatched;

expire:
        bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
        return dispatched;
}

static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
{
        int dispatched = 0;

        while (bfqq->next_rq != NULL) {
                bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
                dispatched++;
        }

        BUG_ON(!list_empty(&bfqq->fifo));
        return dispatched;
}

/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
static int bfq_forced_dispatch(struct bfq_data *bfqd)
{
        struct bfq_queue *bfqq, *n;
        struct bfq_service_tree *st;
        int dispatched = 0;

        bfqq = bfqd->active_queue;
        if (bfqq != NULL)
                __bfq_bfqq_expire(bfqd, bfqq);

        /*
         * Loop through classes, and be careful to leave the scheduler
         * in a consistent state, as feedback mechanisms and vtime
         * updates cannot be disabled during the process.
         */
        list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
                st = bfq_entity_service_tree(&bfqq->entity);

                dispatched += __bfq_forced_dispatch_bfqq(bfqq);
                bfqq->max_budget = bfq_max_budget(bfqd);

                bfq_forget_idle(st);
        }

        BUG_ON(bfqd->busy_queues != 0);

        return dispatched;
}

static int bfq_dispatch_requests(struct request_queue *q, int force)
{
        struct bfq_data *bfqd = q->elevator->elevator_data;
        struct bfq_queue *bfqq;
        int max_dispatch;

        bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
        if (bfqd->busy_queues == 0)
                return 0;

        if (unlikely(force))
                return bfq_forced_dispatch(bfqd);

        if ((bfqq = bfq_select_queue(bfqd)) == NULL)
                return 0;

        max_dispatch = bfqd->bfq_quantum;
        if (bfq_class_idle(bfqq))
                max_dispatch = 1;

        if (!bfq_bfqq_sync(bfqq))
                max_dispatch = bfqd->bfq_max_budget_async_rq;

        if (bfqq->dispatched >= max_dispatch) {
                if (bfqd->busy_queues > 1)
                        return 0;
                if (bfqq->dispatched >= 4 * max_dispatch)
                        return 0;
        }

        if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
                return 0;

        bfq_clear_bfqq_wait_request(bfqq);
        BUG_ON(timer_pending(&bfqd->idle_slice_timer));

        if (!bfq_dispatch_request(bfqd, bfqq))
                return 0;

        bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
                     "(max_disp %d)", bfqq->pid, max_dispatch);

        return 1;
}

/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Queue lock must be held here.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
        struct bfq_data *bfqd = bfqq->bfqd;

        BUG_ON(atomic_read(&bfqq->ref) <= 0);

        bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
                     atomic_read(&bfqq->ref));
        if (!atomic_dec_and_test(&bfqq->ref))
                return;

        BUG_ON(rb_first(&bfqq->sort_list) != NULL);
        BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
        BUG_ON(bfqq->entity.tree != NULL);
        BUG_ON(bfq_bfqq_busy(bfqq));
        BUG_ON(bfqd->active_queue == bfqq);

        bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);

        kmem_cache_free(bfq_pool, bfqq);
}

static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
        struct bfq_queue *__bfqq, *next;

        /*
         * If this queue was scheduled to merge with another queue, be
         * sure to drop the reference taken on that queue (and others in
         * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
         */
        __bfqq = bfqq->new_bfqq;
        while (__bfqq) {
                if (__bfqq == bfqq) {
                        WARN(1, "bfqq->new_bfqq loop detected.\n");
                        break;
                }
                next = __bfqq->new_bfqq;
                bfq_put_queue(__bfqq);
                __bfqq = next;
        }
}

static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
        if (bfqq == bfqd->active_queue) {
                __bfq_bfqq_expire(bfqd, bfqq);
                bfq_schedule_dispatch(bfqd);
        }

        bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
                     atomic_read(&bfqq->ref));

        bfq_put_cooperator(bfqq);

        bfq_put_queue(bfqq);
}

static void bfq_init_icq(struct io_cq *icq)
{
        struct bfq_io_cq *bic = icq_to_bic(icq);

        bic->ttime.last_end_request = jiffies;
}

static void bfq_exit_icq(struct io_cq *icq)
{
        struct bfq_io_cq *bic = icq_to_bic(icq);
        struct bfq_data *bfqd = bic_to_bfqd(bic);

        if (bic->bfqq[BLK_RW_ASYNC]) {
                bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
                bic->bfqq[BLK_RW_ASYNC] = NULL;
        }

        if (bic->bfqq[BLK_RW_SYNC]) {
                bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
                bic->bfqq[BLK_RW_SYNC] = NULL;
        }
}

/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc)
{
        struct task_struct *tsk = current;
        int ioprio_class;

        if (!bfq_bfqq_prio_changed(bfqq))
                return;

        ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
        switch (ioprio_class) {
        default:
                printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
        case IOPRIO_CLASS_NONE:
                /*
                 * No prio set, inherit CPU scheduling settings.
                 */
                bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
                bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
                break;
        case IOPRIO_CLASS_RT:
                bfqq->entity.new_ioprio = task_ioprio(ioc);
                bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
                break;
        case IOPRIO_CLASS_BE:
                bfqq->entity.new_ioprio = task_ioprio(ioc);
                bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
                break;
        case IOPRIO_CLASS_IDLE:
                bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
                bfqq->entity.new_ioprio = 7;
                bfq_clear_bfqq_idle_window(bfqq);
                break;
        }

        bfqq->entity.ioprio_changed = 1;

        /*
         * Keep track of original prio settings in case we have to temporarily
         * elevate the priority of this queue.
         */
        bfqq->org_ioprio = bfqq->entity.new_ioprio;
        bfq_clear_bfqq_prio_changed(bfqq);
}

static void bfq_changed_ioprio(struct io_context *ioc,
                               struct bfq_io_cq *bic)
{
        struct bfq_data *bfqd;
        struct bfq_queue *bfqq, *new_bfqq;
        struct bfq_group *bfqg;
        unsigned long uninitialized_var(flags);

        bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
                                   &flags);
        if (unlikely(bfqd == NULL))
                return;

        bfqq = bic->bfqq[BLK_RW_ASYNC];
        if (bfqq != NULL) {
                bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
                                    sched_data);
                new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
                                         GFP_ATOMIC);
                if (new_bfqq != NULL) {
                        bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
                        bfq_log_bfqq(bfqd, bfqq,
                                     "changed_ioprio: bfqq %p %d",
                                     bfqq, atomic_read(&bfqq->ref));
                        bfq_put_queue(bfqq);
                }
        }

        bfqq = bic->bfqq[BLK_RW_SYNC];
        if (bfqq != NULL)
                bfq_mark_bfqq_prio_changed(bfqq);

        bfq_put_bfqd_unlock(bfqd, &flags);
}

static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                          pid_t pid, int is_sync)
{
        RB_CLEAR_NODE(&bfqq->entity.rb_node);
        INIT_LIST_HEAD(&bfqq->fifo);

        atomic_set(&bfqq->ref, 0);
        bfqq->bfqd = bfqd;

        bfq_mark_bfqq_prio_changed(bfqq);

        if (is_sync) {
                if (!bfq_class_idle(bfqq))
                        bfq_mark_bfqq_idle_window(bfqq);
                bfq_mark_bfqq_sync(bfqq);
        }

        /* Tentative initial value to trade off between thr and lat */
        bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
        bfqq->pid = pid;

        bfqq->raising_coeff = 1;
        bfqq->last_rais_start_finish = 0;
        bfqq->soft_rt_next_start = -1;
}

static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
                                              struct bfq_group *bfqg,
                                              int is_sync,
                                              struct io_context *ioc,
                                              gfp_t gfp_mask)
{
        struct bfq_queue *bfqq, *new_bfqq = NULL;
        struct bfq_io_cq *bic;

retry:
        bic = bfq_bic_lookup(bfqd, ioc);
        /* bic always exists here */
        bfqq = bic_to_bfqq(bic, is_sync);

        /*
         * Always try a new alloc if we fall back to the OOM bfqq
         * originally, since it should just be a temporary situation.
         */
        if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
                bfqq = NULL;
                if (new_bfqq != NULL) {
                        bfqq = new_bfqq;
                        new_bfqq = NULL;
                } else if (gfp_mask & __GFP_WAIT) {
                        spin_unlock_irq(bfqd->queue->queue_lock);
                        new_bfqq = kmem_cache_alloc_node(bfq_pool,
                                        gfp_mask | __GFP_ZERO,
                                        bfqd->queue->node);
                        spin_lock_irq(bfqd->queue->queue_lock);
                        if (new_bfqq != NULL)
                                goto retry;
                } else {
                        bfqq = kmem_cache_alloc_node(bfq_pool,
                                        gfp_mask | __GFP_ZERO,
                                        bfqd->queue->node);
                }

                if (bfqq != NULL) {
                        bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
                        bfq_log_bfqq(bfqd, bfqq, "allocated");
                } else {
                        bfqq = &bfqd->oom_bfqq;
                        bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
                }

                bfq_init_prio_data(bfqq, ioc);
                bfq_init_entity(&bfqq->entity, bfqg);
        }

        if (new_bfqq != NULL)
                kmem_cache_free(bfq_pool, new_bfqq);

        return bfqq;
}

static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
                                               struct bfq_group *bfqg,
                                               int ioprio_class, int ioprio)
{
        switch (ioprio_class) {
        case IOPRIO_CLASS_RT:
                return &bfqg->async_bfqq[0][ioprio];
        case IOPRIO_CLASS_BE:
                return &bfqg->async_bfqq[1][ioprio];
        case IOPRIO_CLASS_IDLE:
                return &bfqg->async_idle_bfqq;
        default:
                BUG();
        }
}

static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
                                       struct bfq_group *bfqg, int is_sync,
                                       struct io_context *ioc, gfp_t gfp_mask)
{
        const int ioprio = task_ioprio(ioc);
        const int ioprio_class = task_ioprio_class(ioc);
        struct bfq_queue **async_bfqq = NULL;
        struct bfq_queue *bfqq = NULL;

        if (!is_sync) {
                async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
                                                  ioprio);
                bfqq = *async_bfqq;
        }

        if (bfqq == NULL)
                bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);

        /*
         * Pin the queue now that it's allocated, scheduler exit will prune it.
         */
        if (!is_sync && *async_bfqq == NULL) {
                atomic_inc(&bfqq->ref);
                bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
                             bfqq, atomic_read(&bfqq->ref));
                *async_bfqq = bfqq;
        }

        atomic_inc(&bfqq->ref);
        bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
                     atomic_read(&bfqq->ref));
        return bfqq;
}

static void bfq_update_io_thinktime(struct bfq_data *bfqd,
                                    struct bfq_io_cq *bic)
{
        unsigned long elapsed = jiffies - bic->ttime.last_end_request;
        unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);

        bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
        bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
        bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
                                bic->ttime.ttime_samples;
}

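/*
 * Both think-time statistics above are exponentially weighted moving
 * averages with a 1/8 weight for the newest observation: ttime_samples
 * counts up towards 256, while ttime_mean tracks the recent average time
 * between the completion of a request and the arrival of the next one.
 */
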
static void bfq_update_io_seektime(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct request *rq)
{
	sector_t sdist;
	u64 total;

	if (bfqq->last_request_pos < blk_rq_pos(rq))
		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
	else
		sdist = bfqq->last_request_pos - blk_rq_pos(rq);

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc.
	 */
	if (bfqq->seek_samples == 0) /* first request, not really a seek */
		sdist = 0;
	else if (bfqq->seek_samples <= 60) /* second & third seek */
		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);

	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
	total = bfqq->seek_total + (bfqq->seek_samples/2);
	do_div(total, bfqq->seek_samples);
	if (bfq_bfqq_coop(bfqq)) {
		/*
		 * If the mean seektime increases for a (non-seeky) shared
		 * queue, some cooperator is likely to be idling too much.
		 * On the contrary, if it decreases, some cooperator has
		 * probably woken up.
		 */
		if ((sector_t)total < bfqq->seek_mean)
			bfq_mark_bfqq_some_coop_idle(bfqq);
		else if ((sector_t)total > bfqq->seek_mean)
			bfq_clear_bfqq_some_coop_idle(bfqq);
	}
	bfqq->seek_mean = (sector_t)total;

	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
		     (u64)bfqq->seek_mean);
}
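/*
 * Like the think time above, seek_mean is a fixed-point EWMA: 7/8 of the
 * old value plus 1/8 of the (clamped) new distance, scaled by 256.  All
 * distances are in 512-byte sectors, so the early clamp of
 * seek_mean * 4 + 2*1024*1024 allows jumps of up to about 1 GiB, while
 * the steady-state clamp of seek_mean * 4 + 2*1024*64 allows about
 * 64 MiB.  For example, with seek_mean = 1000 sectors a sudden
 * 10,000,000-sector jump is first clamped to 4*1000 + 131072 = 135072
 * sectors before being averaged in.
 */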
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->raising_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->raising_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
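/*
 * Summing up: as long as idling is not disabled altogether
 * (bfq_slice_idle == 0) and the io_context still has running tasks, the
 * idle window stays enabled for sync queues whose measured think time
 * does not exceed bfq_slice_idle (HZ/125 by default, about 8 ms) and
 * that are not both seeky and backed by a queueing (hw_tag) drive;
 * queues whose weight is currently raised (raising_coeff > 1) keep
 * their idle window in either case.
 */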
/*
 * Called when a new fs request (rq) is added to bfqq.  Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bic);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
		     (long long unsigned)bfqq->seek_mean);

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->active_queue) {
		/*
		 * If there is just this request queued and the request
		 * is small, just exit.
		 * In this way, if the disk is being idled to wait for a new
		 * request from the active queue, we avoid unplugging the
		 * device now.
		 *
		 * By doing so, we avoid committing the disk to serving just
		 * a small request; instead, we wait for the block layer to
		 * decide when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device will
		 * be unplugged and larger requests will be dispatched.
		 */
		if (bfqq->queued[rq_is_sync(rq)] == 1 &&
		    blk_rq_sectors(rq) < 32) {
			return;
		}
		if (bfq_bfqq_wait_request(bfqq)) {
			/*
			 * If we are waiting for a request for this queue, let
			 * it rip immediately and flag that we must not expire
			 * this queue just now.
			 */
			bfq_clear_bfqq_wait_request(bfqq);
			del_timer(&bfqd->idle_slice_timer);
			/*
			 * Here we can safely expire the queue, in
			 * case of budget timeout, without wasting
			 * guarantees.
			 */
			if (bfq_bfqq_budget_timeout(bfqq))
				bfq_bfqq_expire(bfqd, bfqq, 0,
						BFQ_BFQQ_BUDGET_TIMEOUT);
			__blk_run_queue(bfqd->queue);
		}
	}
}
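/*
 * The "small request" threshold above is 32 sectors, i.e. 16 KiB with
 * 512-byte sectors: if the just-queued request is the only one pending
 * for the active queue and is smaller than that, dispatching is left to
 * the block layer's plugging logic in the hope that the request first
 * gets merged into a larger one.
 */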
static void bfq_insert_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	assert_spin_locked(bfqd->queue->queue_lock);
	bfq_init_prio_data(bfqq, RQ_BIC(rq)->icq.ioc);

	bfq_add_rq_rb(rq);

	rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
				     bfqd->rq_in_driver);

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior.  Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
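/*
 * bfqd->hw_tag is the scheduler's guess about whether the drive queues
 * commands internally (e.g. NCQ).  The guess is re-evaluated only after
 * enough samples taken while a sufficient number of requests were
 * outstanding, and it is used, for instance, by bfq_update_idle_window()
 * to decide whether idling on seeky queues is worth the lost throughput.
 */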
static void bfq_completed_request(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
		     blk_rq_sectors(rq), sync);

	bfq_update_hw_tag(bfqd);

	WARN_ON(!bfqd->rq_in_driver);
	WARN_ON(!bfqq->dispatched);
	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	if (bfq_bfqq_sync(bfqq))
		bfqd->sync_flight--;

	if (sync)
		RQ_BIC(rq)->ttime.last_end_request = jiffies;

	/*
	 * If this is the active queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->active_queue == bfqq) {
		if (bfq_bfqq_budget_new(bfqq))
			bfq_set_budget_timeout(bfqd);

		/* Idling is disabled also for cooperation issues:
		 * 1) there is a close cooperator for the queue, or
		 * 2) the queue is shared and some cooperator is likely
		 *    to be idle (in this case, by not arming the idle timer,
		 *    we try to slow down the queue, to prevent the zones
		 *    of the disk accessed by the active cooperators from
		 *    drifting too far from the zone that will be accessed
		 *    by the currently idle cooperators).
		 */
		if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
		else if (sync &&
			 (bfqd->rq_in_driver == 0 ||
			  bfqq->raising_coeff > 1)
			 && RB_EMPTY_ROOT(&bfqq->sort_list)
			 && !bfq_close_cooperator(bfqd, bfqq)
			 && (!bfq_bfqq_coop(bfqq) ||
			     !bfq_bfqq_some_coop_idle(bfqq)))
			bfq_arm_slice_timer(bfqd);
	}

	if (!bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
static inline int __bfq_may_queue(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
		bfq_clear_bfqq_must_alloc(bfqq);
		return ELV_MQUEUE_MUST;
	}

	return ELV_MQUEUE_MAY;
}

static int bfq_may_queue(struct request_queue *q, int rw)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct task_struct *tsk = current;
	struct bfq_io_cq *bic;
	struct bfq_queue *bfqq;

	/*
	 * Don't force setup of a queue from here, as a call to may_queue
	 * does not necessarily imply that a request actually will be queued.
	 * So just lookup a possibly existing queue, or return 'may queue'
	 * if no queue exists.
	 */
	bic = bfq_bic_lookup(bfqd, tsk->io_context);
	if (bic == NULL)
		return ELV_MQUEUE_MAY;

	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
	if (bfqq != NULL) {
		bfq_init_prio_data(bfqq, bic->icq.ioc);

		return __bfq_may_queue(bfqq);
	}

	return ELV_MQUEUE_MAY;
}
/*
 * Queue lock held here.
 */
static void bfq_put_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	if (bfqq != NULL) {
		const int rw = rq_data_dir(rq);

		BUG_ON(!bfqq->allocated[rw]);
		bfqq->allocated[rw]--;

		rq->elv.priv[0] = NULL;
		rq->elv.priv[1] = NULL;

		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
			     bfqq, atomic_read(&bfqq->ref));
		bfq_put_queue(bfqq);
	}
}

static struct bfq_queue *
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
		struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
		     (long unsigned)bfqq->new_bfqq->pid);
	bic_set_bfqq(bic, bfqq->new_bfqq, 1);
	bfq_mark_bfqq_coop(bfqq->new_bfqq);
	bfq_put_queue(bfqq);
	return bic_to_bfqq(bic, 1);
}
/*
 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
 * was the last process referring to said bfqq.
 */
static struct bfq_queue *
bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
	if (bfqq_process_refs(bfqq) == 1) {
		bfqq->pid = current->pid;
		bfq_clear_bfqq_some_coop_idle(bfqq);
		bfq_clear_bfqq_coop(bfqq);
		bfq_clear_bfqq_split_coop(bfqq);
		return bfqq;
	}

	bic_set_bfqq(bic, NULL, 1);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq);
	return NULL;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_set_request(struct request_queue *q, struct request *rq,
			   gfp_t gfp_mask)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int rw = rq_data_dir(rq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;
	unsigned long flags;

	/* handle changed prio notifications; cgroup change is handled separately */
	if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
		bfq_changed_ioprio(bic->icq.ioc, bic);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	spin_lock_irqsave(q->queue_lock, flags);

	if (bic == NULL)
		goto queue_fail;

	bfqg = bfq_bic_update_cgroup(bic);

new_queue:
	bfqq = bic_to_bfqq(bic, is_sync);
	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
		bic_set_bfqq(bic, bfqq, is_sync);
	} else {
		/*
		 * If the queue was seeky for too long, break it apart.
		 */
		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
			bfqq = bfq_split_bfqq(bic, bfqq);
			if (!bfqq)
				goto new_queue;
		}

		/*
		 * Check to see if this queue is scheduled to merge with
		 * another closely cooperating queue.  The merging of queues
		 * happens here as it must be done in process context.
		 * The reference on new_bfqq was taken in merge_bfqqs.
		 */
		if (bfqq->new_bfqq != NULL)
			bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
	}

	bfqq->allocated[rw]++;
	atomic_inc(&bfqq->ref);
	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
		     atomic_read(&bfqq->ref));

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	spin_unlock_irqrestore(q->queue_lock, flags);

	return 0;

queue_fail:
	bfq_schedule_dispatch(bfqd);
	spin_unlock_irqrestore(q->queue_lock, flags);

	return 1;
}
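/*
 * On success the request leaves bfq_set_request() with one reference to
 * bfqq (the atomic_inc() above) and with rq->elv.priv[] pointing at its
 * bfq_io_cq and bfq_queue; bfq_put_request() later clears those pointers
 * and drops the reference once the request has completed.
 */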
static void bfq_kick_queue(struct work_struct *work)
{
	struct bfq_data *bfqd =
		container_of(work, struct bfq_data, unplug_work);
	struct request_queue *q = bfqd->queue;

	spin_lock_irq(q->queue_lock);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

/*
 * Handler of the expiration of the timer running if the active_queue
 * is idling inside its time slice.
 */
static void bfq_idle_slice_timer(unsigned long data)
{
	struct bfq_data *bfqd = (struct bfq_data *)data;
	struct bfq_queue *bfqq;
	unsigned long flags;
	enum bfqq_expiration reason;

	spin_lock_irqsave(bfqd->queue->queue_lock, flags);

	bfqq = bfqd->active_queue;
	/*
	 * Theoretical race here: active_queue can be NULL or different
	 * from the queue that was idling if the timer handler spins on
	 * the queue_lock and a new request arrives for the current
	 * queue and there is a full dispatch cycle that changes the
	 * active_queue.  This can hardly happen, but in the worst case
	 * we just expire a queue too early.
	 */
	if (bfqq != NULL) {
		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
		if (bfq_bfqq_budget_timeout(bfqq))
			/*
			 * Also here the queue can be safely expired
			 * for budget timeout without wasting
			 * guarantees.
			 */
			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
			/*
			 * The queue may not be empty upon timer expiration,
			 * because we may not disable the timer when the first
			 * request of the active queue arrives during
			 * disk idling.
			 */
			reason = BFQ_BFQQ_TOO_IDLE;
		else
			goto schedule_dispatch;

		bfq_bfqq_expire(bfqd, bfqq, 1, reason);
	}

schedule_dispatch:
	bfq_schedule_dispatch(bfqd);

	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
}
static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
{
	del_timer_sync(&bfqd->idle_slice_timer);
	cancel_work_sync(&bfqd->unplug_work);
}

static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
					struct bfq_queue **bfqq_ptr)
{
	struct bfq_group *root_group = bfqd->root_group;
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq != NULL) {
		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, atomic_read(&bfqq->ref));
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}

/*
 * Release all the bfqg references to its async queues.  If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct request_queue *q = bfqd->queue;
	struct bfq_queue *bfqq, *n;

	bfq_shutdown_timer_wq(bfqd);

	spin_lock_irq(q->queue_lock);

	BUG_ON(bfqd->active_queue != NULL);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, 0);

	bfq_disconnect_groups(bfqd);
	spin_unlock_irq(q->queue_lock);

	bfq_shutdown_timer_wq(bfqd);

	synchronize_rcu();

	BUG_ON(timer_pending(&bfqd->idle_slice_timer));

	bfq_free_root_group(bfqd);
	kfree(bfqd);
}
static void *bfq_init_queue(struct request_queue *q)
{
	struct bfq_group *bfqg;
	struct bfq_data *bfqd;

	bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
	if (bfqd == NULL)
		return NULL;

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
	atomic_inc(&bfqd->oom_bfqq.ref);

	bfqd->queue = q;

	bfqg = bfq_alloc_root_group(bfqd, q->node);
	if (bfqg == NULL) {
		kfree(bfqd);
		return NULL;
	}

	bfqd->root_group = bfqg;

	init_timer(&bfqd->idle_slice_timer);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
	bfqd->idle_slice_timer.data = (unsigned long)bfqd;

	bfqd->rq_pos_tree = RB_ROOT;

	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_quantum = bfq_quantum;
	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_class_idle_last_service = 0;
	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;

	bfqd->low_latency = true;

	bfqd->bfq_raising_coeff = 20;
	bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
	bfqd->bfq_raising_max_time = msecs_to_jiffies(7500);
	bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
	bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
	bfqd->bfq_raising_max_softrt_rate = 7000;

	return bfqd;
}
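/*
 * With the defaults above, low-latency weight raising multiplies a
 * queue's weight by 20, for up to 7.5 s for interactive queues and
 * 300 ms for soft real-time ones; the soft real-time detection rate
 * threshold is 7000, roughly 3.5 MB/s if interpreted as 512-byte
 * sectors per second.  All of these values can be changed at run time
 * through the sysfs attributes defined below.
 */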
static void bfq_slab_kill(void)
{
	if (bfq_pool != NULL)
		kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (bfq_pool == NULL)
		return -ENOMEM;
	return 0;
}

static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
{
	unsigned long new_val;
	int ret = strict_strtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
{
	struct bfq_queue *bfqq;
	struct bfq_data *bfqd = e->elevator_data;
	ssize_t num_char = 0;

	num_char += sprintf(page + num_char, "Active:\n");
	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
		num_char += sprintf(page + num_char,
				    "pid%d: weight %hu, dur %d/%u\n",
				    bfqq->pid,
				    bfqq->entity.weight,
				    jiffies_to_msecs(jiffies -
					    bfqq->last_rais_start_finish),
				    jiffies_to_msecs(bfqq->raising_cur_max_time));
	}
	num_char += sprintf(page + num_char, "Idle:\n");
	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
		num_char += sprintf(page + num_char,
				    "pid%d: weight %hu, dur %d/%u\n",
				    bfqq->pid,
				    bfqq->entity.weight,
				    jiffies_to_msecs(jiffies -
					    bfqq->last_rais_start_finish),
				    jiffies_to_msecs(bfqq->raising_cur_max_time));
	}

	return num_char;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned int __data = __VAR;					\
	if (__CONV)							\
		__data = jiffies_to_msecs(__data);			\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
SHOW_FUNCTION(bfq_raising_max_time_show, bfqd->bfq_raising_max_time, 1);
SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
	      1);
SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
	      bfqd->bfq_raising_min_inter_arr_async,
	      1);
SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
	      bfqd->bfq_raising_max_softrt_rate, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long __data;						\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
	       INT_MAX, 1);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
	       INT_MAX, 1);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
	       INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
	       1, INT_MAX, 0);
STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
	       INT_MAX, 1);
STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
	       INT_MAX, 0);
STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
	       INT_MAX, 1);
STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
	       INT_MAX, 1);
STORE_FUNCTION(bfq_raising_min_idle_time_store,
	       &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
	       &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
	       &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
#undef STORE_FUNCTION
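/*
 * The attributes generated by SHOW_FUNCTION/STORE_FUNCTION above are
 * exported by the elevator core under /sys/block/<dev>/queue/iosched/.
 * For example (device name and values purely illustrative):
 *
 *   cat /sys/block/sda/queue/iosched/slice_idle
 *   echo 0  > /sys/block/sda/queue/iosched/slice_idle    # disable idling
 *   echo 20 > /sys/block/sda/queue/iosched/raising_coeff
 *
 * Attributes with __CONV == 1 are shown and stored in milliseconds and
 * converted to jiffies internally.
 */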
/* do nothing for the moment */
static ssize_t bfq_weights_store(struct elevator_queue *e,
				 const char *page, size_t count)
{
	return count;
}

static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
{
	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);

	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
	else
		return bfq_default_max_budget;
}
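/*
 * The auto-tuned maximum budget is, roughly, the amount of service the
 * device can deliver during one sync timeout period at the estimated
 * peak rate (bfq_calc_max_budget(peak_rate, timeout)).  Until enough
 * peak-rate samples have been collected, the static default of
 * 16 * 1024 sectors is used instead.
 */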
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}

static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);

	return ret;
}

static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				     const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	bfqd->low_latency = __data;

	return ret;
}
#define BFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(quantum),
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(max_budget_async_rq),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(timeout_async),
	BFQ_ATTR(low_latency),
	BFQ_ATTR(raising_coeff),
	BFQ_ATTR(raising_max_time),
	BFQ_ATTR(raising_rt_max_time),
	BFQ_ATTR(raising_min_idle_time),
	BFQ_ATTR(raising_min_inter_arr_async),
	BFQ_ATTR(raising_max_softrt_rate),
	BFQ_ATTR(weights),
	__ATTR_NULL
};
static struct elevator_type iosched_bfq = {
	.ops = {
		.elevator_merge_fn =		bfq_merge,
		.elevator_merged_fn =		bfq_merged_request,
		.elevator_merge_req_fn =	bfq_merged_requests,
		.elevator_allow_merge_fn =	bfq_allow_merge,
		.elevator_dispatch_fn =		bfq_dispatch_requests,
		.elevator_add_req_fn =		bfq_insert_request,
		.elevator_activate_req_fn =	bfq_activate_request,
		.elevator_deactivate_req_fn =	bfq_deactivate_request,
		.elevator_completed_req_fn =	bfq_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_init_icq_fn =		bfq_init_icq,
		.elevator_exit_icq_fn =		bfq_exit_icq,
		.elevator_set_req_fn =		bfq_set_request,
		.elevator_put_req_fn =		bfq_put_request,
		.elevator_may_queue_fn =	bfq_may_queue,
		.elevator_init_fn =		bfq_init_queue,
		.elevator_exit_fn =		bfq_exit_queue,
	},
	.icq_size =		sizeof(struct bfq_io_cq),
	.icq_align =		__alignof__(struct bfq_io_cq),
	.elevator_attrs =	bfq_attrs,
	.elevator_name =	"bfq",
	.elevator_owner =	THIS_MODULE,
};
static int __init bfq_init(void)
{
	/*
	 * Can be 0 on HZ < 1000 setups.
	 */
	if (bfq_slice_idle == 0)
		bfq_slice_idle = 1;

	if (bfq_timeout_async == 0)
		bfq_timeout_async = 1;

	if (bfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_bfq);

	return 0;
}

static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq);
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");