[zen-stable.git] / block / bfq-iosched.c
1 /*
2 * BFQ, or Budget Fair Queueing, disk scheduler.
4 * Based on ideas and code from CFQ:
5 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
7 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8 * Paolo Valente <paolo.valente@unimore.it>
10 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
12 * BFQ is a proportional share disk scheduling algorithm based on the
13 * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
14 * measured in number of sectors, to tasks instead of time slices.
15 * The disk is not granted to the active task for a given time slice,
16 * but until it has exhausted its assigned budget. This change from
17 * the time to the service domain allows BFQ to distribute the disk
18 * bandwidth among tasks as desired, without any distortion due to
19 * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
20 * internal scheduler, called B-WF2Q+, to schedule tasks according to
21 * their budgets. Thanks to this accurate scheduler, BFQ can afford
22 * to assign high budgets to disk-bound non-seeky tasks (to boost the
23 * throughput), and yet guarantee low latencies to interactive and
24 * soft real-time applications.
26 * BFQ has been introduced in [1], where the interested reader can
27 * find an accurate description of the algorithm, the bandwidth
28 * distribution and latency guarantees it provides, plus formal proofs
29 * of all the properties. With respect to the algorithm presented in
30 * the paper, this implementation adds several little heuristics, and
31 * a hierarchical extension, based on H-WF2Q+.
33 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
34 * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
35 * complexity derives from the one introduced with EEVDF in [3].
37 * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
38 * with Deterministic Guarantees on Bandwidth Distribution,''
39 * IEEE Transactions on Computers, May 2010.
41 * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
43 * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
44 * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
45 * Oct 1997.
47 * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
49 * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
50 * First: A Flexible and Accurate Mechanism for Proportional Share
51 * Resource Allocation,'' technical report.
53 * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
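/*
 * Illustrative sketch, not part of the original BFQ code: a minimal
 * model of the service-domain idea described above. A queue keeps the
 * device until the sectors served to it exhaust its assigned budget,
 * instead of until a time slice expires. All names below are
 * hypothetical.
 */
struct toy_queue {
	unsigned long budget;	/* assigned budget, in sectors */
	unsigned long service;	/* sectors served in the current round */
};

/* Charge @sectors of service; return 1 if the budget is now exhausted. */
static int toy_charge_service(struct toy_queue *q, unsigned long sectors)
{
	q->service += sectors;
	return q->service >= q->budget;	/* 1 -> expire and reschedule */
}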
55 #include <linux/module.h>
56 #include <linux/slab.h>
57 #include <linux/blkdev.h>
58 #include <linux/cgroup.h>
59 #include <linux/elevator.h>
60 #include <linux/jiffies.h>
61 #include <linux/rbtree.h>
62 #include <linux/ioprio.h>
63 #include "bfq.h"
64 #include "blk.h"
66 /* Max number of dispatches in one round of service. */
67 static const int bfq_quantum = 4;
69 /* Expiration time of sync (0) and async (1) requests, in jiffies. */
70 static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
72 /* Maximum backwards seek, in KiB. */
73 static const int bfq_back_max = 16 * 1024;
75 /* Penalty of a backwards seek, in number of sectors. */
76 static const int bfq_back_penalty = 2;
78 /* Idling period duration, in jiffies. */
79 static int bfq_slice_idle = HZ / 125;
81 /* Default maximum budget values, in sectors and number of requests. */
82 static const int bfq_default_max_budget = 16 * 1024;
83 static const int bfq_max_budget_async_rq = 4;
86 * Async to sync throughput distribution is controlled as follows:
87 * when an async request is served, the entity is charged the number
88 * of sectors of the request, multiplied by the factor below.
90 static const int bfq_async_charge_factor = 10;
92 /* Default timeout values, in jiffies, approximating CFQ defaults. */
93 static const int bfq_timeout_sync = HZ / 8;
94 static int bfq_timeout_async = HZ / 25;
96 struct kmem_cache *bfq_pool;
98 /* Below this threshold (in ms), we consider thinktime immediate. */
99 #define BFQ_MIN_TT 2
101 /* hw_tag detection: parallel requests threshold and min samples needed. */
102 #define BFQ_HW_QUEUE_THRESHOLD 4
103 #define BFQ_HW_QUEUE_SAMPLES 32
105 #define BFQQ_SEEK_THR (sector_t)(8 * 1024)
106 #define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
108 /* Min samples used for peak rate estimation (for autotuning). */
109 #define BFQ_PEAK_RATE_SAMPLES 32
111 /* Shift used for peak rate fixed precision calculations. */
112 #define BFQ_RATE_SHIFT 16
114 #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
115 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
117 #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
118 #define RQ_BFQQ(rq) ((rq)->elv.priv[1])
120 #include "bfq-ioc.c"
121 #include "bfq-sched.c"
122 #include "bfq-cgroup.c"
124 #define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
125 IOPRIO_CLASS_IDLE)
126 #define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
127 IOPRIO_CLASS_RT)
129 #define bfq_sample_valid(samples) ((samples) > 80)
132 * We regard a request as SYNC if it is either a read or has the SYNC bit
133 * set (in which case it could also be a direct WRITE).
135 static inline int bfq_bio_sync(struct bio *bio)
137 if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
138 return 1;
140 return 0;
144 * Scheduler run of queue, if there are requests pending and no one in the
145 * driver that will restart queueing.
147 static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
149 if (bfqd->queued != 0) {
150 bfq_log(bfqd, "schedule dispatch");
151 kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
156 * Lifted from AS - choose which of rq1 and rq2 is best served now.
157 * We choose the request that is closest to the head right now. Distance
158 * behind the head is penalized and only allowed to a certain extent.
160 static struct request *bfq_choose_req(struct bfq_data *bfqd,
161 struct request *rq1,
162 struct request *rq2,
163 sector_t last)
165 sector_t s1, s2, d1 = 0, d2 = 0;
166 unsigned long back_max;
167 #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
168 #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
169 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
171 if (rq1 == NULL || rq1 == rq2)
172 return rq2;
173 if (rq2 == NULL)
174 return rq1;
176 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
177 return rq1;
178 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
179 return rq2;
180 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
181 return rq1;
182 else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
183 return rq2;
185 s1 = blk_rq_pos(rq1);
186 s2 = blk_rq_pos(rq2);
189 * By definition, 1KiB is 2 sectors.
191 back_max = bfqd->bfq_back_max * 2;
194 * Strict one way elevator _except_ in the case where we allow
195 * short backward seeks which are biased as twice the cost of a
196 * similar forward seek.
198 if (s1 >= last)
199 d1 = s1 - last;
200 else if (s1 + back_max >= last)
201 d1 = (last - s1) * bfqd->bfq_back_penalty;
202 else
203 wrap |= BFQ_RQ1_WRAP;
205 if (s2 >= last)
206 d2 = s2 - last;
207 else if (s2 + back_max >= last)
208 d2 = (last - s2) * bfqd->bfq_back_penalty;
209 else
210 wrap |= BFQ_RQ2_WRAP;
212 /* Found required data */
215 * By doing switch() on the bit mask "wrap" we avoid having to
216 * check two variables for all permutations: --> faster!
218 switch (wrap) {
219 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
220 if (d1 < d2)
221 return rq1;
222 else if (d2 < d1)
223 return rq2;
224 else {
225 if (s1 >= s2)
226 return rq1;
227 else
228 return rq2;
231 case BFQ_RQ2_WRAP:
232 return rq1;
233 case BFQ_RQ1_WRAP:
234 return rq2;
235 case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
236 default:
238 * Since both rqs are wrapped,
239 * start with the one that's further behind head
240 * (--> only *one* back seek required),
241 * since back seek takes more time than forward.
243 if (s1 <= s2)
244 return rq1;
245 else
246 return rq2;
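/*
 * Illustrative sketch, not part of the original BFQ code: the distance
 * metric used by bfq_choose_req() above, in isolation. Forward seeks
 * cost their plain distance, short backward seeks (within back_max
 * sectors) are charged distance * penalty, and anything farther behind
 * the head "wraps". All names are hypothetical.
 */
static int toy_seek_distance(unsigned long long pos, unsigned long long head,
			     unsigned long back_max, unsigned int back_penalty,
			     unsigned long long *dist)
{
	if (pos >= head) {			/* forward seek */
		*dist = pos - head;
		return 0;
	}
	if (pos + back_max >= head) {		/* short backward seek */
		*dist = (head - pos) * back_penalty;
		return 0;
	}
	return 1;				/* wraps behind the head */
}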
250 static struct bfq_queue *
251 bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
252 sector_t sector, struct rb_node **ret_parent,
253 struct rb_node ***rb_link)
255 struct rb_node **p, *parent;
256 struct bfq_queue *bfqq = NULL;
258 parent = NULL;
259 p = &root->rb_node;
260 while (*p) {
261 struct rb_node **n;
263 parent = *p;
264 bfqq = rb_entry(parent, struct bfq_queue, pos_node);
267 * Sort strictly based on sector. Smallest to the left,
268 * largest to the right.
270 if (sector > blk_rq_pos(bfqq->next_rq))
271 n = &(*p)->rb_right;
272 else if (sector < blk_rq_pos(bfqq->next_rq))
273 n = &(*p)->rb_left;
274 else
275 break;
276 p = n;
277 bfqq = NULL;
280 *ret_parent = parent;
281 if (rb_link)
282 *rb_link = p;
284 bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
285 (long long unsigned)sector,
286 bfqq != NULL ? bfqq->pid : 0);
288 return bfqq;
291 static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
293 struct rb_node **p, *parent;
294 struct bfq_queue *__bfqq;
296 if (bfqq->pos_root != NULL) {
297 rb_erase(&bfqq->pos_node, bfqq->pos_root);
298 bfqq->pos_root = NULL;
301 if (bfq_class_idle(bfqq))
302 return;
303 if (!bfqq->next_rq)
304 return;
306 bfqq->pos_root = &bfqd->rq_pos_tree;
307 __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
308 blk_rq_pos(bfqq->next_rq), &parent, &p);
309 if (__bfqq == NULL) {
310 rb_link_node(&bfqq->pos_node, parent, p);
311 rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
312 } else
313 bfqq->pos_root = NULL;
316 static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
317 struct bfq_queue *bfqq,
318 struct request *last)
320 struct rb_node *rbnext = rb_next(&last->rb_node);
321 struct rb_node *rbprev = rb_prev(&last->rb_node);
322 struct request *next = NULL, *prev = NULL;
324 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
326 if (rbprev != NULL)
327 prev = rb_entry_rq(rbprev);
329 if (rbnext != NULL)
330 next = rb_entry_rq(rbnext);
331 else {
332 rbnext = rb_first(&bfqq->sort_list);
333 if (rbnext && rbnext != &last->rb_node)
334 next = rb_entry_rq(rbnext);
337 return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
340 static void bfq_del_rq_rb(struct request *rq)
342 struct bfq_queue *bfqq = RQ_BFQQ(rq);
343 struct bfq_data *bfqd = bfqq->bfqd;
344 const int sync = rq_is_sync(rq);
346 BUG_ON(bfqq->queued[sync] == 0);
347 bfqq->queued[sync]--;
348 bfqd->queued--;
350 elv_rb_del(&bfqq->sort_list, rq);
352 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
353 if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
354 bfq_del_bfqq_busy(bfqd, bfqq, 1);
356 * Remove queue from request-position tree as it is empty.
358 if (bfqq->pos_root != NULL) {
359 rb_erase(&bfqq->pos_node, bfqq->pos_root);
360 bfqq->pos_root = NULL;
365 /* see the definition of bfq_async_charge_factor for details */
366 static inline unsigned long bfq_serv_to_charge(struct request *rq,
367 struct bfq_queue *bfqq)
369 return blk_rq_sectors(rq) *
370 (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
371 bfq_async_charge_factor));
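/*
 * Illustrative sketch, not part of the original BFQ code: the charging
 * rule above with a worked example. A 64-sector sync request (or any
 * request of a weight-raised queue) is charged 64 sectors, while a
 * 64-sector async request of a non-raised queue is charged
 * 64 * (1 + 10) = 704 sectors with the default bfq_async_charge_factor,
 * which is how async throughput is kept down in favor of sync I/O.
 * Names are hypothetical.
 */
static unsigned long toy_serv_to_charge(unsigned long sectors, int sync,
					int raising_coeff, int charge_factor)
{
	if (sync || raising_coeff > 1)
		return sectors;
	return sectors * (1 + charge_factor);
}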
375 * bfq_updated_next_req - update the queue after a new next_rq selection.
376 * @bfqd: the device data the queue belongs to.
377 * @bfqq: the queue to update.
379 * If the first request of a queue changes we make sure that the queue
380 * has enough budget to serve at least its first request (if the
381 * request has grown). We do this because if the queue does not have enough
382 * budget for its first request, it has to go through two dispatch
383 * rounds to actually get it dispatched.
385 static void bfq_updated_next_req(struct bfq_data *bfqd,
386 struct bfq_queue *bfqq)
388 struct bfq_entity *entity = &bfqq->entity;
389 struct bfq_service_tree *st = bfq_entity_service_tree(entity);
390 struct request *next_rq = bfqq->next_rq;
391 unsigned long new_budget;
393 if (next_rq == NULL)
394 return;
396 if (bfqq == bfqd->active_queue)
398 * In order not to break guarantees, budgets cannot be
399 * changed after an entity has been selected.
401 return;
403 BUG_ON(entity->tree != &st->active);
404 BUG_ON(entity == entity->sched_data->active_entity);
406 new_budget = max_t(unsigned long, bfqq->max_budget,
407 bfq_serv_to_charge(next_rq, bfqq));
408 entity->budget = new_budget;
409 bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
410 bfq_activate_bfqq(bfqd, bfqq);
413 static void bfq_add_rq_rb(struct request *rq)
415 struct bfq_queue *bfqq = RQ_BFQQ(rq);
416 struct bfq_entity *entity = &bfqq->entity;
417 struct bfq_data *bfqd = bfqq->bfqd;
418 struct request *next_rq, *prev;
419 unsigned long old_raising_coeff = bfqq->raising_coeff;
420 int idle_for_long_time = bfqq->budget_timeout +
421 bfqd->bfq_raising_min_idle_time < jiffies;
423 bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
424 bfqq->queued[rq_is_sync(rq)]++;
425 bfqd->queued++;
427 elv_rb_add(&bfqq->sort_list, rq);
430 * Check if this request is a better next-serve candidate.
432 prev = bfqq->next_rq;
433 next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
434 BUG_ON(next_rq == NULL);
435 bfqq->next_rq = next_rq;
438 * Adjust priority tree position, if next_rq changes.
440 if (prev != bfqq->next_rq)
441 bfq_rq_pos_tree_add(bfqd, bfqq);
443 if (!bfq_bfqq_busy(bfqq)) {
444 int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
445 bfqq->soft_rt_next_start < jiffies;
446 entity->budget = max_t(unsigned long, bfqq->max_budget,
447 bfq_serv_to_charge(next_rq, bfqq));
449 if (!bfqd->low_latency)
450 goto add_bfqq_busy;
453 * If the queue is not being boosted and has been idle
454 * for enough time, start a weight-raising period
456 if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
457 bfqq->raising_coeff = bfqd->bfq_raising_coeff;
458 bfqq->raising_cur_max_time = idle_for_long_time ?
459 bfqd->bfq_raising_max_time :
460 bfqd->bfq_raising_rt_max_time;
461 bfq_log_bfqq(bfqd, bfqq,
462 "wrais starting at %llu msec,"
463 "rais_max_time %u",
464 bfqq->last_rais_start_finish,
465 jiffies_to_msecs(bfqq->
466 raising_cur_max_time));
467 } else if (old_raising_coeff > 1) {
468 if (idle_for_long_time)
469 bfqq->raising_cur_max_time =
470 bfqd->bfq_raising_max_time;
471 else if (bfqq->raising_cur_max_time ==
472 bfqd->bfq_raising_rt_max_time &&
473 !soft_rt) {
474 bfqq->raising_coeff = 1;
475 bfq_log_bfqq(bfqd, bfqq,
476 "wrais ending at %llu msec,"
477 "rais_max_time %u",
478 bfqq->last_rais_start_finish,
479 jiffies_to_msecs(bfqq->
480 raising_cur_max_time));
483 if (old_raising_coeff != bfqq->raising_coeff)
484 entity->ioprio_changed = 1;
485 add_bfqq_busy:
486 bfq_add_bfqq_busy(bfqd, bfqq);
487 } else {
488 if (bfqd->low_latency && old_raising_coeff == 1 &&
489 !rq_is_sync(rq) &&
490 bfqq->last_rais_start_finish +
491 bfqd->bfq_raising_min_inter_arr_async < jiffies) {
492 bfqq->raising_coeff = bfqd->bfq_raising_coeff;
493 bfqq->raising_cur_max_time = bfqd->bfq_raising_max_time;
495 entity->ioprio_changed = 1;
496 bfq_log_bfqq(bfqd, bfqq,
497 "non-idle wrais starting at %llu msec,"
498 "rais_max_time %u",
499 bfqq->last_rais_start_finish,
500 jiffies_to_msecs(bfqq->
501 raising_cur_max_time));
503 bfq_updated_next_req(bfqd, bfqq);
506 if (bfqd->low_latency &&
507 (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
508 idle_for_long_time))
509 bfqq->last_rais_start_finish = jiffies;
512 static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
514 elv_rb_del(&bfqq->sort_list, rq);
515 bfqq->queued[rq_is_sync(rq)]--;
516 bfqq->bfqd->queued--;
517 bfq_add_rq_rb(rq);
520 static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
521 struct bio *bio)
523 struct task_struct *tsk = current;
524 struct bfq_io_cq *bic;
525 struct bfq_queue *bfqq;
527 bic = bfq_bic_lookup(bfqd, tsk->io_context);
528 if (bic == NULL)
529 return NULL;
531 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
532 if (bfqq != NULL) {
533 sector_t sector = bio->bi_sector + bio_sectors(bio);
535 return elv_rb_find(&bfqq->sort_list, sector);
538 return NULL;
541 static void bfq_activate_request(struct request_queue *q, struct request *rq)
543 struct bfq_data *bfqd = q->elevator->elevator_data;
545 bfqd->rq_in_driver++;
546 bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
547 bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
548 (long long unsigned)bfqd->last_position);
551 static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
553 struct bfq_data *bfqd = q->elevator->elevator_data;
555 WARN_ON(bfqd->rq_in_driver == 0);
556 bfqd->rq_in_driver--;
559 static void bfq_remove_request(struct request *rq)
561 struct bfq_queue *bfqq = RQ_BFQQ(rq);
562 struct bfq_data *bfqd = bfqq->bfqd;
564 if (bfqq->next_rq == rq) {
565 bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
566 bfq_updated_next_req(bfqd, bfqq);
569 list_del_init(&rq->queuelist);
570 bfq_del_rq_rb(rq);
572 if (rq->cmd_flags & REQ_META) {
573 WARN_ON(bfqq->meta_pending == 0);
574 bfqq->meta_pending--;
578 static int bfq_merge(struct request_queue *q, struct request **req,
579 struct bio *bio)
581 struct bfq_data *bfqd = q->elevator->elevator_data;
582 struct request *__rq;
584 __rq = bfq_find_rq_fmerge(bfqd, bio);
585 if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
586 *req = __rq;
587 return ELEVATOR_FRONT_MERGE;
590 return ELEVATOR_NO_MERGE;
593 static void bfq_merged_request(struct request_queue *q, struct request *req,
594 int type)
596 if (type == ELEVATOR_FRONT_MERGE) {
597 struct bfq_queue *bfqq = RQ_BFQQ(req);
599 bfq_reposition_rq_rb(bfqq, req);
603 static void bfq_merged_requests(struct request_queue *q, struct request *rq,
604 struct request *next)
606 struct bfq_queue *bfqq = RQ_BFQQ(rq);
609 * Reposition in fifo if next is older than rq.
611 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
612 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
613 list_move(&rq->queuelist, &next->queuelist);
614 rq_set_fifo_time(rq, rq_fifo_time(next));
617 if (bfqq->next_rq == next)
618 bfqq->next_rq = rq;
620 bfq_remove_request(next);
623 static int bfq_allow_merge(struct request_queue *q, struct request *rq,
624 struct bio *bio)
626 struct bfq_data *bfqd = q->elevator->elevator_data;
627 struct bfq_io_cq *bic;
628 struct bfq_queue *bfqq;
631 * Disallow merge of a sync bio into an async request.
633 if (bfq_bio_sync(bio) && !rq_is_sync(rq))
634 return 0;
637 * Lookup the bfqq that this bio will be queued with. Allow
638 * merge only if rq is queued there.
639 * Queue lock is held here.
641 bic = bfq_bic_lookup(bfqd, current->io_context);
642 if (bic == NULL)
643 return 0;
645 bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
646 return bfqq == RQ_BFQQ(rq);
649 static void __bfq_set_active_queue(struct bfq_data *bfqd,
650 struct bfq_queue *bfqq)
652 if (bfqq != NULL) {
653 bfq_mark_bfqq_must_alloc(bfqq);
654 bfq_mark_bfqq_budget_new(bfqq);
655 bfq_clear_bfqq_fifo_expire(bfqq);
657 bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
659 bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
660 bfqq->entity.budget);
663 bfqd->active_queue = bfqq;
667 * Get and set a new active queue for service.
669 static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
670 struct bfq_queue *bfqq)
672 if (!bfqq)
673 bfqq = bfq_get_next_queue(bfqd);
674 else
675 bfq_get_next_queue_forced(bfqd, bfqq);
677 __bfq_set_active_queue(bfqd, bfqq);
678 return bfqq;
681 static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
682 struct request *rq)
684 if (blk_rq_pos(rq) >= bfqd->last_position)
685 return blk_rq_pos(rq) - bfqd->last_position;
686 else
687 return bfqd->last_position - blk_rq_pos(rq);
691 * Return true if bfqq has no request pending and rq is close enough to
692 * bfqd->last_position, or if rq is closer to bfqd->last_position than
693 * bfqq->next_rq
695 static inline int bfq_rq_close(struct bfq_data *bfqd, struct bfq_queue *bfqq,
696 struct request *rq)
698 sector_t sdist = bfqq->seek_mean;
700 if (!bfq_sample_valid(bfqq->seek_samples))
701 sdist = BFQQ_SEEK_THR;
703 /* If seek_mean is large, using it as a closeness criterion is meaningless */
704 if (sdist > BFQQ_SEEK_THR)
705 sdist = BFQQ_SEEK_THR;
707 return bfq_dist_from_last(bfqd, rq) <= sdist;
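/*
 * Illustrative sketch, not part of the original BFQ code: the closeness
 * test above boils down to comparing the distance from the last
 * completed position against the queue's mean seek distance, capped at
 * BFQQ_SEEK_THR (the same cap is used when too few seek samples are
 * available). Names are hypothetical.
 */
static int toy_rq_close(unsigned long long rq_pos, unsigned long long last_pos,
			unsigned long long seek_mean, unsigned long long seek_thr)
{
	unsigned long long dist = rq_pos >= last_pos ? rq_pos - last_pos :
						       last_pos - rq_pos;
	unsigned long long sdist = seek_mean > seek_thr ? seek_thr : seek_mean;

	return dist <= sdist;
}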
710 static struct bfq_queue *bfqq_close(struct bfq_data *bfqd,
711 struct bfq_queue *cur_bfqq)
713 struct rb_root *root = &bfqd->rq_pos_tree;
714 struct rb_node *parent, *node;
715 struct bfq_queue *__bfqq;
716 sector_t sector = bfqd->last_position;
718 if (RB_EMPTY_ROOT(root))
719 return NULL;
722 * First, if we find a request starting at the end of the last
723 * request, choose it.
725 __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
726 if (__bfqq != NULL)
727 return __bfqq;
730 * If the exact sector wasn't found, the parent of the NULL leaf
731 * will contain the closest sector (rq_pos_tree sorted by next_request
732 * position).
734 __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
735 if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
736 return __bfqq;
738 if (blk_rq_pos(__bfqq->next_rq) < sector)
739 node = rb_next(&__bfqq->pos_node);
740 else
741 node = rb_prev(&__bfqq->pos_node);
742 if (node == NULL)
743 return NULL;
745 __bfqq = rb_entry(node, struct bfq_queue, pos_node);
746 if (bfq_rq_close(bfqd, cur_bfqq, __bfqq->next_rq))
747 return __bfqq;
749 return NULL;
753 * bfqd - obvious
754 * cur_bfqq - passed in so that we don't decide that the current queue
755 * is closely cooperating with itself.
757 * We are assuming that cur_bfqq has dispatched at least one request,
758 * and that bfqd->last_position reflects a position on the disk associated
759 * with the I/O issued by cur_bfqq.
761 static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
762 struct bfq_queue *cur_bfqq)
764 struct bfq_queue *bfqq;
766 if (bfq_class_idle(cur_bfqq))
767 return NULL;
768 if (!bfq_bfqq_sync(cur_bfqq))
769 return NULL;
770 if (BFQQ_SEEKY(cur_bfqq))
771 return NULL;
773 /* If device has only one backlogged bfq_queue, don't search. */
774 if (bfqd->busy_queues == 1)
775 return NULL;
778 * We should notice if some of the queues are cooperating, e.g.
779 * working closely on the same area of the disk. In that case,
780 * we can group them together so that we don't waste time idling.
782 bfqq = bfqq_close(bfqd, cur_bfqq);
783 if (bfqq == NULL || bfqq == cur_bfqq)
784 return NULL;
787 * Do not merge queues from different bfq_groups.
789 if (bfqq->entity.parent != cur_bfqq->entity.parent)
790 return NULL;
793 * It only makes sense to merge sync queues.
795 if (!bfq_bfqq_sync(bfqq))
796 return NULL;
797 if (BFQQ_SEEKY(bfqq))
798 return NULL;
801 * Do not merge queues of different priority classes.
803 if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
804 return NULL;
806 return bfqq;
810 * If enough samples have been computed, return the current max budget
811 * stored in bfqd, which is dynamically updated according to the
812 * estimated disk peak rate; otherwise return the default max budget
814 static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
816 return bfqd->budgets_assigned < 194 ? bfq_default_max_budget :
817 bfqd->bfq_max_budget;
821 * Return min budget, which is a fraction of the current or default
822 * max budget (trying with 1/32)
824 static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
826 return bfqd->budgets_assigned < 194 ? bfq_default_max_budget / 32 :
827 bfqd->bfq_max_budget / 32;
830 static void bfq_arm_slice_timer(struct bfq_data *bfqd)
832 struct bfq_queue *bfqq = bfqd->active_queue;
833 struct bfq_io_cq *bic;
834 unsigned long sl;
836 WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
838 /* Idling is disabled, either manually or by past process history. */
839 if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq))
840 return;
842 /* Tasks have exited, don't wait. */
843 bic = bfqd->active_bic;
844 if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
845 return;
847 bfq_mark_bfqq_wait_request(bfqq);
850 * We don't want to idle for seeks, but we do want to allow
851 * fair distribution of slice time for a process doing back-to-back
852 * seeks. So allow a little bit of time for it to submit a new request.
854 * To prevent processes with (partly) seeky workloads from
855 * being too ill-treated, grant them a small fraction of the
856 * assigned budget before reducing the waiting time to
857 * BFQ_MIN_TT. In practice this helps reduce latency.
859 sl = bfqd->bfq_slice_idle;
860 if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
861 bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
862 bfqq->raising_coeff == 1)
863 sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
864 else if (bfqq->raising_coeff > 1)
865 sl = sl * 3;
866 bfqd->last_idling_start = ktime_get();
867 mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
868 bfq_log(bfqd, "arm idle: %u/%u ms",
869 jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
873 * Set the maximum time for the active queue to consume its
874 * budget. This prevents seeky processes from lowering the disk
875 * throughput (always guaranteed with a time slice scheme as in CFQ).
877 static void bfq_set_budget_timeout(struct bfq_data *bfqd)
879 struct bfq_queue *bfqq = bfqd->active_queue;
880 unsigned int timeout_coeff =
881 bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time ?
882 1 : (bfqq->entity.weight / bfqq->entity.orig_weight);
884 bfqd->last_budget_start = ktime_get();
886 bfq_clear_bfqq_budget_new(bfqq);
887 bfqq->budget_timeout = jiffies +
888 bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
890 bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
891 jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
892 timeout_coeff));
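/*
 * Illustrative sketch, not part of the original BFQ code: how the
 * budget timeout above scales. A weight-raised queue (weight ==
 * orig_weight * raising_coeff) gets a proportionally longer budget
 * timeout, except during the short soft real-time raising period,
 * where the plain timeout is used. Names are hypothetical.
 */
static unsigned long toy_budget_timeout(unsigned long now,
					unsigned long base_timeout,
					int weight, int orig_weight,
					int in_soft_rt_period)
{
	unsigned int coeff = in_soft_rt_period ? 1 : weight / orig_weight;

	return now + base_timeout * coeff;
}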
896 * Move request from internal lists to the request queue dispatch list.
898 static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
900 struct bfq_data *bfqd = q->elevator->elevator_data;
901 struct bfq_queue *bfqq = RQ_BFQQ(rq);
903 bfq_remove_request(rq);
904 bfqq->dispatched++;
905 elv_dispatch_sort(q, rq);
907 if (bfq_bfqq_sync(bfqq))
908 bfqd->sync_flight++;
912 * Return expired entry, or NULL to just start from scratch in rbtree.
914 static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
916 struct request *rq = NULL;
918 if (bfq_bfqq_fifo_expire(bfqq))
919 return NULL;
921 bfq_mark_bfqq_fifo_expire(bfqq);
923 if (list_empty(&bfqq->fifo))
924 return NULL;
926 rq = rq_entry_fifo(bfqq->fifo.next);
928 if (time_before(jiffies, rq_fifo_time(rq)))
929 return NULL;
931 return rq;
935 * Must be called with the queue_lock held.
937 static int bfqq_process_refs(struct bfq_queue *bfqq)
939 int process_refs, io_refs;
941 io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
942 process_refs = atomic_read(&bfqq->ref) - io_refs;
943 BUG_ON(process_refs < 0);
944 return process_refs;
947 static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
949 int process_refs, new_process_refs;
950 struct bfq_queue *__bfqq;
953 * If there are no process references on the new_bfqq, then it is
954 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
955 * may have dropped their last reference (not just their last process
956 * reference).
958 if (!bfqq_process_refs(new_bfqq))
959 return;
961 /* Avoid a circular list and skip interim queue merges. */
962 while ((__bfqq = new_bfqq->new_bfqq)) {
963 if (__bfqq == bfqq)
964 return;
965 new_bfqq = __bfqq;
968 process_refs = bfqq_process_refs(bfqq);
969 new_process_refs = bfqq_process_refs(new_bfqq);
971 * If the process for the bfqq has gone away, there is no
972 * sense in merging the queues.
974 if (process_refs == 0 || new_process_refs == 0)
975 return;
978 * Merge in the direction of the lesser amount of work.
980 if (new_process_refs >= process_refs) {
981 bfqq->new_bfqq = new_bfqq;
982 atomic_add(process_refs, &new_bfqq->ref);
983 } else {
984 new_bfqq->new_bfqq = bfqq;
985 atomic_add(new_process_refs, &bfqq->ref);
987 bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
988 new_bfqq->pid);
991 static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
993 struct bfq_entity *entity = &bfqq->entity;
994 return entity->budget - entity->service;
997 static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
999 BUG_ON(bfqq != bfqd->active_queue);
1001 __bfq_bfqd_reset_active(bfqd);
1003 if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1004 bfq_del_bfqq_busy(bfqd, bfqq, 1);
1006 * overloading the budget_timeout field to store the time
1007 * at which the queue was left with no backlog; used by
1008 * the weight-raising mechanism
1010 bfqq->budget_timeout = jiffies;
1011 } else {
1012 bfq_activate_bfqq(bfqd, bfqq);
1014 * Resort priority tree of potential close cooperators.
1016 bfq_rq_pos_tree_add(bfqd, bfqq);
1020 * If this bfqq is shared between multiple processes, check
1021 * to make sure that those processes are still issuing I/Os
1022 * within the mean seek distance. If not, it may be time to
1023 * break the queues apart again.
1025 if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
1026 bfq_mark_bfqq_split_coop(bfqq);
1030 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
1031 * @bfqd: device data.
1032 * @bfqq: queue to update.
1033 * @reason: reason for expiration.
1035 * Handle the feedback on @bfqq budget. See the body for detailed
1036 * comments.
1038 static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
1039 struct bfq_queue *bfqq,
1040 enum bfqq_expiration reason)
1042 struct request *next_rq;
1043 unsigned long budget, min_budget;
1045 budget = bfqq->max_budget;
1046 min_budget = bfq_min_budget(bfqd);
1048 BUG_ON(bfqq != bfqd->active_queue);
1050 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
1051 bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
1052 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
1053 budget, bfq_min_budget(bfqd));
1054 bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
1055 bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
1057 if (bfq_bfqq_sync(bfqq)) {
1058 switch (reason) {
1060 * Caveat: in all the following cases we trade latency
1061 * for throughput.
1063 case BFQ_BFQQ_TOO_IDLE:
1065 * This is the only case where we may reduce
1066 * the budget: if there are no requests of the
1067 * process still waiting for completion, then
1068 * we assume (tentatively) that the timer has
1069 * expired because the batch of requests of
1070 * the process could have been served with a
1071 * smaller budget. Hence, betting that
1072 * process will behave in the same way when it
1073 * becomes backlogged again, we reduce its
1074 * next budget. As long as we guess right,
1075 * this budget cut reduces the latency
1076 * experienced by the process.
1078 * However, if there are still outstanding
1079 * requests, then the process may have not yet
1080 * issued its next request just because it is
1081 * still waiting for the completion of some of
1082 * the still outstanding ones. So in this
1083 * subcase we do not reduce its budget, on the
1084 * contrary we increase it to possibly boost
1085 * the throughput, as discussed in the
1086 * comments to the BUDGET_TIMEOUT case.
1088 if (bfqq->dispatched > 0) /* still outstanding reqs */
1089 budget = min(budget * 2, bfqd->bfq_max_budget);
1090 else {
1091 if (budget > 5 * min_budget)
1092 budget -= 4 * min_budget;
1093 else
1094 budget = min_budget;
1096 break;
1097 case BFQ_BFQQ_BUDGET_TIMEOUT:
1099 * We double the budget here because: 1) it
1100 * gives the chance to boost the throughput if
1101 * this is not a seeky process (which may have
1102 * bumped into this timeout because of, e.g.,
1103 * ZBR), 2) together with charge_full_budget
1104 * it helps give seeky processes higher
1105 * timestamps, and hence be served less
1106 * frequently.
1108 budget = min(budget * 2, bfqd->bfq_max_budget);
1109 break;
1110 case BFQ_BFQQ_BUDGET_EXHAUSTED:
1112 * The process still has backlog, and did not
1113 * let either the budget timeout or the disk
1114 * idling timeout expire. Hence it is not
1115 * seeky, has a short thinktime and may be
1116 * happy with a higher budget too. So
1117 * definitely increase the budget of this good
1118 * candidate to boost the disk throughput.
1120 budget = min(budget * 4, bfqd->bfq_max_budget);
1121 break;
1122 case BFQ_BFQQ_NO_MORE_REQUESTS:
1124 * Leave the budget unchanged.
1126 default:
1127 return;
1129 } else /* async queue */
1130 /* async queues always get the maximum possible budget
1131 * (their ability to dispatch is limited by
1132 * @bfqd->bfq_max_budget_async_rq).
1134 budget = bfqd->bfq_max_budget;
1136 bfqq->max_budget = budget;
1138 if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
1139 bfqq->max_budget > bfqd->bfq_max_budget)
1140 bfqq->max_budget = bfqd->bfq_max_budget;
1143 * Make sure that we have enough budget for the next request.
1144 * Since the finish time of the bfqq must be kept in sync with
1145 * the budget, be sure to call __bfq_bfqq_expire() after the
1146 * update.
1148 next_rq = bfqq->next_rq;
1149 if (next_rq != NULL)
1150 bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
1151 bfq_serv_to_charge(next_rq, bfqq));
1152 else
1153 bfqq->entity.budget = bfqq->max_budget;
1155 bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
1156 next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
1157 bfqq->entity.budget);
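/*
 * Illustrative sketch, not part of the original BFQ code: the budget
 * feedback above in compact form. TOO_IDLE with no outstanding
 * requests shrinks the budget, the other sync cases grow it, and the
 * result is clamped to the device max budget (NO_MORE_REQUESTS, which
 * leaves the budget unchanged, is omitted). Names and enum values are
 * hypothetical.
 */
enum toy_reason { TOY_TOO_IDLE, TOY_BUDGET_TIMEOUT, TOY_BUDGET_EXHAUSTED };

static unsigned long toy_recalc_budget(unsigned long budget,
				       unsigned long min_budget,
				       unsigned long max_budget,
				       int outstanding, enum toy_reason reason)
{
	switch (reason) {
	case TOY_TOO_IDLE:
		if (outstanding)		/* still waiting for completions */
			budget *= 2;
		else if (budget > 5 * min_budget)
			budget -= 4 * min_budget;
		else
			budget = min_budget;
		break;
	case TOY_BUDGET_TIMEOUT:
		budget *= 2;			/* possibly just unlucky (e.g., ZBR) */
		break;
	case TOY_BUDGET_EXHAUSTED:
		budget *= 4;			/* well-behaved queue, boost it */
		break;
	}
	return budget > max_budget ? max_budget : budget;
}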
1160 static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
1162 unsigned long max_budget;
1165 * The max_budget calculated when autotuning is equal to the
1166 * number of sectors transferred in timeout_sync at the
1167 * estimated peak rate.
1169 max_budget = (unsigned long)(peak_rate * 1000 *
1170 timeout >> BFQ_RATE_SHIFT);
1172 return max_budget;
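/*
 * Illustrative note, not part of the original BFQ code: peak_rate is
 * kept as sectors per usec in BFQ_RATE_SHIFT fixed point and timeout
 * is in milliseconds, so the product above comes out in sectors. For
 * example, a device sustaining roughly 100 MiB/s (~0.2 sectors/usec)
 * with the default 125 ms sync timeout yields a max_budget of about
 * 0.2 * 1000 * 125 = 25000 sectors, i.e. around 12 MiB.
 */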
1176 * In addition to updating the peak rate, checks whether the process
1177 * is "slow", and returns 1 if so. This slow flag is used, in addition
1178 * to the budget timeout, to reduce the amount of service provided to
1179 * seeky processes, and hence reduce their chances to lower the
1180 * throughput. See the code for more details.
1182 static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1183 int compensate, enum bfqq_expiration reason)
1185 u64 bw, usecs, expected, timeout;
1186 ktime_t delta;
1187 int update = 0;
1189 if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
1190 return 0;
1192 delta = compensate ? bfqd->last_idling_start : ktime_get();
1193 delta = ktime_sub(delta, bfqd->last_budget_start);
1194 usecs = ktime_to_us(delta);
1196 /* Don't trust short/unrealistic values. */
1197 if (usecs < 100 || usecs >= LONG_MAX)
1198 return 0;
1201 * Calculate the bandwidth for the last slice. We use a 64 bit
1202 * value to store the peak rate, in sectors per usec in fixed
1203 * point math. We do so to have enough precision in the estimate
1204 * and to avoid overflows.
1206 bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
1207 do_div(bw, (unsigned long)usecs);
1209 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
1212 * Use only long (> 20ms) intervals to filter out spikes for
1213 * the peak rate estimation.
1215 if (usecs > 20000) {
1216 if (bw > bfqd->peak_rate ||
1217 (!BFQQ_SEEKY(bfqq) &&
1218 reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
1219 bfq_log(bfqd, "measured bw =%llu", bw);
1221 * To smooth oscillations use a low-pass filter with
1222 * alpha=7/8, i.e.,
1223 * new_rate = (7/8) * old_rate + (1/8) * bw
1225 do_div(bw, 8);
1226 bfqd->peak_rate *= 7;
1227 do_div(bfqd->peak_rate, 8);
1228 bfqd->peak_rate += bw;
1229 update = 1;
1230 bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
1233 update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
1235 if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
1236 bfqd->peak_rate_samples++;
1238 if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
1239 update && bfqd->bfq_user_max_budget == 0) {
1240 bfqd->bfq_max_budget =
1241 bfq_calc_max_budget(bfqd->peak_rate, timeout);
1242 bfq_log(bfqd, "new max_budget=%lu",
1243 bfqd->bfq_max_budget);
1248 * If the process has been served for too short a time
1249 * interval to let its possible sequential accesses prevail over
1250 * the initial seek time needed to move the disk head to the
1251 * first sector it requested, then give the process a chance
1252 * and for the moment return false.
1254 if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
1255 return 0;
1258 * A process is considered ``slow'' (i.e., seeky, so that we
1259 * cannot treat it fairly in the service domain, as it would
1260 * slow down the other processes too much) if, when a slice
1261 * ends for whatever reason, it has received service at a
1262 * rate that would not be high enough to complete the budget
1263 * before the budget timeout expiration.
1265 expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
1268 * Caveat: processes doing IO in the slower disk zones will
1269 * tend to be slow(er) even if not seeky. And the estimated
1270 * peak rate will actually be an average over the disk
1271 * surface. Hence, to not be too harsh with unlucky processes,
1272 * we keep a budget/3 margin of safety before declaring a
1273 * process slow.
1275 return expected > (4 * bfqq->entity.budget) / 3;
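/*
 * Illustrative sketch, not part of the original BFQ code: the
 * low-pass filter used above for the peak rate, new_rate =
 * (7/8) * old_rate + (1/8) * sample, written out on plain integers.
 * Names are hypothetical.
 */
static unsigned long long toy_filter_peak_rate(unsigned long long old_rate,
					       unsigned long long sample)
{
	return 7 * old_rate / 8 + sample / 8;	/* EWMA with alpha = 7/8 */
}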
1279 * bfq_bfqq_expire - expire a queue.
1280 * @bfqd: device owning the queue.
1281 * @bfqq: the queue to expire.
1282 * @compensate: if true, compensate for the time spent idling.
1283 * @reason: the reason causing the expiration.
1286 * If the process associated to the queue is slow (i.e., seeky), or in
1287 * case of budget timeout, or, finally, if it is async, we
1288 * artificially charge it an entire budget (independently of the
1289 * actual service it received). As a consequence, the queue will get
1290 * higher timestamps than the correct ones upon reactivation, and
1291 * hence it will be rescheduled as if it had received more service
1292 * than what it actually received. In the end, this class of processes
1293 * will receive less service in proportion to how slowly they consume
1294 * their budgets (and hence how seriously they tend to lower the
1295 * throughput).
1297 * In contrast, when a queue expires because it has been idling for
1298 * too much or because it exhausted its budget, we do not touch the
1299 * amount of service it has received. Hence when the queue will be
1300 * reactivated and its timestamps updated, the latter will be in sync
1301 * with the actual service received by the queue until expiration.
1303 * Charging a full budget to the first type of queues and the exact
1304 * service to the others has the effect of using the WF2Q+ policy to
1305 * schedule the former on a timeslice basis, without violating the
1306 * service domain guarantees of the latter.
1308 static void bfq_bfqq_expire(struct bfq_data *bfqd,
1309 struct bfq_queue *bfqq,
1310 int compensate,
1311 enum bfqq_expiration reason)
1313 int slow;
1314 BUG_ON(bfqq != bfqd->active_queue);
1316 /* Update disk peak rate for autotuning and check whether the
1317 * process is slow (see bfq_update_peak_rate).
1319 slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
1322 * As explained above, 'punish' slow (i.e., seeky), timed-out
1323 * and async queues, to favor sequential sync workloads.
1325 * Processes doing IO in the slower disk zones will tend to be
1326 * slow(er) even if not seeky. Hence, since the estimated peak
1327 * rate is actually an average over the disk surface, these
1328 * processes may timeout just for bad luck. To avoid punishing
1329 * them we do not charge a full budget to a process that
1330 * succeeded in consuming at least 2/3 of its budget.
1332 if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
1333 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
1334 bfq_bfqq_charge_full_budget(bfqq);
1336 if (bfqd->low_latency && bfqq->raising_coeff == 1)
1337 bfqq->last_rais_start_finish = jiffies;
1339 if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
1340 if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
1341 bfqq->soft_rt_next_start =
1342 jiffies +
1343 HZ * bfqq->entity.service /
1344 bfqd->bfq_raising_max_softrt_rate;
1345 else
1346 bfqq->soft_rt_next_start = -1; /* infinity */
1348 bfq_log_bfqq(bfqd, bfqq,
1349 "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
1350 bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
1352 /* Increase, decrease or leave budget unchanged according to reason */
1353 __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
1354 __bfq_bfqq_expire(bfqd, bfqq);
1358 * Budget timeout is not implemented through a dedicated timer, but
1359 * just checked on request arrivals and completions, as well as on
1360 * idle timer expirations.
1362 static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
1364 if (bfq_bfqq_budget_new(bfqq))
1365 return 0;
1367 if (time_before(jiffies, bfqq->budget_timeout))
1368 return 0;
1370 return 1;
1374 * If we expire a queue that is waiting for the arrival of a new
1375 * request, we may prevent the fictitious timestamp backshifting that
1376 * allows the guarantees of the queue to be preserved (see [1] for
1377 * this tricky aspect). Hence we return true only if this condition
1378 * does not hold, or if the queue is slow enough to deserve only to be
1379 * kicked off for preserving a high throughput.
1381 static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
1383 bfq_log_bfqq(bfqq->bfqd, bfqq,
1384 "may_budget_timeout: wr %d left %d timeout %d",
1385 bfq_bfqq_wait_request(bfqq),
1386 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
1387 bfq_bfqq_budget_timeout(bfqq));
1389 return (!bfq_bfqq_wait_request(bfqq) ||
1390 bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
1392 bfq_bfqq_budget_timeout(bfqq);
1396 * Select a queue for service. If we have a current active queue,
1397 * check whether to continue servicing it, or retrieve and set a new one.
1399 static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
1401 struct bfq_queue *bfqq, *new_bfqq = NULL;
1402 struct request *next_rq;
1403 enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
1405 bfqq = bfqd->active_queue;
1406 if (bfqq == NULL)
1407 goto new_queue;
1409 bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
1412 * If another queue has a request waiting within our mean seek
1413 * distance, let it run. The expire code will check for close
1414 * cooperators and put the close queue at the front of the
1415 * service tree. If possible, merge the expiring queue with the
1416 * new bfqq.
1418 new_bfqq = bfq_close_cooperator(bfqd, bfqq);
1419 if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
1420 bfq_setup_merge(bfqq, new_bfqq);
1422 if (bfq_may_expire_for_budg_timeout(bfqq))
1423 goto expire;
1425 next_rq = bfqq->next_rq;
1427 * If bfqq has requests queued and it has enough budget left to
1428 * serve them, keep the queue, otherwise expire it.
1430 if (next_rq != NULL) {
1431 if (bfq_serv_to_charge(next_rq, bfqq) >
1432 bfq_bfqq_budget_left(bfqq)) {
1433 reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
1434 goto expire;
1435 } else {
1437 * The idle timer may be pending because we may not
1438 * disable disk idling even when a new request arrives
1440 if (timer_pending(&bfqd->idle_slice_timer)) {
1442 * If we get here: 1) at least a new request
1443 * has arrived but we have not disabled the
1444 * timer because the request was too small,
1445 * 2) then the block layer has unplugged the
1446 * device, causing the dispatch to be invoked.
1448 * Since the device is unplugged, now the
1449 * requests are probably large enough to
1450 * provide a reasonable throughput.
1451 * So we disable idling.
1453 bfq_clear_bfqq_wait_request(bfqq);
1454 del_timer(&bfqd->idle_slice_timer);
1456 if (new_bfqq == NULL)
1457 goto keep_queue;
1458 else
1459 goto expire;
1464 * No requests pending. If there is no cooperator, and the active
1465 * queue still has requests in flight or is idling for a new request,
1466 * then keep it.
1468 if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
1469 (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq)))) {
1470 bfqq = NULL;
1471 goto keep_queue;
1472 } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
1474 * Expiring the queue because there is a close cooperator,
1475 * cancel timer.
1477 bfq_clear_bfqq_wait_request(bfqq);
1478 del_timer(&bfqd->idle_slice_timer);
1481 reason = BFQ_BFQQ_NO_MORE_REQUESTS;
1482 expire:
1483 bfq_bfqq_expire(bfqd, bfqq, 0, reason);
1484 new_queue:
1485 bfqq = bfq_set_active_queue(bfqd, new_bfqq);
1486 bfq_log(bfqd, "select_queue: new queue %d returned",
1487 bfqq != NULL ? bfqq->pid : 0);
1488 keep_queue:
1489 return bfqq;
1492 static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1494 if (bfqq->raising_coeff > 1) { /* queue is being boosted */
1495 struct bfq_entity *entity = &bfqq->entity;
1497 bfq_log_bfqq(bfqd, bfqq,
1498 "raising period dur %u/%u msec, "
1499 "old raising coeff %u, w %d(%d)",
1500 jiffies_to_msecs(jiffies -
1501 bfqq->last_rais_start_finish),
1502 jiffies_to_msecs(bfqq->raising_cur_max_time),
1503 bfqq->raising_coeff,
1504 bfqq->entity.weight, bfqq->entity.orig_weight);
1506 BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
1507 entity->orig_weight * bfqq->raising_coeff);
1508 if (entity->ioprio_changed)
1509 bfq_log_bfqq(bfqd, bfqq,
1510 "WARN: pending prio change");
1512 * If too much time has elapsed from the beginning
1513 * of this weight-raising period and the process is not soft
1514 * real-time, stop it.
1516 if (jiffies - bfqq->last_rais_start_finish >
1517 bfqq->raising_cur_max_time) {
1518 int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1519 bfqq->soft_rt_next_start < jiffies;
1521 bfqq->last_rais_start_finish = jiffies;
1522 if (soft_rt)
1523 bfqq->raising_cur_max_time =
1524 bfqd->bfq_raising_rt_max_time;
1525 else {
1526 bfq_log_bfqq(bfqd, bfqq,
1527 "wrais ending at %llu msec,"
1528 "rais_max_time %u",
1529 bfqq->last_rais_start_finish,
1530 jiffies_to_msecs(bfqq->
1531 raising_cur_max_time));
1532 bfqq->raising_coeff = 1;
1533 entity->ioprio_changed = 1;
1534 __bfq_entity_update_weight_prio(
1535 bfq_entity_service_tree(entity),
1536 entity);
1544 * Dispatch one request from bfqq, moving it to the request queue
1545 * dispatch list.
1547 static int bfq_dispatch_request(struct bfq_data *bfqd,
1548 struct bfq_queue *bfqq)
1550 int dispatched = 0;
1551 struct request *rq;
1552 unsigned long service_to_charge;
1554 BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
1556 /* Follow expired path, else get first next available. */
1557 rq = bfq_check_fifo(bfqq);
1558 if (rq == NULL)
1559 rq = bfqq->next_rq;
1560 service_to_charge = bfq_serv_to_charge(rq, bfqq);
1562 if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
1564 * This may happen if the next rq is chosen
1565 * in fifo order instead of sector order.
1566 * The budget is properly dimensioned
1567 * to be always sufficient to serve the next request
1568 * only if it is chosen in sector order. The reason is
1569 * that it would be quite inefficient and of little use
1570 * to always make sure that the budget is large enough
1571 * to serve even the possible next rq in fifo order.
1572 * In fact, requests are seldom served in fifo order.
1574 * Expire the queue for budget exhaustion, and
1575 * make sure that the next act_budget is enough
1576 * to serve the next request, even if it comes
1577 * from the fifo expired path.
1579 bfqq->next_rq = rq;
1581 * Since this dispatch failed, make sure that
1582 * a new one will be performed
1584 if (!bfqd->rq_in_driver)
1585 bfq_schedule_dispatch(bfqd);
1586 goto expire;
1589 /* Finally, insert request into driver dispatch list. */
1590 bfq_bfqq_served(bfqq, service_to_charge);
1591 bfq_dispatch_insert(bfqd->queue, rq);
1593 update_raising_data(bfqd, bfqq);
1595 bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
1596 "budg left %lu",
1597 blk_rq_sectors(rq),
1598 (long long unsigned)blk_rq_pos(rq),
1599 bfq_bfqq_budget_left(bfqq));
1601 dispatched++;
1603 if (bfqd->active_bic == NULL) {
1604 atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
1605 bfqd->active_bic = RQ_BIC(rq);
1608 if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
1609 dispatched >= bfqd->bfq_max_budget_async_rq) ||
1610 bfq_class_idle(bfqq)))
1611 goto expire;
1613 return dispatched;
1615 expire:
1616 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
1617 return dispatched;
1620 static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
1622 int dispatched = 0;
1624 while (bfqq->next_rq != NULL) {
1625 bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
1626 dispatched++;
1629 BUG_ON(!list_empty(&bfqq->fifo));
1630 return dispatched;
1634 * Drain our current requests. Used for barriers and when switching
1635 * io schedulers on-the-fly.
1637 static int bfq_forced_dispatch(struct bfq_data *bfqd)
1639 struct bfq_queue *bfqq, *n;
1640 struct bfq_service_tree *st;
1641 int dispatched = 0;
1643 bfqq = bfqd->active_queue;
1644 if (bfqq != NULL)
1645 __bfq_bfqq_expire(bfqd, bfqq);
1648 * Loop through classes, and be careful to leave the scheduler
1649 * in a consistent state, as feedback mechanisms and vtime
1650 * updates cannot be disabled during the process.
1652 list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
1653 st = bfq_entity_service_tree(&bfqq->entity);
1655 dispatched += __bfq_forced_dispatch_bfqq(bfqq);
1656 bfqq->max_budget = bfq_max_budget(bfqd);
1658 bfq_forget_idle(st);
1661 BUG_ON(bfqd->busy_queues != 0);
1663 return dispatched;
1666 static int bfq_dispatch_requests(struct request_queue *q, int force)
1668 struct bfq_data *bfqd = q->elevator->elevator_data;
1669 struct bfq_queue *bfqq;
1670 int max_dispatch;
1672 bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
1673 if (bfqd->busy_queues == 0)
1674 return 0;
1676 if (unlikely(force))
1677 return bfq_forced_dispatch(bfqd);
1679 if ((bfqq = bfq_select_queue(bfqd)) == NULL)
1680 return 0;
1682 max_dispatch = bfqd->bfq_quantum;
1683 if (bfq_class_idle(bfqq))
1684 max_dispatch = 1;
1686 if (!bfq_bfqq_sync(bfqq))
1687 max_dispatch = bfqd->bfq_max_budget_async_rq;
1689 if (bfqq->dispatched >= max_dispatch) {
1690 if (bfqd->busy_queues > 1)
1691 return 0;
1692 if (bfqq->dispatched >= 4 * max_dispatch)
1693 return 0;
1696 if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
1697 return 0;
1699 bfq_clear_bfqq_wait_request(bfqq);
1700 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
1702 if (!bfq_dispatch_request(bfqd, bfqq))
1703 return 0;
1705 bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
1706 "(max_disp %d)", bfqq->pid, max_dispatch);
1708 return 1;
1712 * Task holds one reference to the queue, dropped when task exits. Each rq
1713 * in-flight on this queue also holds a reference, dropped when rq is freed.
1715 * Queue lock must be held here.
1717 static void bfq_put_queue(struct bfq_queue *bfqq)
1719 struct bfq_data *bfqd = bfqq->bfqd;
1721 BUG_ON(atomic_read(&bfqq->ref) <= 0);
1723 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
1724 atomic_read(&bfqq->ref));
1725 if (!atomic_dec_and_test(&bfqq->ref))
1726 return;
1728 BUG_ON(rb_first(&bfqq->sort_list) != NULL);
1729 BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
1730 BUG_ON(bfqq->entity.tree != NULL);
1731 BUG_ON(bfq_bfqq_busy(bfqq));
1732 BUG_ON(bfqd->active_queue == bfqq);
1734 bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
1736 kmem_cache_free(bfq_pool, bfqq);
1739 static void bfq_put_cooperator(struct bfq_queue *bfqq)
1741 struct bfq_queue *__bfqq, *next;
1744 * If this queue was scheduled to merge with another queue, be
1745 * sure to drop the reference taken on that queue (and others in
1746 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
1748 __bfqq = bfqq->new_bfqq;
1749 while (__bfqq) {
1750 if (__bfqq == bfqq) {
1751 WARN(1, "bfqq->new_bfqq loop detected.\n");
1752 break;
1754 next = __bfqq->new_bfqq;
1755 bfq_put_queue(__bfqq);
1756 __bfqq = next;
1760 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1762 if (bfqq == bfqd->active_queue) {
1763 __bfq_bfqq_expire(bfqd, bfqq);
1764 bfq_schedule_dispatch(bfqd);
1767 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
1768 atomic_read(&bfqq->ref));
1770 bfq_put_cooperator(bfqq);
1772 bfq_put_queue(bfqq);
1775 static void bfq_init_icq(struct io_cq *icq)
1777 struct bfq_io_cq *bic = icq_to_bic(icq);
1779 bic->ttime.last_end_request = jiffies;
1782 static void bfq_exit_icq(struct io_cq *icq)
1784 struct bfq_io_cq *bic = icq_to_bic(icq);
1785 struct bfq_data *bfqd = bic_to_bfqd(bic);
1787 if (bic->bfqq[BLK_RW_ASYNC]) {
1788 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
1789 bic->bfqq[BLK_RW_ASYNC] = NULL;
1792 if (bic->bfqq[BLK_RW_SYNC]) {
1793 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
1794 bic->bfqq[BLK_RW_SYNC] = NULL;
1799 * Update the entity prio values; note that the new values will not
1800 * be used until the next (re)activation.
1802 static void bfq_init_prio_data(struct bfq_queue *bfqq, struct io_context *ioc)
1804 struct task_struct *tsk = current;
1805 int ioprio_class;
1807 if (!bfq_bfqq_prio_changed(bfqq))
1808 return;
1810 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
1811 switch (ioprio_class) {
1812 default:
1813 printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
1814 case IOPRIO_CLASS_NONE:
1816 * No prio set, inherit CPU scheduling settings.
1818 bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
1819 bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
1820 break;
1821 case IOPRIO_CLASS_RT:
1822 bfqq->entity.new_ioprio = task_ioprio(ioc);
1823 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
1824 break;
1825 case IOPRIO_CLASS_BE:
1826 bfqq->entity.new_ioprio = task_ioprio(ioc);
1827 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
1828 break;
1829 case IOPRIO_CLASS_IDLE:
1830 bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
1831 bfqq->entity.new_ioprio = 7;
1832 bfq_clear_bfqq_idle_window(bfqq);
1833 break;
1836 bfqq->entity.ioprio_changed = 1;
1839 * Keep track of original prio settings in case we have to temporarily
1840 * elevate the priority of this queue.
1842 bfqq->org_ioprio = bfqq->entity.new_ioprio;
1843 bfq_clear_bfqq_prio_changed(bfqq);
1846 static void bfq_changed_ioprio(struct io_context *ioc,
1847 struct bfq_io_cq *bic)
1849 struct bfq_data *bfqd;
1850 struct bfq_queue *bfqq, *new_bfqq;
1851 struct bfq_group *bfqg;
1852 unsigned long uninitialized_var(flags);
1854 bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
1855 if (unlikely(bfqd == NULL))
1856 return;
1858 bfqq = bic->bfqq[BLK_RW_ASYNC];
1859 if (bfqq != NULL) {
1860 bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
1861 sched_data);
1862 new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
1863 GFP_ATOMIC);
1864 if (new_bfqq != NULL) {
1865 bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
1866 bfq_log_bfqq(bfqd, bfqq,
1867 "changed_ioprio: bfqq %p %d",
1868 bfqq, atomic_read(&bfqq->ref));
1869 bfq_put_queue(bfqq);
1873 bfqq = bic->bfqq[BLK_RW_SYNC];
1874 if (bfqq != NULL)
1875 bfq_mark_bfqq_prio_changed(bfqq);
1877 bfq_put_bfqd_unlock(bfqd, &flags);
1880 static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1881 pid_t pid, int is_sync)
1883 RB_CLEAR_NODE(&bfqq->entity.rb_node);
1884 INIT_LIST_HEAD(&bfqq->fifo);
1886 atomic_set(&bfqq->ref, 0);
1887 bfqq->bfqd = bfqd;
1889 bfq_mark_bfqq_prio_changed(bfqq);
1891 if (is_sync) {
1892 if (!bfq_class_idle(bfqq))
1893 bfq_mark_bfqq_idle_window(bfqq);
1894 bfq_mark_bfqq_sync(bfqq);
1897 /* Tentative initial value to trade off between throughput and latency */
1898 bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
1899 bfqq->pid = pid;
1901 bfqq->raising_coeff = 1;
1902 bfqq->last_rais_start_finish = 0;
1903 bfqq->soft_rt_next_start = -1;
1906 static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
1907 struct bfq_group *bfqg,
1908 int is_sync,
1909 struct io_context *ioc,
1910 gfp_t gfp_mask)
1912 struct bfq_queue *bfqq, *new_bfqq = NULL;
1913 struct bfq_io_cq *bic;
1915 retry:
1916 bic = bfq_bic_lookup(bfqd, ioc);
1917 /* bic always exists here */
1918 bfqq = bic_to_bfqq(bic, is_sync);
1921 * Always try a new alloc if we fall back to the OOM bfqq
1922 * originally, since it should just be a temporary situation.
1924 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
1925 bfqq = NULL;
1926 if (new_bfqq != NULL) {
1927 bfqq = new_bfqq;
1928 new_bfqq = NULL;
1929 } else if (gfp_mask & __GFP_WAIT) {
1930 spin_unlock_irq(bfqd->queue->queue_lock);
1931 new_bfqq = kmem_cache_alloc_node(bfq_pool,
1932 gfp_mask | __GFP_ZERO,
1933 bfqd->queue->node);
1934 spin_lock_irq(bfqd->queue->queue_lock);
1935 if (new_bfqq != NULL)
1936 goto retry;
1937 } else {
1938 bfqq = kmem_cache_alloc_node(bfq_pool,
1939 gfp_mask | __GFP_ZERO,
1940 bfqd->queue->node);
1943 if (bfqq != NULL) {
1944 bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
1945 bfq_log_bfqq(bfqd, bfqq, "allocated");
1946 } else {
1947 bfqq = &bfqd->oom_bfqq;
1948 bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
1951 bfq_init_prio_data(bfqq, ioc);
1952 bfq_init_entity(&bfqq->entity, bfqg);
1955 if (new_bfqq != NULL)
1956 kmem_cache_free(bfq_pool, new_bfqq);
1958 return bfqq;
1961 static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
1962 struct bfq_group *bfqg,
1963 int ioprio_class, int ioprio)
1965 switch (ioprio_class) {
1966 case IOPRIO_CLASS_RT:
1967 return &bfqg->async_bfqq[0][ioprio];
1968 case IOPRIO_CLASS_BE:
1969 return &bfqg->async_bfqq[1][ioprio];
1970 case IOPRIO_CLASS_IDLE:
1971 return &bfqg->async_idle_bfqq;
1972 default:
1973 BUG();
1977 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
1978 struct bfq_group *bfqg, int is_sync,
1979 struct io_context *ioc, gfp_t gfp_mask)
1981 const int ioprio = task_ioprio(ioc);
1982 const int ioprio_class = task_ioprio_class(ioc);
1983 struct bfq_queue **async_bfqq = NULL;
1984 struct bfq_queue *bfqq = NULL;
1986 if (!is_sync) {
1987 async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
1988 ioprio);
1989 bfqq = *async_bfqq;
1992 if (bfqq == NULL)
1993 bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
1995 /*
1996 * Pin the queue now that it's allocated; scheduler exit will prune it.
1997 */
1998 if (!is_sync && *async_bfqq == NULL) {
1999 atomic_inc(&bfqq->ref);
2000 bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
2001 bfqq, atomic_read(&bfqq->ref));
2002 *async_bfqq = bfqq;
2005 atomic_inc(&bfqq->ref);
2006 bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
2007 atomic_read(&bfqq->ref));
2008 return bfqq;
2011 static void bfq_update_io_thinktime(struct bfq_data *bfqd,
2012 struct bfq_io_cq *bic)
2014 unsigned long elapsed = jiffies - bic->ttime.last_end_request;
2015 unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
2017 bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
2018 bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
2019 bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
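/*
 * The three updates above form a fixed-point exponential moving average:
 * each new sample gets weight 1/8 and 256 is the scale factor.
 * ttime_samples converges to 256 (the fixed point of x = (7x + 256) / 8),
 * so ttime_mean ends up as the decayed mean think time in jiffies, with
 * the +128 term providing rounding.
 */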
2022 static void bfq_update_io_seektime(struct bfq_data *bfqd,
2023 struct bfq_queue *bfqq,
2024 struct request *rq)
2026 sector_t sdist;
2027 u64 total;
2029 if (bfqq->last_request_pos < blk_rq_pos(rq))
2030 sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
2031 else
2032 sdist = bfqq->last_request_pos - blk_rq_pos(rq);
2034 /*
2035 * Don't allow the seek distance to get too large from the
2036 * odd fragment, pagein, etc.
2037 */
2038 if (bfqq->seek_samples == 0) /* first request, not really a seek */
2039 sdist = 0;
2040 else if (bfqq->seek_samples <= 60) /* second & third seek */
2041 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
2042 else
2043 sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
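/*
 * seek_samples is 32 after the first update and 60 after the second, so
 * the "<= 60" test above indeed covers the second and third requests;
 * the clamp bounds the contribution of any single outlier to about four
 * times the current mean plus a fixed slack, which is larger while only
 * a couple of samples have been collected. The same 7/8 decaying
 * average used for think times is then applied below.
 */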
2045 bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
2046 bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
2047 total = bfqq->seek_total + (bfqq->seek_samples/2);
2048 do_div(total, bfqq->seek_samples);
2049 if (bfq_bfqq_coop(bfqq)) {
2050 /*
2051 * If the mean seek time increases for a (non-seeky) shared
2052 * queue, some cooperator is likely to be idling too much.
2053 * Conversely, if it decreases, some cooperator has
2054 * probably woken up.
2055 */
2057 if ((sector_t)total < bfqq->seek_mean)
2058 bfq_mark_bfqq_some_coop_idle(bfqq);
2059 else if ((sector_t)total > bfqq->seek_mean)
2060 bfq_clear_bfqq_some_coop_idle(bfqq);
2062 bfqq->seek_mean = (sector_t)total;
2064 bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
2065 (u64)bfqq->seek_mean);
2068 /*
2069 * Disable idle window if the process thinks too long or seeks so much that
2070 * it doesn't matter.
2071 */
2072 static void bfq_update_idle_window(struct bfq_data *bfqd,
2073 struct bfq_queue *bfqq,
2074 struct bfq_io_cq *bic)
2076 int enable_idle;
2078 /* Don't idle for async or idle io prio class. */
2079 if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
2080 return;
2082 enable_idle = bfq_bfqq_idle_window(bfqq);
2084 if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
2085 bfqd->bfq_slice_idle == 0 ||
2086 (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
2087 bfqq->raising_coeff == 1))
2088 enable_idle = 0;
2089 else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
2090 if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
2091 bfqq->raising_coeff == 1)
2092 enable_idle = 0;
2093 else
2094 enable_idle = 1;
2096 bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
2097 enable_idle);
2099 if (enable_idle)
2100 bfq_mark_bfqq_idle_window(bfqq);
2101 else
2102 bfq_clear_bfqq_idle_window(bfqq);
2105 /*
2106 * Called when a new fs request (rq) is added to bfqq. Check if there's
2107 * something we should do about it.
2108 */
2109 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2110 struct request *rq)
2112 struct bfq_io_cq *bic = RQ_BIC(rq);
2114 if (rq->cmd_flags & REQ_META)
2115 bfqq->meta_pending++;
2117 bfq_update_io_thinktime(bfqd, bic);
2118 bfq_update_io_seektime(bfqd, bfqq, rq);
2119 if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
2120 !BFQQ_SEEKY(bfqq))
2121 bfq_update_idle_window(bfqd, bfqq, bic);
2123 bfq_log_bfqq(bfqd, bfqq,
2124 "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
2125 bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
2126 (long long unsigned)bfqq->seek_mean);
2128 bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
2130 if (bfqq == bfqd->active_queue) {
2131 /*
2132 * If there is just this request queued and the request
2133 * is small, just exit.
2134 * In this way, if the disk is being idled to wait for a new
2135 * request from the active queue, we avoid unplugging the
2136 * device now.
2137 *
2138 * By doing so, we avoid committing the device to serving just
2139 * a small request. Instead, we wait for
2140 * the block layer to decide when to unplug the device:
2141 * hopefully, new requests will be merged to this
2142 * one quickly, then the device will be unplugged
2143 * and larger requests will be dispatched.
2144 */
2145 if (bfqq->queued[rq_is_sync(rq)] == 1 &&
2146 blk_rq_sectors(rq) < 32) {
2147 return;
2149 if (bfq_bfqq_wait_request(bfqq)) {
2150 /*
2151 * If we are waiting for a request for this queue, let
2152 * it rip immediately and flag that we must not expire
2153 * this queue just now.
2154 */
2155 bfq_clear_bfqq_wait_request(bfqq);
2156 del_timer(&bfqd->idle_slice_timer);
2157 /*
2158 * Here we can safely expire the queue, in
2159 * case of budget timeout, without wasting
2160 * guarantees.
2161 */
2162 if (bfq_bfqq_budget_timeout(bfqq))
2163 bfq_bfqq_expire(bfqd, bfqq, 0,
2164 BFQ_BFQQ_BUDGET_TIMEOUT);
2165 __blk_run_queue(bfqd->queue);
2170 static void bfq_insert_request(struct request_queue *q, struct request *rq)
2172 struct bfq_data *bfqd = q->elevator->elevator_data;
2173 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2175 assert_spin_locked(bfqd->queue->queue_lock);
2176 bfq_init_prio_data(bfqq, RQ_BIC(rq)->icq.ioc);
2178 bfq_add_rq_rb(rq);
2180 rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
2181 list_add_tail(&rq->queuelist, &bfqq->fifo);
2183 bfq_rq_enqueued(bfqd, bfqq, rq);
2186 static void bfq_update_hw_tag(struct bfq_data *bfqd)
2188 bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
2189 bfqd->rq_in_driver);
2191 /*
2192 * This sample is valid if the number of outstanding requests
2193 * is large enough to allow a queueing behavior. Note that the
2194 * sum is not exact, as it's not taking into account deactivated
2195 * requests.
2196 */
2197 if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
2198 return;
2200 if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
2201 return;
2203 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
2204 bfqd->max_rq_in_driver = 0;
2205 bfqd->hw_tag_samples = 0;
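/*
 * Net effect: hw_tag gets set only if, within a window of
 * BFQ_HW_QUEUE_SAMPLES valid samples, the driver was seen holding more
 * than BFQ_HW_QUEUE_THRESHOLD requests at once, i.e. the device appears
 * to perform internal command queueing.
 */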
2208 static void bfq_completed_request(struct request_queue *q, struct request *rq)
2210 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2211 struct bfq_data *bfqd = bfqq->bfqd;
2212 const int sync = rq_is_sync(rq);
2214 bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
2215 blk_rq_sectors(rq), sync);
2217 bfq_update_hw_tag(bfqd);
2219 WARN_ON(!bfqd->rq_in_driver);
2220 WARN_ON(!bfqq->dispatched);
2221 bfqd->rq_in_driver--;
2222 bfqq->dispatched--;
2224 if (bfq_bfqq_sync(bfqq))
2225 bfqd->sync_flight--;
2227 if (sync)
2228 RQ_BIC(rq)->ttime.last_end_request = jiffies;
2230 /*
2231 * If this is the active queue, check if it needs to be expired,
2232 * or if we want to idle in case it has no pending requests.
2233 */
2234 if (bfqd->active_queue == bfqq) {
2235 if (bfq_bfqq_budget_new(bfqq))
2236 bfq_set_budget_timeout(bfqd);
2238 /* Idling is disabled also for cooperation issues:
2239 * 1) there is a close cooperator for the queue, or
2240 * 2) the queue is shared and some cooperator is likely
2241 * to be idle (in this case, by not arming the idle timer,
2242 * we try to slow down the queue, to prevent the zones
2243 * of the disk accessed by the active cooperators from
2244 * drifting too far from the zone that will be accessed by the
2245 * currently idle cooperators).
2246 */
2247 if (bfq_may_expire_for_budg_timeout(bfqq))
2248 bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
2249 else if (sync &&
2250 (bfqd->rq_in_driver == 0 ||
2251 bfqq->raising_coeff > 1)
2252 && RB_EMPTY_ROOT(&bfqq->sort_list)
2253 && !bfq_close_cooperator(bfqd, bfqq)
2254 && (!bfq_bfqq_coop(bfqq) ||
2255 !bfq_bfqq_some_coop_idle(bfqq)))
2256 bfq_arm_slice_timer(bfqd);
2259 if (!bfqd->rq_in_driver)
2260 bfq_schedule_dispatch(bfqd);
2263 static inline int __bfq_may_queue(struct bfq_queue *bfqq)
2265 if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
2266 bfq_clear_bfqq_must_alloc(bfqq);
2267 return ELV_MQUEUE_MUST;
2270 return ELV_MQUEUE_MAY;
2273 static int bfq_may_queue(struct request_queue *q, int rw)
2275 struct bfq_data *bfqd = q->elevator->elevator_data;
2276 struct task_struct *tsk = current;
2277 struct bfq_io_cq *bic;
2278 struct bfq_queue *bfqq;
2280 /*
2281 * Don't force setup of a queue from here, as a call to may_queue
2282 * does not necessarily imply that a request actually will be queued.
2283 * So just look up a possibly existing queue, or return 'may queue'
2284 * if that fails.
2285 */
2286 bic = bfq_bic_lookup(bfqd, tsk->io_context);
2287 if (bic == NULL)
2288 return ELV_MQUEUE_MAY;
2290 bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
2291 if (bfqq != NULL) {
2292 bfq_init_prio_data(bfqq, bic->icq.ioc);
2294 return __bfq_may_queue(bfqq);
2297 return ELV_MQUEUE_MAY;
2300 /*
2301 * Queue lock held here.
2302 */
2303 static void bfq_put_request(struct request *rq)
2305 struct bfq_queue *bfqq = RQ_BFQQ(rq);
2307 if (bfqq != NULL) {
2308 const int rw = rq_data_dir(rq);
2310 BUG_ON(!bfqq->allocated[rw]);
2311 bfqq->allocated[rw]--;
2313 rq->elv.priv[0] = NULL;
2314 rq->elv.priv[1] = NULL;
2316 bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
2317 bfqq, atomic_read(&bfqq->ref));
2318 bfq_put_queue(bfqq);
2322 static struct bfq_queue *
2323 bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
2324 struct bfq_queue *bfqq)
2326 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
2327 (long unsigned)bfqq->new_bfqq->pid);
2328 bic_set_bfqq(bic, bfqq->new_bfqq, 1);
2329 bfq_mark_bfqq_coop(bfqq->new_bfqq);
2330 bfq_put_queue(bfqq);
2331 return bic_to_bfqq(bic, 1);
2334 /*
2335 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
2336 * was the last process referring to said bfqq.
2337 */
2338 static struct bfq_queue *
2339 bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
2341 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
2342 if (bfqq_process_refs(bfqq) == 1) {
2343 bfqq->pid = current->pid;
2344 bfq_clear_bfqq_some_coop_idle(bfqq);
2345 bfq_clear_bfqq_coop(bfqq);
2346 bfq_clear_bfqq_split_coop(bfqq);
2347 return bfqq;
2350 bic_set_bfqq(bic, NULL, 1);
2352 bfq_put_cooperator(bfqq);
2354 bfq_put_queue(bfqq);
2355 return NULL;
2358 /*
2359 * Allocate bfq data structures associated with this request.
2360 */
2361 static int bfq_set_request(struct request_queue *q, struct request *rq,
2362 gfp_t gfp_mask)
2364 struct bfq_data *bfqd = q->elevator->elevator_data;
2365 struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
2366 const int rw = rq_data_dir(rq);
2367 const int is_sync = rq_is_sync(rq);
2368 struct bfq_queue *bfqq;
2369 struct bfq_group *bfqg;
2370 unsigned long flags;
2372 /* handle changed prio notifications; cgroup change is handled separately */
2373 if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
2374 bfq_changed_ioprio(bic->icq.ioc, bic);
2376 might_sleep_if(gfp_mask & __GFP_WAIT);
2378 spin_lock_irqsave(q->queue_lock, flags);
2380 if (bic == NULL)
2381 goto queue_fail;
2383 bfqg = bfq_bic_update_cgroup(bic);
2385 new_queue:
2386 bfqq = bic_to_bfqq(bic, is_sync);
2387 if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
2388 bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
2389 bic_set_bfqq(bic, bfqq, is_sync);
2390 } else {
2391 /*
2392 * If the queue was seeky for too long, break it apart.
2393 */
2394 if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
2395 bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
2396 bfqq = bfq_split_bfqq(bic, bfqq);
2397 if (!bfqq)
2398 goto new_queue;
2401 /*
2402 * Check to see if this queue is scheduled to merge with
2403 * another closely cooperating queue. The merging of queues
2404 * happens here as it must be done in process context.
2405 * The reference on new_bfqq was taken in merge_bfqqs.
2406 */
2407 if (bfqq->new_bfqq != NULL)
2408 bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
2411 bfqq->allocated[rw]++;
2412 atomic_inc(&bfqq->ref);
2413 bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
2414 atomic_read(&bfqq->ref));
2416 rq->elv.priv[0] = bic;
2417 rq->elv.priv[1] = bfqq;
2419 spin_unlock_irqrestore(q->queue_lock, flags);
2421 return 0;
2423 queue_fail:
2424 bfq_schedule_dispatch(bfqd);
2425 spin_unlock_irqrestore(q->queue_lock, flags);
2427 return 1;
2430 static void bfq_kick_queue(struct work_struct *work)
2432 struct bfq_data *bfqd =
2433 container_of(work, struct bfq_data, unplug_work);
2434 struct request_queue *q = bfqd->queue;
2436 spin_lock_irq(q->queue_lock);
2437 __blk_run_queue(q);
2438 spin_unlock_irq(q->queue_lock);
2441 /*
2442 * Handler of the expiration of the timer running if the active_queue
2443 * is idling inside its time slice.
2444 */
2445 static void bfq_idle_slice_timer(unsigned long data)
2447 struct bfq_data *bfqd = (struct bfq_data *)data;
2448 struct bfq_queue *bfqq;
2449 unsigned long flags;
2450 enum bfqq_expiration reason;
2452 spin_lock_irqsave(bfqd->queue->queue_lock, flags);
2454 bfqq = bfqd->active_queue;
2455 /*
2456 * Theoretical race here: active_queue can be NULL or different
2457 * from the queue that was idling if the timer handler spins on
2458 * the queue_lock and a new request arrives for the current
2459 * queue and there is a full dispatch cycle that changes the
2460 * active_queue. This is very unlikely to happen, but in the worst
2461 * case we just expire a queue too early.
2462 */
2463 if (bfqq != NULL) {
2464 bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
2465 if (bfq_bfqq_budget_timeout(bfqq))
2466 /*
2467 * Also here the queue can be safely expired
2468 * for budget timeout without wasting
2469 * guarantees.
2470 */
2471 reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2472 else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
2473 /*
2474 * The queue may not be empty upon timer expiration,
2475 * because we may not disable the timer when the first
2476 * request of the active queue arrives during
2477 * disk idling.
2478 */
2479 reason = BFQ_BFQQ_TOO_IDLE;
2480 else
2481 goto schedule_dispatch;
2483 bfq_bfqq_expire(bfqd, bfqq, 1, reason);
2486 schedule_dispatch:
2487 bfq_schedule_dispatch(bfqd);
2489 spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
2492 static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
2494 del_timer_sync(&bfqd->idle_slice_timer);
2495 cancel_work_sync(&bfqd->unplug_work);
2498 static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
2499 struct bfq_queue **bfqq_ptr)
2501 struct bfq_group *root_group = bfqd->root_group;
2502 struct bfq_queue *bfqq = *bfqq_ptr;
2504 bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
2505 if (bfqq != NULL) {
2506 bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
2507 bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
2508 bfqq, atomic_read(&bfqq->ref));
2509 bfq_put_queue(bfqq);
2510 *bfqq_ptr = NULL;
2514 /*
2515 * Release all the bfqg references to its async queues. If we are
2516 * deallocating the group these queues may still contain requests, so
2517 * we reparent them to the root cgroup (i.e., the only one that will
2518 * exist for sure until all the requests on a device are gone).
2519 */
2520 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
2522 int i, j;
2524 for (i = 0; i < 2; i++)
2525 for (j = 0; j < IOPRIO_BE_NR; j++)
2526 __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
2528 __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
2531 static void bfq_exit_queue(struct elevator_queue *e)
2533 struct bfq_data *bfqd = e->elevator_data;
2534 struct request_queue *q = bfqd->queue;
2535 struct bfq_queue *bfqq, *n;
2537 bfq_shutdown_timer_wq(bfqd);
2539 spin_lock_irq(q->queue_lock);
2541 BUG_ON(bfqd->active_queue != NULL);
2542 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
2543 bfq_deactivate_bfqq(bfqd, bfqq, 0);
2545 bfq_disconnect_groups(bfqd);
2546 spin_unlock_irq(q->queue_lock);
2548 bfq_shutdown_timer_wq(bfqd);
2550 synchronize_rcu();
2552 BUG_ON(timer_pending(&bfqd->idle_slice_timer));
2554 bfq_free_root_group(bfqd);
2555 kfree(bfqd);
2558 static void *bfq_init_queue(struct request_queue *q)
2560 struct bfq_group *bfqg;
2561 struct bfq_data *bfqd;
2563 bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
2564 if (bfqd == NULL)
2565 return NULL;
2567 /*
2568 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
2569 * Grab a permanent reference to it, so that the normal code flow
2570 * will not attempt to free it.
2571 */
2572 bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
2573 atomic_inc(&bfqd->oom_bfqq.ref);
2575 bfqd->queue = q;
2577 bfqg = bfq_alloc_root_group(bfqd, q->node);
2578 if (bfqg == NULL) {
2579 kfree(bfqd);
2580 return NULL;
2583 bfqd->root_group = bfqg;
2585 init_timer(&bfqd->idle_slice_timer);
2586 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
2587 bfqd->idle_slice_timer.data = (unsigned long)bfqd;
2589 bfqd->rq_pos_tree = RB_ROOT;
2591 INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
2593 INIT_LIST_HEAD(&bfqd->active_list);
2594 INIT_LIST_HEAD(&bfqd->idle_list);
2596 bfqd->hw_tag = 1;
2598 bfqd->bfq_max_budget = bfq_default_max_budget;
2600 bfqd->bfq_quantum = bfq_quantum;
2601 bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
2602 bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
2603 bfqd->bfq_back_max = bfq_back_max;
2604 bfqd->bfq_back_penalty = bfq_back_penalty;
2605 bfqd->bfq_slice_idle = bfq_slice_idle;
2606 bfqd->bfq_class_idle_last_service = 0;
2607 bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
2608 bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
2609 bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
2611 bfqd->low_latency = true;
2613 bfqd->bfq_raising_coeff = 20;
2614 bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
2615 bfqd->bfq_raising_max_time = msecs_to_jiffies(7500);
2616 bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
2617 bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
2618 bfqd->bfq_raising_max_softrt_rate = 7000;
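/*
 * Rough meaning of the weight-raising defaults above (the details live
 * in the raising logic elsewhere in this file, so take this only as a
 * summary): bfq_raising_coeff is the factor applied to the weight of a
 * raised queue, bfq_raising_max_time and bfq_raising_rt_max_time bound
 * how long raising lasts for interactive and soft real-time queues
 * respectively, and bfq_raising_max_softrt_rate appears to be the
 * maximum request rate compatible with treating a queue as soft
 * real-time. All of them are exported as sysfs tunables below.
 */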
2620 return bfqd;
2623 static void bfq_slab_kill(void)
2625 if (bfq_pool != NULL)
2626 kmem_cache_destroy(bfq_pool);
2629 static int __init bfq_slab_setup(void)
2631 bfq_pool = KMEM_CACHE(bfq_queue, 0);
2632 if (bfq_pool == NULL)
2633 return -ENOMEM;
2634 return 0;
2637 static ssize_t bfq_var_show(unsigned int var, char *page)
2639 return sprintf(page, "%d\n", var);
2642 static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
2644 unsigned long new_val;
2645 int ret = strict_strtoul(page, 10, &new_val);
2647 if (ret == 0)
2648 *var = new_val;
2650 return count;
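/*
 * Note that bfq_var_store() always returns count, so callers cannot
 * distinguish a parse failure from success; on failure *var is simply
 * left untouched. In the STORE_FUNCTION() macros below this means an
 * unparsable string leaves __data indeterminate before it is clamped
 * and stored, so sysfs writes are expected to carry a plain decimal
 * number.
 */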
2653 static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
2655 struct bfq_queue *bfqq;
2656 struct bfq_data *bfqd = e->elevator_data;
2657 ssize_t num_char = 0;
2659 num_char += sprintf(page + num_char, "Active:\n");
2660 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
2661 num_char += sprintf(page + num_char,
2662 "pid%d: weight %hu, dur %d/%u\n",
2663 bfqq->pid,
2664 bfqq->entity.weight,
2665 jiffies_to_msecs(jiffies -
2666 bfqq->last_rais_start_finish),
2667 jiffies_to_msecs(bfqq->raising_cur_max_time));
2669 num_char += sprintf(page + num_char, "Idle:\n");
2670 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
2671 num_char += sprintf(page + num_char,
2672 "pid%d: weight %hu, dur %d/%u\n",
2673 bfqq->pid,
2674 bfqq->entity.weight,
2675 jiffies_to_msecs(jiffies -
2676 bfqq->last_rais_start_finish),
2677 jiffies_to_msecs(bfqq->raising_cur_max_time));
2679 return num_char;
2682 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
2683 static ssize_t __FUNC(struct elevator_queue *e, char *page) \
2685 struct bfq_data *bfqd = e->elevator_data; \
2686 unsigned int __data = __VAR; \
2687 if (__CONV) \
2688 __data = jiffies_to_msecs(__data); \
2689 return bfq_var_show(__data, (page)); \
2691 SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
2692 SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
2693 SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
2694 SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
2695 SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
2696 SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
2697 SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
2698 SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
2699 SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
2700 SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
2701 SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
2702 SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
2703 SHOW_FUNCTION(bfq_raising_max_time_show, bfqd->bfq_raising_max_time, 1);
2704 SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
2705 SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
2706 1);
2707 SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
2708 bfqd->bfq_raising_min_inter_arr_async,
2709 1);
2710 SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
2711 bfqd->bfq_raising_max_softrt_rate, 0);
2712 #undef SHOW_FUNCTION
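/*
 * For reference, SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0)
 * above expands to roughly:
 *
 * static ssize_t bfq_quantum_show(struct elevator_queue *e, char *page)
 * {
 *     struct bfq_data *bfqd = e->elevator_data;
 *     unsigned int __data = bfqd->bfq_quantum;
 *     if (0)
 *         __data = jiffies_to_msecs(__data);
 *     return bfq_var_show(__data, (page));
 * }
 *
 * i.e. one read-side sysfs handler per tunable, with __CONV selecting a
 * jiffies-to-milliseconds conversion.
 */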
2714 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
2715 static ssize_t \
2716 __FUNC(struct elevator_queue *e, const char *page, size_t count) \
2718 struct bfq_data *bfqd = e->elevator_data; \
2719 unsigned long __data; \
2720 int ret = bfq_var_store(&__data, (page), count); \
2721 if (__data < (MIN)) \
2722 __data = (MIN); \
2723 else if (__data > (MAX)) \
2724 __data = (MAX); \
2725 if (__CONV) \
2726 *(__PTR) = msecs_to_jiffies(__data); \
2727 else \
2728 *(__PTR) = __data; \
2729 return ret; \
2731 STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
2732 STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
2733 INT_MAX, 1);
2734 STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
2735 INT_MAX, 1);
2736 STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
2737 STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
2738 INT_MAX, 0);
2739 STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
2740 STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
2741 1, INT_MAX, 0);
2742 STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
2743 INT_MAX, 1);
2744 STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
2745 INT_MAX, 0);
2746 STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
2747 INT_MAX, 1);
2748 STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
2749 INT_MAX, 1);
2750 STORE_FUNCTION(bfq_raising_min_idle_time_store,
2751 &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
2752 STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
2753 &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
2754 STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
2755 &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
2756 #undef STORE_FUNCTION
2758 /* do nothing for the moment */
2759 static ssize_t bfq_weights_store(struct elevator_queue *e,
2760 const char *page, size_t count)
2762 return count;
2765 static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
2767 u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2769 if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
2770 return bfq_calc_max_budget(bfqd->peak_rate, timeout);
2771 else
2772 return bfq_default_max_budget;
2775 static ssize_t bfq_max_budget_store(struct elevator_queue *e,
2776 const char *page, size_t count)
2778 struct bfq_data *bfqd = e->elevator_data;
2779 unsigned long __data;
2780 int ret = bfq_var_store(&__data, (page), count);
2782 if (__data == 0)
2783 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
2784 else {
2785 if (__data > INT_MAX)
2786 __data = INT_MAX;
2787 bfqd->bfq_max_budget = __data;
2790 bfqd->bfq_user_max_budget = __data;
2792 return ret;
2795 static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
2796 const char *page, size_t count)
2798 struct bfq_data *bfqd = e->elevator_data;
2799 unsigned long __data;
2800 int ret = bfq_var_store(&__data, (page), count);
2802 if (__data < 1)
2803 __data = 1;
2804 else if (__data > INT_MAX)
2805 __data = INT_MAX;
2807 bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
2808 if (bfqd->bfq_user_max_budget == 0)
2809 bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
2811 return ret;
2814 static ssize_t bfq_low_latency_store(struct elevator_queue *e,
2815 const char *page, size_t count)
2817 struct bfq_data *bfqd = e->elevator_data;
2818 unsigned long __data;
2819 int ret = bfq_var_store(&__data, (page), count);
2821 if (__data > 1)
2822 __data = 1;
2823 bfqd->low_latency = __data;
2825 return ret;
2828 #define BFQ_ATTR(name) \
2829 __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
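/*
 * Each attribute in the table below is exposed by the elevator core,
 * typically under /sys/block/<disk>/queue/iosched/<name> (e.g.
 * .../iosched/low_latency), while bfq is the active scheduler for that
 * disk.
 */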
2831 static struct elv_fs_entry bfq_attrs[] = {
2832 BFQ_ATTR(quantum),
2833 BFQ_ATTR(fifo_expire_sync),
2834 BFQ_ATTR(fifo_expire_async),
2835 BFQ_ATTR(back_seek_max),
2836 BFQ_ATTR(back_seek_penalty),
2837 BFQ_ATTR(slice_idle),
2838 BFQ_ATTR(max_budget),
2839 BFQ_ATTR(max_budget_async_rq),
2840 BFQ_ATTR(timeout_sync),
2841 BFQ_ATTR(timeout_async),
2842 BFQ_ATTR(low_latency),
2843 BFQ_ATTR(raising_coeff),
2844 BFQ_ATTR(raising_max_time),
2845 BFQ_ATTR(raising_rt_max_time),
2846 BFQ_ATTR(raising_min_idle_time),
2847 BFQ_ATTR(raising_min_inter_arr_async),
2848 BFQ_ATTR(raising_max_softrt_rate),
2849 BFQ_ATTR(weights),
2850 __ATTR_NULL
2853 static struct elevator_type iosched_bfq = {
2854 .ops = {
2855 .elevator_merge_fn = bfq_merge,
2856 .elevator_merged_fn = bfq_merged_request,
2857 .elevator_merge_req_fn = bfq_merged_requests,
2858 .elevator_allow_merge_fn = bfq_allow_merge,
2859 .elevator_dispatch_fn = bfq_dispatch_requests,
2860 .elevator_add_req_fn = bfq_insert_request,
2861 .elevator_activate_req_fn = bfq_activate_request,
2862 .elevator_deactivate_req_fn = bfq_deactivate_request,
2863 .elevator_completed_req_fn = bfq_completed_request,
2864 .elevator_former_req_fn = elv_rb_former_request,
2865 .elevator_latter_req_fn = elv_rb_latter_request,
2866 .elevator_init_icq_fn = bfq_init_icq,
2867 .elevator_exit_icq_fn = bfq_exit_icq,
2868 .elevator_set_req_fn = bfq_set_request,
2869 .elevator_put_req_fn = bfq_put_request,
2870 .elevator_may_queue_fn = bfq_may_queue,
2871 .elevator_init_fn = bfq_init_queue,
2872 .elevator_exit_fn = bfq_exit_queue,
2874 .icq_size = sizeof(struct bfq_io_cq),
2875 .icq_align = __alignof__(struct bfq_io_cq),
2876 .elevator_attrs = bfq_attrs,
2877 .elevator_name = "bfq",
2878 .elevator_owner = THIS_MODULE,
2881 static int __init bfq_init(void)
2883 /*
2884 * Can be 0 on HZ < 1000 setups.
2885 */
2886 if (bfq_slice_idle == 0)
2887 bfq_slice_idle = 1;
2889 if (bfq_timeout_async == 0)
2890 bfq_timeout_async = 1;
2892 if (bfq_slab_setup())
2893 return -ENOMEM;
2895 elv_register(&iosched_bfq);
2897 return 0;
2900 static void __exit bfq_exit(void)
2902 elv_unregister(&iosched_bfq);
2903 bfq_slab_kill();
2906 module_init(bfq_init);
2907 module_exit(bfq_exit);
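/*
 * Typical usage once the module is available: select the scheduler per
 * device with something like
 *   echo bfq > /sys/block/<disk>/queue/scheduler
 * or system-wide at boot via the elevator=bfq kernel parameter, then
 * tune it through the iosched attributes defined above.
 */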
2909 MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
2910 MODULE_LICENSE("GPL");
2911 MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");