block/blk-mq.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Block multiqueue core code
   4  *
   5  * Copyright (C) 2013-2014 Jens Axboe
   6  * Copyright (C) 2013-2014 Christoph Hellwig
   7  */
   8 #include <linux/kernel.h>
   9 #include <linux/module.h>
  10 #include <linux/backing-dev.h>
  11 #include <linux/bio.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/blk-integrity.h>
  14 #include <linux/kmemleak.h>
  15 #include <linux/mm.h>
  16 #include <linux/init.h>
  17 #include <linux/slab.h>
  18 #include <linux/workqueue.h>
  19 #include <linux/smp.h>
  20 #include <linux/interrupt.h>
  21 #include <linux/llist.h>
  22 #include <linux/cpu.h>
  23 #include <linux/cache.h>
  24 #include <linux/sched/topology.h>
  25 #include <linux/sched/signal.h>
  26 #include <linux/delay.h>
  27 #include <linux/crash_dump.h>
  28 #include <linux/prefetch.h>
  29 #include <linux/blk-crypto.h>
  30 #include <linux/part_stat.h>
  31 #include <linux/sched/isolation.h>
  32
  33 #include <trace/events/block.h>
  34
  35 #include <linux/t10-pi.h>
  36 #include "blk.h"
  37 #include "blk-mq.h"
  38 #include "blk-mq-debugfs.h"
  39 #include "blk-pm.h"
  40 #include "blk-stat.h"
  41 #include "blk-mq-sched.h"
  42 #include "blk-rq-qos.h"
  43
  44 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
  45 static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
  46
  47 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
  48 static void blk_mq_request_bypass_insert(struct request *rq,
  49                 blk_insert_t flags);
  50 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
  51                 struct list_head *list);
  52 static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
  53                          struct io_comp_batch *iob, unsigned int flags);
  54
  55 /*
  56  * Check if any of the ctx, dispatch list or elevator
  57  * have pending work in this hardware queue.
  58  */
  59 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  60 {
  61         return !list_empty_careful(&hctx->dispatch) ||
  62                 sbitmap_any_bit_set(&hctx->ctx_map) ||
  63                         blk_mq_sched_has_work(hctx);
  64 }
  65
  66 /*
  67  * Mark this ctx as having pending work in this hardware queue
  68  */
  69 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  70                                      struct blk_mq_ctx *ctx)
  71 {
  72         const int bit = ctx->index_hw[hctx->type];
  73
  74         if (!sbitmap_test_bit(&hctx->ctx_map, bit))
  75                 sbitmap_set_bit(&hctx->ctx_map, bit);
  76 }
  77
  78 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  79                                       struct blk_mq_ctx *ctx)
  80 {
  81         const int bit = ctx->index_hw[hctx->type];
  82
  83         sbitmap_clear_bit(&hctx->ctx_map, bit);
  84 }
  85
/*
 * Accumulator passed to blk_mq_check_inflight() through the private
 * pointer of the busy-tag iterator.
 */
struct mq_inflight {
	/* device whose requests are being counted */
	struct block_device *part;
	/* matching in-flight requests, indexed by rq_data_dir() */
	unsigned int inflight[2];
};
  90
  91 static bool blk_mq_check_inflight(struct request *rq, void *priv)
  92 {
  93         struct mq_inflight *mi = priv;
  94
  95         if (rq->rq_flags & RQF_IO_STAT &&
  96             (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
  97             blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
  98                 mi->inflight[rq_data_dir(rq)]++;
  99
 100         return true;
 101 }
 102
 103 unsigned int blk_mq_in_flight(struct request_queue *q,
 104                 struct block_device *part)
 105 {
 106         struct mq_inflight mi = { .part = part };
 107
 108         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 109
 110         return mi.inflight[0] + mi.inflight[1];
 111 }
 112
 113 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 114                 unsigned int inflight[2])
 115 {
 116         struct mq_inflight mi = { .part = part };
 117
 118         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 119         inflight[0] = mi.inflight[0];
 120         inflight[1] = mi.inflight[1];
 121 }
 122
 123 #ifdef CONFIG_LOCKDEP
 124 static bool blk_freeze_set_owner(struct request_queue *q,
 125                                  struct task_struct *owner)
 126 {
 127         if (!owner)
 128                 return false;
 129
 130         if (!q->mq_freeze_depth) {
 131                 q->mq_freeze_owner = owner;
 132                 q->mq_freeze_owner_depth = 1;
 133                 return true;
 134         }
 135
 136         if (owner == q->mq_freeze_owner)
 137                 q->mq_freeze_owner_depth += 1;
 138         return false;
 139 }
 140
 141 /* verify the last unfreeze in owner context */
 142 static bool blk_unfreeze_check_owner(struct request_queue *q)
 143 {
 144         if (!q->mq_freeze_owner)
 145                 return false;
 146         if (q->mq_freeze_owner != current)
 147                 return false;
 148         if (--q->mq_freeze_owner_depth == 0) {
 149                 q->mq_freeze_owner = NULL;
 150                 return true;
 151         }
 152         return false;
 153 }
 154
 155 #else
 156
/* !CONFIG_LOCKDEP stub: no freeze owner is tracked, always returns false. */
static bool blk_freeze_set_owner(struct request_queue *q,
				 struct task_struct *owner)
{
	return false;
}
 162
/* !CONFIG_LOCKDEP stub: there is no owner to check, always returns false. */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
	return false;
}
 167 #endif
 168
 169 bool __blk_freeze_queue_start(struct request_queue *q,
 170                               struct task_struct *owner)
 171 {
 172         bool freeze;
 173
 174         mutex_lock(&q->mq_freeze_lock);
 175         freeze = blk_freeze_set_owner(q, owner);
 176         if (++q->mq_freeze_depth == 1) {
 177                 percpu_ref_kill(&q->q_usage_counter);
 178                 mutex_unlock(&q->mq_freeze_lock);
 179                 if (queue_is_mq(q))
 180                         blk_mq_run_hw_queues(q, false);
 181         } else {
 182                 mutex_unlock(&q->mq_freeze_lock);
 183         }
 184
 185         return freeze;
 186 }
 187
 188 void blk_freeze_queue_start(struct request_queue *q)
 189 {
 190         if (__blk_freeze_queue_start(q, current))
 191                 blk_freeze_acquire_lock(q, false, false);
 192 }
 193 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 194
 195 void blk_mq_freeze_queue_wait(struct request_queue *q)
 196 {
 197         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 198 }
 199 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 200
 201 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 202                                      unsigned long timeout)
 203 {
 204         return wait_event_timeout(q->mq_freeze_wq,
 205                                         percpu_ref_is_zero(&q->q_usage_counter),
 206                                         timeout);
 207 }
 208 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 209
/*
 * Start freezing @q, then wait until its q_usage_counter has dropped
 * to zero.
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}
 215 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 216
 217 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
 218 {
 219         bool unfreeze;
 220
 221         mutex_lock(&q->mq_freeze_lock);
 222         if (force_atomic)
 223                 q->q_usage_counter.data->force_atomic = true;
 224         q->mq_freeze_depth--;
 225         WARN_ON_ONCE(q->mq_freeze_depth < 0);
 226         if (!q->mq_freeze_depth) {
 227                 percpu_ref_resurrect(&q->q_usage_counter);
 228                 wake_up_all(&q->mq_freeze_wq);
 229         }
 230         unfreeze = blk_unfreeze_check_owner(q);
 231         mutex_unlock(&q->mq_freeze_lock);
 232
 233         return unfreeze;
 234 }
 235
 236 void blk_mq_unfreeze_queue(struct request_queue *q)
 237 {
 238         if (__blk_mq_unfreeze_queue(q, false))
 239                 blk_unfreeze_release_lock(q, false, false);
 240 }
 241 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 242
/*
 * non_owner variant of blk_freeze_queue_start
 *
 * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
 * by the same task.  This is fragile and should not be used if at all
 * possible.
 *
 * A NULL owner is passed to __blk_freeze_queue_start() and its return
 * value is ignored.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
	__blk_freeze_queue_start(q, NULL);
}
 254 EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
 255
/*
 * non_owner variant of blk_mq_unfreeze_queue: drops one freeze level and
 * ignores the return value of __blk_mq_unfreeze_queue().
 */
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
 261 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
 262
 263 /*
 264  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 265  * mpt3sas driver such that this function can be removed.
 266  */
 267 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 268 {
 269         unsigned long flags;
 270
 271         spin_lock_irqsave(&q->queue_lock, flags);
 272         if (!q->quiesce_depth++)
 273                 blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
 274         spin_unlock_irqrestore(&q->queue_lock, flags);
 275 }
 276 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 277
 278 /**
 279  * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 280  * @set: tag_set to wait on
 281  *
 282  * Note: it is driver's responsibility for making sure that quiesce has
 283  * been started on or more of the request_queues of the tag_set.  This
 284  * function only waits for the quiesce on those request_queues that had
 285  * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 286  */
 287 void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
 288 {
 289         if (set->flags & BLK_MQ_F_BLOCKING)
 290                 synchronize_srcu(set->srcu);
 291         else
 292                 synchronize_rcu();
 293 }
 294 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
 295
 296 /**
 297  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 298  * @q: request queue.
 299  *
 300  * Note: this function does not prevent that the struct request end_io()
 301  * callback function is invoked. Once this function is returned, we make
 302  * sure no dispatch can happen until the queue is unquiesced via
 303  * blk_mq_unquiesce_queue().
 304  */
 305 void blk_mq_quiesce_queue(struct request_queue *q)
 306 {
 307         blk_mq_quiesce_queue_nowait(q);
 308         /* nothing to wait for non-mq queues */
 309         if (queue_is_mq(q))
 310                 blk_mq_wait_quiesce_done(q->tag_set);
 311 }
 312 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 313
 314 /*
 315  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 316  * @q: request queue.
 317  *
 318  * This function recovers queue into the state before quiescing
 319  * which is done by blk_mq_quiesce_queue.
 320  */
 321 void blk_mq_unquiesce_queue(struct request_queue *q)
 322 {
 323         unsigned long flags;
 324         bool run_queue = false;
 325
 326         spin_lock_irqsave(&q->queue_lock, flags);
 327         if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
 328                 ;
 329         } else if (!--q->quiesce_depth) {
 330                 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 331                 run_queue = true;
 332         }
 333         spin_unlock_irqrestore(&q->queue_lock, flags);
 334
 335         /* dispatch requests which are inserted during quiescing */
 336         if (run_queue)
 337                 blk_mq_run_hw_queues(q, true);
 338 }
 339 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 340
 341 void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
 342 {
 343         struct request_queue *q;
 344
 345         mutex_lock(&set->tag_list_lock);
 346         list_for_each_entry(q, &set->tag_list, tag_set_list) {
 347                 if (!blk_queue_skip_tagset_quiesce(q))
 348                         blk_mq_quiesce_queue_nowait(q);
 349         }
 350         mutex_unlock(&set->tag_list_lock);
 351
 352         blk_mq_wait_quiesce_done(set);
 353 }
 354 EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
 355
 356 void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
 357 {
 358         struct request_queue *q;
 359
 360         mutex_lock(&set->tag_list_lock);
 361         list_for_each_entry(q, &set->tag_list, tag_set_list) {
 362                 if (!blk_queue_skip_tagset_quiesce(q))
 363                         blk_mq_unquiesce_queue(q);
 364         }
 365         mutex_unlock(&set->tag_list_lock);
 366 }
 367 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
 368
 369 void blk_mq_wake_waiters(struct request_queue *q)
 370 {
 371         struct blk_mq_hw_ctx *hctx;
 372         unsigned long i;
 373
 374         queue_for_each_hw_ctx(q, hctx, i)
 375                 if (blk_mq_hw_queue_mapped(hctx))
 376                         blk_mq_tag_wakeup_all(hctx->tags, true);
 377 }
 378
 379 void blk_rq_init(struct request_queue *q, struct request *rq)
 380 {
 381         memset(rq, 0, sizeof(*rq));
 382
 383         INIT_LIST_HEAD(&rq->queuelist);
 384         rq->q = q;
 385         rq->__sector = (sector_t) -1;
 386         INIT_HLIST_NODE(&rq->hash);
 387         RB_CLEAR_NODE(&rq->rb_node);
 388         rq->tag = BLK_MQ_NO_TAG;
 389         rq->internal_tag = BLK_MQ_NO_TAG;
 390         rq->start_time_ns = blk_time_get_ns();
 391         blk_crypto_rq_set_defaults(rq);
 392 }
 393 EXPORT_SYMBOL(blk_rq_init);
 394
 395 /* Set start and alloc time when the allocated request is actually used */
 396 static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
 397 {
 398 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 399         if (blk_queue_rq_alloc_time(rq->q))
 400                 rq->alloc_time_ns = alloc_time_ns;
 401         else
 402                 rq->alloc_time_ns = 0;
 403 #endif
 404 }
 405
 406 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 407                 struct blk_mq_tags *tags, unsigned int tag)
 408 {
 409         struct blk_mq_ctx *ctx = data->ctx;
 410         struct blk_mq_hw_ctx *hctx = data->hctx;
 411         struct request_queue *q = data->q;
 412         struct request *rq = tags->static_rqs[tag];
 413
 414         rq->q = q;
 415         rq->mq_ctx = ctx;
 416         rq->mq_hctx = hctx;
 417         rq->cmd_flags = data->cmd_flags;
 418
 419         if (data->flags & BLK_MQ_REQ_PM)
 420                 data->rq_flags |= RQF_PM;
 421         rq->rq_flags = data->rq_flags;
 422
 423         if (data->rq_flags & RQF_SCHED_TAGS) {
 424                 rq->tag = BLK_MQ_NO_TAG;
 425                 rq->internal_tag = tag;
 426         } else {
 427                 rq->tag = tag;
 428                 rq->internal_tag = BLK_MQ_NO_TAG;
 429         }
 430         rq->timeout = 0;
 431
 432         rq->part = NULL;
 433         rq->io_start_time_ns = 0;
 434         rq->stats_sectors = 0;
 435         rq->nr_phys_segments = 0;
 436         rq->nr_integrity_segments = 0;
 437         rq->end_io = NULL;
 438         rq->end_io_data = NULL;
 439
 440         blk_crypto_rq_set_defaults(rq);
 441         INIT_LIST_HEAD(&rq->queuelist);
 442         /* tag was already set */
 443         WRITE_ONCE(rq->deadline, 0);
 444         req_ref_set(rq, 1);
 445
 446         if (rq->rq_flags & RQF_USE_SCHED) {
 447                 struct elevator_queue *e = data->q->elevator;
 448
 449                 INIT_HLIST_NODE(&rq->hash);
 450                 RB_CLEAR_NODE(&rq->rb_node);
 451
 452                 if (e->type->ops.prepare_request)
 453                         e->type->ops.prepare_request(rq);
 454         }
 455
 456         return rq;
 457 }
 458
 459 static inline struct request *
 460 __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 461 {
 462         unsigned int tag, tag_offset;
 463         struct blk_mq_tags *tags;
 464         struct request *rq;
 465         unsigned long tag_mask;
 466         int i, nr = 0;
 467
 468         tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
 469         if (unlikely(!tag_mask))
 470                 return NULL;
 471
 472         tags = blk_mq_tags_from_data(data);
 473         for (i = 0; tag_mask; i++) {
 474                 if (!(tag_mask & (1UL << i)))
 475                         continue;
 476                 tag = tag_offset + i;
 477                 prefetch(tags->static_rqs[tag]);
 478                 tag_mask &= ~(1UL << i);
 479                 rq = blk_mq_rq_ctx_init(data, tags, tag);
 480                 rq_list_add_head(data->cached_rqs, rq);
 481                 nr++;
 482         }
 483         if (!(data->rq_flags & RQF_SCHED_TAGS))
 484                 blk_mq_add_active_requests(data->hctx, nr);
 485         /* caller already holds a reference, add for remainder */
 486         percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
 487         data->nr_tags -= nr;
 488
 489         return rq_list_pop(data->cached_rqs);
 490 }
 491
 492 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 493 {
 494         struct request_queue *q = data->q;
 495         u64 alloc_time_ns = 0;
 496         struct request *rq;
 497         unsigned int tag;
 498
 499         /* alloc_time includes depth and tag waits */
 500         if (blk_queue_rq_alloc_time(q))
 501                 alloc_time_ns = blk_time_get_ns();
 502
 503         if (data->cmd_flags & REQ_NOWAIT)
 504                 data->flags |= BLK_MQ_REQ_NOWAIT;
 505
 506 retry:
 507         data->ctx = blk_mq_get_ctx(q);
 508         data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
 509
 510         if (q->elevator) {
 511                 /*
 512                  * All requests use scheduler tags when an I/O scheduler is
 513                  * enabled for the queue.
 514                  */
 515                 data->rq_flags |= RQF_SCHED_TAGS;
 516
 517                 /*
 518                  * Flush/passthrough requests are special and go directly to the
 519                  * dispatch list.
 520                  */
 521                 if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
 522                     !blk_op_is_passthrough(data->cmd_flags)) {
 523                         struct elevator_mq_ops *ops = &q->elevator->type->ops;
 524
 525                         WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
 526
 527                         data->rq_flags |= RQF_USE_SCHED;
 528                         if (ops->limit_depth)
 529                                 ops->limit_depth(data->cmd_flags, data);
 530                 }
 531         } else {
 532                 blk_mq_tag_busy(data->hctx);
 533         }
 534
 535         if (data->flags & BLK_MQ_REQ_RESERVED)
 536                 data->rq_flags |= RQF_RESV;
 537
 538         /*
 539          * Try batched alloc if we want more than 1 tag.
 540          */
 541         if (data->nr_tags > 1) {
 542                 rq = __blk_mq_alloc_requests_batch(data);
 543                 if (rq) {
 544                         blk_mq_rq_time_init(rq, alloc_time_ns);
 545                         return rq;
 546                 }
 547                 data->nr_tags = 1;
 548         }
 549
 550         /*
 551          * Waiting allocations only fail because of an inactive hctx.  In that
 552          * case just retry the hctx assignment and tag allocation as CPU hotplug
 553          * should have migrated us to an online CPU by now.
 554          */
 555         tag = blk_mq_get_tag(data);
 556         if (tag == BLK_MQ_NO_TAG) {
 557                 if (data->flags & BLK_MQ_REQ_NOWAIT)
 558                         return NULL;
 559                 /*
 560                  * Give up the CPU and sleep for a random short time to
 561                  * ensure that thread using a realtime scheduling class
 562                  * are migrated off the CPU, and thus off the hctx that
 563                  * is going away.
 564                  */
 565                 msleep(3);
 566                 goto retry;
 567         }
 568
 569         if (!(data->rq_flags & RQF_SCHED_TAGS))
 570                 blk_mq_inc_active_requests(data->hctx);
 571         rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
 572         blk_mq_rq_time_init(rq, alloc_time_ns);
 573         return rq;
 574 }
 575
 576 static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
 577                                             struct blk_plug *plug,
 578                                             blk_opf_t opf,
 579                                             blk_mq_req_flags_t flags)
 580 {
 581         struct blk_mq_alloc_data data = {
 582                 .q              = q,
 583                 .flags          = flags,
 584                 .cmd_flags      = opf,
 585                 .nr_tags        = plug->nr_ios,
 586                 .cached_rqs     = &plug->cached_rqs,
 587         };
 588         struct request *rq;
 589
 590         if (blk_queue_enter(q, flags))
 591                 return NULL;
 592
 593         plug->nr_ios = 1;
 594
 595         rq = __blk_mq_alloc_requests(&data);
 596         if (unlikely(!rq))
 597                 blk_queue_exit(q);
 598         return rq;
 599 }
 600
 601 static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 602                                                    blk_opf_t opf,
 603                                                    blk_mq_req_flags_t flags)
 604 {
 605         struct blk_plug *plug = current->plug;
 606         struct request *rq;
 607
 608         if (!plug)
 609                 return NULL;
 610
 611         if (rq_list_empty(&plug->cached_rqs)) {
 612                 if (plug->nr_ios == 1)
 613                         return NULL;
 614                 rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
 615                 if (!rq)
 616                         return NULL;
 617         } else {
 618                 rq = rq_list_peek(&plug->cached_rqs);
 619                 if (!rq || rq->q != q)
 620                         return NULL;
 621
 622                 if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
 623                         return NULL;
 624                 if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
 625                         return NULL;
 626
 627                 rq_list_pop(&plug->cached_rqs);
 628                 blk_mq_rq_time_init(rq, blk_time_get_ns());
 629         }
 630
 631         rq->cmd_flags = opf;
 632         INIT_LIST_HEAD(&rq->queuelist);
 633         return rq;
 634 }
 635
 636 struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 637                 blk_mq_req_flags_t flags)
 638 {
 639         struct request *rq;
 640
 641         rq = blk_mq_alloc_cached_request(q, opf, flags);
 642         if (!rq) {
 643                 struct blk_mq_alloc_data data = {
 644                         .q              = q,
 645                         .flags          = flags,
 646                         .cmd_flags      = opf,
 647                         .nr_tags        = 1,
 648                 };
 649                 int ret;
 650
 651                 ret = blk_queue_enter(q, flags);
 652                 if (ret)
 653                         return ERR_PTR(ret);
 654
 655                 rq = __blk_mq_alloc_requests(&data);
 656                 if (!rq)
 657                         goto out_queue_exit;
 658         }
 659         rq->__data_len = 0;
 660         rq->__sector = (sector_t) -1;
 661         rq->bio = rq->biotail = NULL;
 662         return rq;
 663 out_queue_exit:
 664         blk_queue_exit(q);
 665         return ERR_PTR(-EWOULDBLOCK);
 666 }
 667 EXPORT_SYMBOL(blk_mq_alloc_request);
 668
 669 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 670         blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 671 {
 672         struct blk_mq_alloc_data data = {
 673                 .q              = q,
 674                 .flags          = flags,
 675                 .cmd_flags      = opf,
 676                 .nr_tags        = 1,
 677         };
 678         u64 alloc_time_ns = 0;
 679         struct request *rq;
 680         unsigned int cpu;
 681         unsigned int tag;
 682         int ret;
 683
 684         /* alloc_time includes depth and tag waits */
 685         if (blk_queue_rq_alloc_time(q))
 686                 alloc_time_ns = blk_time_get_ns();
 687
 688         /*
 689          * If the tag allocator sleeps we could get an allocation for a
 690          * different hardware context.  No need to complicate the low level
 691          * allocator for this for the rare use case of a command tied to
 692          * a specific queue.
 693          */
 694         if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
 695             WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
 696                 return ERR_PTR(-EINVAL);
 697
 698         if (hctx_idx >= q->nr_hw_queues)
 699                 return ERR_PTR(-EIO);
 700
 701         ret = blk_queue_enter(q, flags);
 702         if (ret)
 703                 return ERR_PTR(ret);
 704
 705         /*
 706          * Check if the hardware context is actually mapped to anything.
 707          * If not tell the caller that it should skip this queue.
 708          */
 709         ret = -EXDEV;
 710         data.hctx = xa_load(&q->hctx_table, hctx_idx);
 711         if (!blk_mq_hw_queue_mapped(data.hctx))
 712                 goto out_queue_exit;
 713         cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
 714         if (cpu >= nr_cpu_ids)
 715                 goto out_queue_exit;
 716         data.ctx = __blk_mq_get_ctx(q, cpu);
 717
 718         if (q->elevator)
 719                 data.rq_flags |= RQF_SCHED_TAGS;
 720         else
 721                 blk_mq_tag_busy(data.hctx);
 722
 723         if (flags & BLK_MQ_REQ_RESERVED)
 724                 data.rq_flags |= RQF_RESV;
 725
 726         ret = -EWOULDBLOCK;
 727         tag = blk_mq_get_tag(&data);
 728         if (tag == BLK_MQ_NO_TAG)
 729                 goto out_queue_exit;
 730         if (!(data.rq_flags & RQF_SCHED_TAGS))
 731                 blk_mq_inc_active_requests(data.hctx);
 732         rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 733         blk_mq_rq_time_init(rq, alloc_time_ns);
 734         rq->__data_len = 0;
 735         rq->__sector = (sector_t) -1;
 736         rq->bio = rq->biotail = NULL;
 737         return rq;
 738
 739 out_queue_exit:
 740         blk_queue_exit(q);
 741         return ERR_PTR(ret);
 742 }
 743 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 744
 745 static void blk_mq_finish_request(struct request *rq)
 746 {
 747         struct request_queue *q = rq->q;
 748
 749         blk_zone_finish_request(rq);
 750
 751         if (rq->rq_flags & RQF_USE_SCHED) {
 752                 q->elevator->type->ops.finish_request(rq);
 753                 /*
 754                  * For postflush request that may need to be
 755                  * completed twice, we should clear this flag
 756                  * to avoid double finish_request() on the rq.
 757                  */
 758                 rq->rq_flags &= ~RQF_USE_SCHED;
 759         }
 760 }
 761
 762 static void __blk_mq_free_request(struct request *rq)
 763 {
 764         struct request_queue *q = rq->q;
 765         struct blk_mq_ctx *ctx = rq->mq_ctx;
 766         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 767         const int sched_tag = rq->internal_tag;
 768
 769         blk_crypto_free_request(rq);
 770         blk_pm_mark_last_busy(rq);
 771         rq->mq_hctx = NULL;
 772
 773         if (rq->tag != BLK_MQ_NO_TAG) {
 774                 blk_mq_dec_active_requests(hctx);
 775                 blk_mq_put_tag(hctx->tags, ctx, rq->tag);
 776         }
 777         if (sched_tag != BLK_MQ_NO_TAG)
 778                 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 779         blk_mq_sched_restart(hctx);
 780         blk_queue_exit(q);
 781 }
 782
 783 void blk_mq_free_request(struct request *rq)
 784 {
 785         struct request_queue *q = rq->q;
 786
 787         blk_mq_finish_request(rq);
 788
 789         if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 790                 laptop_io_completion(q->disk->bdi);
 791
 792         rq_qos_done(q, rq);
 793
 794         WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 795         if (req_ref_put_and_test(rq))
 796                 __blk_mq_free_request(rq);
 797 }
 798 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 799
 800 void blk_mq_free_plug_rqs(struct blk_plug *plug)
 801 {
 802         struct request *rq;
 803
 804         while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
 805                 blk_mq_free_request(rq);
 806 }
 807
 808 void blk_dump_rq_flags(struct request *rq, char *msg)
 809 {
 810         printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
 811                 rq->q->disk ? rq->q->disk->disk_name : "?",
 812                 (__force unsigned long long) rq->cmd_flags);
 813
 814         printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
 815                (unsigned long long)blk_rq_pos(rq),
 816                blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 817         printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
 818                rq->bio, rq->biotail, blk_rq_bytes(rq));
 819 }
 820 EXPORT_SYMBOL(blk_dump_rq_flags);
 821
 822 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 823 {
 824         if (req->rq_flags & RQF_IO_STAT) {
 825                 const int sgrp = op_stat_group(req_op(req));
 826
 827                 part_stat_lock();
 828                 part_stat_add(req->part, sectors[sgrp], bytes >> 9);
 829                 part_stat_unlock();
 830         }
 831 }
 832
 833 static void blk_print_req_error(struct request *req, blk_status_t status)
 834 {
 835         printk_ratelimited(KERN_ERR
 836                 "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
 837                 "phys_seg %u prio class %u\n",
 838                 blk_status_to_str(status),
 839                 req->q->disk ? req->q->disk->disk_name : "?",
 840                 blk_rq_pos(req), (__force u32)req_op(req),
 841                 blk_op_str(req_op(req)),
 842                 (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
 843                 req->nr_phys_segments,
 844                 IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
 845 }
 846
 847 /*
 848  * Fully end IO on a request. Does not support partial completions, or
 849  * errors.
 850  */
 851 static void blk_complete_request(struct request *req)
 852 {
 853         const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
 854         int total_bytes = blk_rq_bytes(req);
 855         struct bio *bio = req->bio;
 856
 857         trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
 858
 859         if (!bio)
 860                 return;
 861
 862         if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
 863                 blk_integrity_complete(req, total_bytes);
 864
 865         /*
 866          * Upper layers may call blk_crypto_evict_key() anytime after the last
 867          * bio_endio().  Therefore, the keyslot must be released before that.
 868          */
 869         blk_crypto_rq_put_keyslot(req);
 870
 871         blk_account_io_completion(req, total_bytes);
 872
 873         do {
 874                 struct bio *next = bio->bi_next;
 875
 876                 /* Completion has already been traced */
 877                 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 878
 879                 blk_zone_update_request_bio(req, bio);
 880
 881                 if (!is_flush)
 882                         bio_endio(bio);
 883                 bio = next;
 884         } while (bio);
 885
 886         /*
 887          * Reset counters so that the request stacking driver
 888          * can find how many bytes remain in the request
 889          * later.
 890          */
 891         if (!req->end_io) {
 892                 req->bio = NULL;
 893                 req->__data_len = 0;
 894         }
 895 }
 896
/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *      The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *      except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
		unsigned int nr_bytes)
{
	bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
	bool quiet = req->rq_flags & RQF_QUIET;
	int total_bytes;

	trace_block_rq_complete(req, error, nr_bytes);

	/* No bio attached: there is no data left to account for. */
	if (!req->bio)
		return false;

	/* Integrity completion only runs for reads that finished without error. */
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
	    error == BLK_STS_OK)
		blk_integrity_complete(req, nr_bytes);

	/*
	 * Upper layers may call blk_crypto_evict_key() anytime after the last
	 * bio_endio().  Therefore, the keyslot must be released before that.
	 */
	if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
		__blk_crypto_rq_put_keyslot(req);

	/*
	 * Log and trace the error unless the request is passthrough, is
	 * marked RQF_QUIET, or the disk already has GD_DEAD set.
	 */
	if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
	    !test_bit(GD_DEAD, &req->q->disk->state)) {
		blk_print_req_error(req, error);
		trace_block_rq_error(req, error, nr_bytes);
	}

	blk_account_io_completion(req, nr_bytes);

	/* Walk the bio chain, consuming up to @nr_bytes from its front. */
	total_bytes = 0;
	while (req->bio) {
		struct bio *bio = req->bio;
		/* Never take more from this bio than it still holds. */
		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

		if (unlikely(error))
			bio->bi_status = error;

		if (bio_bytes == bio->bi_iter.bi_size) {
			/* Bio fully consumed: unlink it from the request. */
			req->bio = bio->bi_next;
		} else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
			/*
			 * Partial zone append completions cannot be supported
			 * as the BIO fragments may end up not being written
			 * sequentially.
			 */
			bio->bi_status = BLK_STS_IOERR;
		}

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
		if (unlikely(quiet))
			bio_set_flag(bio, BIO_QUIET);

		bio_advance(bio, bio_bytes);

		/* Don't actually finish bio if it's part of flush sequence */
		if (!bio->bi_iter.bi_size) {
			blk_zone_update_request_bio(req, bio);
			if (!is_flush)
				bio_endio(bio);
		}

		total_bytes += bio_bytes;
		nr_bytes -= bio_bytes;

		/* The requested byte count has been consumed. */
		if (!nr_bytes)
			break;
	}

	/*
	 * completely done
	 */
	if (!req->bio) {
		/*
		 * Reset counters so that the request stacking driver
		 * can find how many bytes remain in the request
		 * later.
		 */
		req->__data_len = 0;
		return false;
	}

	/* Bios remain: shrink the request by what was just completed. */
	req->__data_len -= total_bytes;

	/* update sector only for requests with clear definition of sector */
	if (!blk_rq_is_passthrough(req))
		req->__sector += total_bytes >> 9;

	/* mixed attributes always follow the first bio */
	if (req->rq_flags & RQF_MIXED_MERGE) {
		req->cmd_flags &= ~REQ_FAILFAST_MASK;
		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
	}

	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
		/*
		 * If total number of sectors is less than the first segment
		 * size, something has gone terribly wrong.
		 */
		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
			blk_dump_rq_flags(req, "request botched");
			req->__data_len = blk_rq_cur_bytes(req);
		}

		/* recalculate the number of segments */
		req->nr_phys_segments = blk_recalc_rq_segments(req);
	}

	return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
1032
1033 static inline void blk_account_io_done(struct request *req, u64 now)
1034 {
1035         trace_block_io_done(req);
1036
1037         /*
1038          * Account IO completion.  flush_rq isn't accounted as a
1039          * normal IO on queueing nor completion.  Accounting the
1040          * containing request is enough.
1041          */
1042         if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
1043                 const int sgrp = op_stat_group(req_op(req));
1044
1045                 part_stat_lock();
1046                 update_io_ticks(req->part, jiffies, true);
1047                 part_stat_inc(req->part, ios[sgrp]);
1048                 part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
1049                 part_stat_local_dec(req->part,
1050                                     in_flight[op_is_write(req_op(req))]);
1051                 part_stat_unlock();
1052         }
1053 }
1054
1055 static inline bool blk_rq_passthrough_stats(struct request *req)
1056 {
1057         struct bio *bio = req->bio;
1058
1059         if (!blk_queue_passthrough_stat(req->q))
1060                 return false;
1061
1062         /* Requests without a bio do not transfer data. */
1063         if (!bio)
1064                 return false;
1065
1066         /*
1067          * Stats are accumulated in the bdev, so must have one attached to a
1068          * bio to track stats. Most drivers do not set the bdev for passthrough
1069          * requests, but nvme is one that will set it.
1070          */
1071         if (!bio->bi_bdev)
1072                 return false;
1073
1074         /*
1075          * We don't know what a passthrough command does, but we know the
1076          * payload size and data direction. Ensuring the size is aligned to the
1077          * block size filters out most commands with payloads that don't
1078          * represent sector access.
1079          */
1080         if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
1081                 return false;
1082         return true;
1083 }
1084
1085 static inline void blk_account_io_start(struct request *req)
1086 {
1087         trace_block_io_start(req);
1088
1089         if (!blk_queue_io_stat(req->q))
1090                 return;
1091         if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
1092                 return;
1093
1094         req->rq_flags |= RQF_IO_STAT;
1095         req->start_time_ns = blk_time_get_ns();
1096
1097         /*
1098          * All non-passthrough requests are created from a bio with one
1099          * exception: when a flush command that is part of a flush sequence
1100          * generated by the state machine in blk-flush.c is cloned onto the
1101          * lower device by dm-multipath we can get here without a bio.
1102          */
1103         if (req->bio)
1104                 req->part = req->bio->bi_bdev;
1105         else
1106                 req->part = req->q->disk->part0;
1107
1108         part_stat_lock();
1109         update_io_ticks(req->part, jiffies, false);
1110         part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
1111         part_stat_unlock();
1112 }
1113
1114 static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
1115 {
1116         if (rq->rq_flags & RQF_STATS)
1117                 blk_stat_add(rq, now);
1118
1119         blk_mq_sched_completed_request(rq, now);
1120         blk_account_io_done(rq, now);
1121 }
1122
1123 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
1124 {
1125         if (blk_mq_need_time_stamp(rq))
1126                 __blk_mq_end_request_acct(rq, blk_time_get_ns());
1127
1128         blk_mq_finish_request(rq);
1129
1130         if (rq->end_io) {
1131                 rq_qos_done(rq->q, rq);
1132                 if (rq->end_io(rq, error) == RQ_END_IO_FREE)
1133                         blk_mq_free_request(rq);
1134         } else {
1135                 blk_mq_free_request(rq);
1136         }
1137 }
1138 EXPORT_SYMBOL(__blk_mq_end_request);
1139
1140 void blk_mq_end_request(struct request *rq, blk_status_t error)
1141 {
1142         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
1143                 BUG();
1144         __blk_mq_end_request(rq, error);
1145 }
1146 EXPORT_SYMBOL(blk_mq_end_request);
1147
1148 #define TAG_COMP_BATCH          32
1149
1150 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
1151                                           int *tag_array, int nr_tags)
1152 {
1153         struct request_queue *q = hctx->queue;
1154
1155         blk_mq_sub_active_requests(hctx, nr_tags);
1156
1157         blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
1158         percpu_ref_put_many(&q->q_usage_counter, nr_tags);
1159 }
1160
/*
 * Complete every request queued on @iob->req_list.
 *
 * Each request is completed in full with blk_complete_request().  When
 * @iob->need_ts is set, a single timestamp taken up front is used for the
 * accounting of all requests in the batch.  Tags of requests whose last
 * reference is dropped here are gathered in a local array and released
 * through blk_mq_flush_tag_batch(), at most TAG_COMP_BATCH at a time and
 * always for one hardware queue per flush.
 */
void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
	int tags[TAG_COMP_BATCH], nr_tags = 0;
	struct blk_mq_hw_ctx *cur_hctx = NULL;
	struct request *rq;
	u64 now = 0;

	if (iob->need_ts)
		now = blk_time_get_ns();

	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
		prefetch(rq->bio);
		prefetch(rq->rq_next);

		blk_complete_request(rq);
		if (iob->need_ts)
			__blk_mq_end_request_acct(rq, now);

		blk_mq_finish_request(rq);

		rq_qos_done(rq->q, rq);

		/*
		 * If end_io handler returns NONE, then it still has
		 * ownership of the request.
		 */
		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
			continue;

		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		/* Somebody else still holds a reference: leave the tag alone. */
		if (!req_ref_put_and_test(rq))
			continue;

		blk_crypto_free_request(rq);
		blk_pm_mark_last_busy(rq);

		/*
		 * Flush the collected tags when the array is full or when this
		 * request belongs to a different hctx than the current batch.
		 */
		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
			if (cur_hctx)
				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
			nr_tags = 0;
			cur_hctx = rq->mq_hctx;
		}
		tags[nr_tags++] = rq->tag;
	}

	/* Release whatever is left in the final, partially filled batch. */
	if (nr_tags)
		blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
1210
1211 static void blk_complete_reqs(struct llist_head *list)
1212 {
1213         struct llist_node *entry = llist_reverse_order(llist_del_all(list));
1214         struct request *rq, *next;
1215
1216         llist_for_each_entry_safe(rq, next, entry, ipi_list)
1217                 rq->q->mq_ops->complete(rq);
1218 }
1219
1220 static __latent_entropy void blk_done_softirq(void)
1221 {
1222         blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
1223 }
1224
1225 static int blk_softirq_cpu_dead(unsigned int cpu)
1226 {
1227         blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
1228         return 0;
1229 }
1230
/*
 * Raise BLOCK_SOFTIRQ on the CPU this function runs on.  @data is not
 * used.
 *
 * NOTE(review): presumably installed as the blk_cpu_csd callback so that
 * blk_mq_complete_send_ipi() triggers it on the target CPU; the
 * registration is not visible in this part of the file -- confirm.
 */
static void __blk_mq_complete_request_remote(void *data)
{
	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
}
1235
1236 static inline bool blk_mq_complete_need_ipi(struct request *rq)
1237 {
1238         int cpu = raw_smp_processor_id();
1239
1240         if (!IS_ENABLED(CONFIG_SMP) ||
1241             !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
1242                 return false;
1243         /*
1244          * With force threaded interrupts enabled, raising softirq from an SMP
1245          * function call will always result in waking the ksoftirqd thread.
1246          * This is probably worse than completing the request on a different
1247          * cache domain.
1248          */
1249         if (force_irqthreads())
1250                 return false;
1251
1252         /* same CPU or cache domain and capacity?  Complete locally */
1253         if (cpu == rq->mq_ctx->cpu ||
1254             (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
1255              cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
1256              cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
1257                 return false;
1258
1259         /* don't try to IPI to an offline CPU */
1260         return cpu_online(rq->mq_ctx->cpu);
1261 }
1262
1263 static void blk_mq_complete_send_ipi(struct request *rq)
1264 {
1265         unsigned int cpu;
1266
1267         cpu = rq->mq_ctx->cpu;
1268         if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
1269                 smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
1270 }
1271
1272 static void blk_mq_raise_softirq(struct request *rq)
1273 {
1274         struct llist_head *list;
1275
1276         preempt_disable();
1277         list = this_cpu_ptr(&blk_cpu_done);
1278         if (llist_add(&rq->ipi_list, list))
1279                 raise_softirq(BLOCK_SOFTIRQ);
1280         preempt_enable();
1281 }
1282
1283 bool blk_mq_complete_request_remote(struct request *rq)
1284 {
1285         WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
1286
1287         /*
1288          * For request which hctx has only one ctx mapping,
1289          * or a polled request, always complete locally,
1290          * it's pointless to redirect the completion.
1291          */
1292         if ((rq->mq_hctx->nr_ctx == 1 &&
1293              rq->mq_ctx->cpu == raw_smp_processor_id()) ||
1294              rq->cmd_flags & REQ_POLLED)
1295                 return false;
1296
1297         if (blk_mq_complete_need_ipi(rq)) {
1298                 blk_mq_complete_send_ipi(rq);
1299                 return true;
1300         }
1301
1302         if (rq->q->nr_hw_queues == 1) {
1303                 blk_mq_raise_softirq(rq);
1304                 return true;
1305         }
1306         return false;
1307 }
1308 EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
1309
1310 /**
1311  * blk_mq_complete_request - end I/O on a request
1312  * @rq:         the request being processed
1313  *
1314  * Description:
1315  *      Complete a request by scheduling the ->complete_rq operation.
1316  **/
1317 void blk_mq_complete_request(struct request *rq)
1318 {
1319         if (!blk_mq_complete_request_remote(rq))
1320                 rq->q->mq_ops->complete(rq);
1321 }
1322 EXPORT_SYMBOL(blk_mq_complete_request);
1323
1324 /**
1325  * blk_mq_start_request - Start processing a request
1326  * @rq: Pointer to request to be started
1327  *
1328  * Function used by device drivers to notify the block layer that a request
1329  * is going to be processed now, so blk layer can do proper initializations
1330  * such as starting the timeout timer.
1331  */
1332 void blk_mq_start_request(struct request *rq)
1333 {
1334         struct request_queue *q = rq->q;
1335
1336         trace_block_rq_issue(rq);
1337
1338         if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
1339             !blk_rq_is_passthrough(rq)) {
1340                 rq->io_start_time_ns = blk_time_get_ns();
1341                 rq->stats_sectors = blk_rq_sectors(rq);
1342                 rq->rq_flags |= RQF_STATS;
1343                 rq_qos_issue(q, rq);
1344         }
1345
1346         WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
1347
1348         blk_add_timer(rq);
1349         WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
1350         rq->mq_hctx->tags->rqs[rq->tag] = rq;
1351
1352         if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
1353                 blk_integrity_prepare(rq);
1354
1355         if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
1356                 WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
1357 }
1358 EXPORT_SYMBOL(blk_mq_start_request);
1359
1360 /*
1361  * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
1362  * queues. This is important for md arrays to benefit from merging
1363  * requests.
1364  */
1365 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
1366 {
1367         if (plug->multiple_queues)
1368                 return BLK_MAX_REQUEST_COUNT * 2;
1369         return BLK_MAX_REQUEST_COUNT;
1370 }
1371
1372 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1373 {
1374         struct request *last = rq_list_peek(&plug->mq_list);
1375
1376         if (!plug->rq_count) {
1377                 trace_block_plug(rq->q);
1378         } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
1379                    (!blk_queue_nomerges(rq->q) &&
1380                     blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1381                 blk_mq_flush_plug_list(plug, false);
1382                 last = NULL;
1383                 trace_block_plug(rq->q);
1384         }
1385
1386         if (!plug->multiple_queues && last && last->q != rq->q)
1387                 plug->multiple_queues = true;
1388         /*
1389          * Any request allocated from sched tags can't be issued to
1390          * ->queue_rqs() directly
1391          */
1392         if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
1393                 plug->has_elevator = true;
1394         rq_list_add_tail(&plug->mq_list, rq);
1395         plug->rq_count++;
1396 }
1397
1398 /**
1399  * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
1400  * @rq:         request to insert
1401  * @at_head:    insert request at head or tail of queue
1402  *
1403  * Description:
1404  *    Insert a fully prepared request at the back of the I/O scheduler queue
1405  *    for execution.  Don't wait for completion.
1406  *
1407  * Note:
1408  *    This function will invoke @done directly if the queue is dead.
1409  */
1410 void blk_execute_rq_nowait(struct request *rq, bool at_head)
1411 {
1412         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1413
1414         WARN_ON(irqs_disabled());
1415         WARN_ON(!blk_rq_is_passthrough(rq));
1416
1417         blk_account_io_start(rq);
1418
1419         if (current->plug && !at_head) {
1420                 blk_add_rq_to_plug(current->plug, rq);
1421                 return;
1422         }
1423
1424         blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1425         blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
1426 }
1427 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
1428
/*
 * On-stack state shared between blk_execute_rq() and its end_io handler
 * blk_end_sync_rq().
 */
struct blk_rq_wait {
	/* Signalled by blk_end_sync_rq() when the request has ended. */
	struct completion done;
	/* Status passed to blk_end_sync_rq(); returned by blk_execute_rq(). */
	blk_status_t ret;
};
1433
1434 static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
1435 {
1436         struct blk_rq_wait *wait = rq->end_io_data;
1437
1438         wait->ret = ret;
1439         complete(&wait->done);
1440         return RQ_END_IO_NONE;
1441 }
1442
1443 bool blk_rq_is_poll(struct request *rq)
1444 {
1445         if (!rq->mq_hctx)
1446                 return false;
1447         if (rq->mq_hctx->type != HCTX_TYPE_POLL)
1448                 return false;
1449         return true;
1450 }
1451 EXPORT_SYMBOL_GPL(blk_rq_is_poll);
1452
1453 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
1454 {
1455         do {
1456                 blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
1457                 cond_resched();
1458         } while (!completion_done(wait));
1459 }
1460
1461 /**
1462  * blk_execute_rq - insert a request into queue for execution
1463  * @rq:         request to insert
1464  * @at_head:    insert request at head or tail of queue
1465  *
1466  * Description:
1467  *    Insert a fully prepared request at the back of the I/O scheduler queue
1468  *    for execution and wait for completion.
1469  * Return: The blk_status_t result provided to blk_mq_end_request().
1470  */
1471 blk_status_t blk_execute_rq(struct request *rq, bool at_head)
1472 {
1473         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1474         struct blk_rq_wait wait = {
1475                 .done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
1476         };
1477
1478         WARN_ON(irqs_disabled());
1479         WARN_ON(!blk_rq_is_passthrough(rq));
1480
1481         rq->end_io_data = &wait;
1482         rq->end_io = blk_end_sync_rq;
1483
1484         blk_account_io_start(rq);
1485         blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
1486         blk_mq_run_hw_queue(hctx, false);
1487
1488         if (blk_rq_is_poll(rq))
1489                 blk_rq_poll_completion(rq, &wait.done);
1490         else
1491                 blk_wait_io(&wait.done);
1492
1493         return wait.ret;
1494 }
1495 EXPORT_SYMBOL(blk_execute_rq);
1496
1497 static void __blk_mq_requeue_request(struct request *rq)
1498 {
1499         struct request_queue *q = rq->q;
1500
1501         blk_mq_put_driver_tag(rq);
1502
1503         trace_block_rq_requeue(rq);
1504         rq_qos_requeue(q, rq);
1505
1506         if (blk_mq_request_started(rq)) {
1507                 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
1508                 rq->rq_flags &= ~RQF_TIMED_OUT;
1509         }
1510 }
1511
1512 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
1513 {
1514         struct request_queue *q = rq->q;
1515         unsigned long flags;
1516
1517         __blk_mq_requeue_request(rq);
1518
1519         /* this request will be re-inserted to io scheduler queue */
1520         blk_mq_sched_requeue_request(rq);
1521
1522         spin_lock_irqsave(&q->requeue_lock, flags);
1523         list_add_tail(&rq->queuelist, &q->requeue_list);
1524         spin_unlock_irqrestore(&q->requeue_lock, flags);
1525
1526         if (kick_requeue_list)
1527                 blk_mq_kick_requeue_list(q);
1528 }
1529 EXPORT_SYMBOL(blk_mq_requeue_request);
1530
1531 static void blk_mq_requeue_work(struct work_struct *work)
1532 {
1533         struct request_queue *q =
1534                 container_of(work, struct request_queue, requeue_work.work);
1535         LIST_HEAD(rq_list);
1536         LIST_HEAD(flush_list);
1537         struct request *rq;
1538
1539         spin_lock_irq(&q->requeue_lock);
1540         list_splice_init(&q->requeue_list, &rq_list);
1541         list_splice_init(&q->flush_list, &flush_list);
1542         spin_unlock_irq(&q->requeue_lock);
1543
1544         while (!list_empty(&rq_list)) {
1545                 rq = list_entry(rq_list.next, struct request, queuelist);
1546                 /*
1547                  * If RQF_DONTPREP ist set, the request has been started by the
1548                  * driver already and might have driver-specific data allocated
1549                  * already.  Insert it into the hctx dispatch list to avoid
1550                  * block layer merges for the request.
1551                  */
1552                 if (rq->rq_flags & RQF_DONTPREP) {
1553                         list_del_init(&rq->queuelist);
1554                         blk_mq_request_bypass_insert(rq, 0);
1555                 } else {
1556                         list_del_init(&rq->queuelist);
1557                         blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
1558                 }
1559         }
1560
1561         while (!list_empty(&flush_list)) {
1562                 rq = list_entry(flush_list.next, struct request, queuelist);
1563                 list_del_init(&rq->queuelist);
1564                 blk_mq_insert_request(rq, 0);
1565         }
1566
1567         blk_mq_run_hw_queues(q, false);
1568 }
1569
/*
 * Schedule @q's requeue work with a delay of zero, without binding it to
 * a particular CPU (WORK_CPU_UNBOUND).
 */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
1575
1576 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
1577                                     unsigned long msecs)
1578 {
1579         kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
1580                                     msecs_to_jiffies(msecs));
1581 }
1582 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
1583
1584 static bool blk_is_flush_data_rq(struct request *rq)
1585 {
1586         return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
1587 }
1588
1589 static bool blk_mq_rq_inflight(struct request *rq, void *priv)
1590 {
1591         /*
1592          * If we find a request that isn't idle we know the queue is busy
1593          * as it's checked in the iter.
1594          * Return false to stop the iteration.
1595          *
1596          * In case of queue quiesce, if one flush data request is completed,
1597          * don't count it as inflight given the flush sequence is suspended,
1598          * and the original flush data request is invisible to driver, just
1599          * like other pending requests because of quiesce
1600          */
1601         if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
1602                                 blk_is_flush_data_rq(rq) &&
1603                                 blk_mq_request_completed(rq))) {
1604                 bool *busy = priv;
1605
1606                 *busy = true;
1607                 return false;
1608         }
1609
1610         return true;
1611 }
1612
1613 bool blk_mq_queue_inflight(struct request_queue *q)
1614 {
1615         bool busy = false;
1616
1617         blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
1618         return busy;
1619 }
1620 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
1621
1622 static void blk_mq_rq_timed_out(struct request *req)
1623 {
1624         req->rq_flags |= RQF_TIMED_OUT;
1625         if (req->q->mq_ops->timeout) {
1626                 enum blk_eh_timer_return ret;
1627
1628                 ret = req->q->mq_ops->timeout(req);
1629                 if (ret == BLK_EH_DONE)
1630                         return;
1631                 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
1632         }
1633
1634         blk_add_timer(req);
1635 }
1636
/* State carried through the tag iterations done by blk_mq_timeout_work(). */
struct blk_expired_data {
	/* Set by blk_mq_check_expired() once an expired request is seen. */
	bool has_timedout_rq;
	/*
	 * Earliest deadline recorded by blk_mq_req_expired() among in-flight
	 * requests that have not expired yet; 0 means none was recorded.
	 */
	unsigned long next;
	/* jiffies value sampled when blk_mq_timeout_work() started. */
	unsigned long timeout_start;
};
1642
1643 static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
1644 {
1645         unsigned long deadline;
1646
1647         if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
1648                 return false;
1649         if (rq->rq_flags & RQF_TIMED_OUT)
1650                 return false;
1651
1652         deadline = READ_ONCE(rq->deadline);
1653         if (time_after_eq(expired->timeout_start, deadline))
1654                 return true;
1655
1656         if (expired->next == 0)
1657                 expired->next = deadline;
1658         else if (time_after(expired->next, deadline))
1659                 expired->next = deadline;
1660         return false;
1661 }
1662
1663 void blk_mq_put_rq_ref(struct request *rq)
1664 {
1665         if (is_flush_rq(rq)) {
1666                 if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
1667                         blk_mq_free_request(rq);
1668         } else if (req_ref_put_and_test(rq)) {
1669                 __blk_mq_free_request(rq);
1670         }
1671 }
1672
1673 static bool blk_mq_check_expired(struct request *rq, void *priv)
1674 {
1675         struct blk_expired_data *expired = priv;
1676
1677         /*
1678          * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
1679          * be reallocated underneath the timeout handler's processing, then
1680          * the expire check is reliable. If the request is not expired, then
1681          * it was completed and reallocated as a new request after returning
1682          * from blk_mq_check_expired().
1683          */
1684         if (blk_mq_req_expired(rq, expired)) {
1685                 expired->has_timedout_rq = true;
1686                 return false;
1687         }
1688         return true;
1689 }
1690
1691 static bool blk_mq_handle_expired(struct request *rq, void *priv)
1692 {
1693         struct blk_expired_data *expired = priv;
1694
1695         if (blk_mq_req_expired(rq, expired))
1696                 blk_mq_rq_timed_out(rq);
1697         return true;
1698 }
1699
/*
 * Work handler for q->timeout_work: scan the queue for expired requests.
 *
 * A first tag iteration only detects whether anything has expired.  If
 * so, the handler waits for running submitters to finish and then walks
 * the tags a second time, calling the timeout handling for each expired
 * request.  Afterwards the queue timer is re-armed for the earliest
 * pending deadline, or every mapped hctx is marked idle when there is
 * none.
 */
static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	struct blk_expired_data expired = {
		.timeout_start = jiffies,
	};
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	/* check if there is any timed-out request */
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
	if (expired.has_timedout_rq) {
		/*
		 * Before walking tags, we must ensure any submit started
		 * before the current time has finished. Since the submit
		 * uses srcu or rcu, wait for a synchronization point to
		 * ensure all running submits have finished
		 */
		blk_mq_wait_quiesce_done(q->tag_set);

		/* Recompute the next deadline from scratch in the second pass. */
		expired.next = 0;
		blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
	}

	if (expired.next != 0) {
		/* Some request is still pending: re-arm for its deadline. */
		mod_timer(&q->timeout, expired.next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	/* Drop the reference taken with percpu_ref_tryget() above. */
	blk_queue_exit(q);
}
1758
/*
 * Argument bundle handed through sbitmap_for_each_set() to flush_busy_ctx()
 * by blk_mq_flush_busy_ctxs().
 */
struct flush_busy_ctx_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware queue whose software queues are drained */
        struct list_head *list;         /* list that receives the spliced requests */
};
1763
1764 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
1765 {
1766         struct flush_busy_ctx_data *flush_data = data;
1767         struct blk_mq_hw_ctx *hctx = flush_data->hctx;
1768         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1769         enum hctx_type type = hctx->type;
1770
1771         spin_lock(&ctx->lock);
1772         list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
1773         sbitmap_clear_bit(sb, bitnr);
1774         spin_unlock(&ctx->lock);
1775         return true;
1776 }
1777
1778 /*
1779  * Process software queues that have been marked busy, splicing them
1780  * to the for-dispatch
1781  */
1782 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1783 {
1784         struct flush_busy_ctx_data data = {
1785                 .hctx = hctx,
1786                 .list = list,
1787         };
1788
1789         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1790 }
1791
/*
 * Argument bundle handed through __sbitmap_for_each_set() to
 * dispatch_rq_from_ctx() by blk_mq_dequeue_from_ctx().
 */
struct dispatch_rq_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware queue being scanned */
        struct request *rq;             /* dequeued request, NULL until one is found */
};
1796
1797 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1798                 void *data)
1799 {
1800         struct dispatch_rq_data *dispatch_data = data;
1801         struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1802         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1803         enum hctx_type type = hctx->type;
1804
1805         spin_lock(&ctx->lock);
1806         if (!list_empty(&ctx->rq_lists[type])) {
1807                 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
1808                 list_del_init(&dispatch_data->rq->queuelist);
1809                 if (list_empty(&ctx->rq_lists[type]))
1810                         sbitmap_clear_bit(sb, bitnr);
1811         }
1812         spin_unlock(&ctx->lock);
1813
1814         return !dispatch_data->rq;
1815 }
1816
1817 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1818                                         struct blk_mq_ctx *start)
1819 {
1820         unsigned off = start ? start->index_hw[hctx->type] : 0;
1821         struct dispatch_rq_data data = {
1822                 .hctx = hctx,
1823                 .rq   = NULL,
1824         };
1825
1826         __sbitmap_for_each_set(&hctx->ctx_map, off,
1827                                dispatch_rq_from_ctx, &data);
1828
1829         return data.rq;
1830 }
1831
1832 bool __blk_mq_alloc_driver_tag(struct request *rq)
1833 {
1834         struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
1835         unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
1836         int tag;
1837
1838         blk_mq_tag_busy(rq->mq_hctx);
1839
1840         if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
1841                 bt = &rq->mq_hctx->tags->breserved_tags;
1842                 tag_offset = 0;
1843         } else {
1844                 if (!hctx_may_queue(rq->mq_hctx, bt))
1845                         return false;
1846         }
1847
1848         tag = __sbitmap_queue_get(bt);
1849         if (tag == BLK_MQ_NO_TAG)
1850                 return false;
1851
1852         rq->tag = tag + tag_offset;
1853         blk_mq_inc_active_requests(rq->mq_hctx);
1854         return true;
1855 }
1856
1857 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1858                                 int flags, void *key)
1859 {
1860         struct blk_mq_hw_ctx *hctx;
1861
1862         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1863
1864         spin_lock(&hctx->dispatch_wait_lock);
1865         if (!list_empty(&wait->entry)) {
1866                 struct sbitmap_queue *sbq;
1867
1868                 list_del_init(&wait->entry);
1869                 sbq = &hctx->tags->bitmap_tags;
1870                 atomic_dec(&sbq->ws_active);
1871         }
1872         spin_unlock(&hctx->dispatch_wait_lock);
1873
1874         blk_mq_run_hw_queue(hctx, true);
1875         return 1;
1876 }
1877
1878 /*
1879  * Mark us waiting for a tag. For shared tags, this involves hooking us into
1880  * the tag wakeups. For non-shared tags, we can simply mark us needing a
1881  * restart. For both cases, take care to check the condition again after
1882  * marking us as waiting.
1883  */
1884 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1885                                  struct request *rq)
1886 {
1887         struct sbitmap_queue *sbq;
1888         struct wait_queue_head *wq;
1889         wait_queue_entry_t *wait;
1890         bool ret;
1891
1892         if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
1893             !(blk_mq_is_shared_tags(hctx->flags))) {
1894                 blk_mq_sched_mark_restart_hctx(hctx);
1895
1896                 /*
1897                  * It's possible that a tag was freed in the window between the
1898                  * allocation failure and adding the hardware queue to the wait
1899                  * queue.
1900                  *
1901                  * Don't clear RESTART here, someone else could have set it.
1902                  * At most this will cost an extra queue run.
1903                  */
1904                 return blk_mq_get_driver_tag(rq);
1905         }
1906
1907         wait = &hctx->dispatch_wait;
1908         if (!list_empty_careful(&wait->entry))
1909                 return false;
1910
1911         if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
1912                 sbq = &hctx->tags->breserved_tags;
1913         else
1914                 sbq = &hctx->tags->bitmap_tags;
1915         wq = &bt_wait_ptr(sbq, hctx)->wait;
1916
1917         spin_lock_irq(&wq->lock);
1918         spin_lock(&hctx->dispatch_wait_lock);
1919         if (!list_empty(&wait->entry)) {
1920                 spin_unlock(&hctx->dispatch_wait_lock);
1921                 spin_unlock_irq(&wq->lock);
1922                 return false;
1923         }
1924
1925         atomic_inc(&sbq->ws_active);
1926         wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1927         __add_wait_queue(wq, wait);
1928
1929         /*
1930          * Add one explicit barrier since blk_mq_get_driver_tag() may
1931          * not imply barrier in case of failure.
1932          *
1933          * Order adding us to wait queue and allocating driver tag.
1934          *
1935          * The pair is the one implied in sbitmap_queue_wake_up() which
1936          * orders clearing sbitmap tag bits and waitqueue_active() in
1937          * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
1938          *
1939          * Otherwise, re-order of adding wait queue and getting driver tag
1940          * may cause __sbitmap_queue_wake_up() to wake up nothing because
1941          * the waitqueue_active() may not observe us in wait queue.
1942          */
1943         smp_mb();
1944
1945         /*
1946          * It's possible that a tag was freed in the window between the
1947          * allocation failure and adding the hardware queue to the wait
1948          * queue.
1949          */
1950         ret = blk_mq_get_driver_tag(rq);
1951         if (!ret) {
1952                 spin_unlock(&hctx->dispatch_wait_lock);
1953                 spin_unlock_irq(&wq->lock);
1954                 return false;
1955         }
1956
1957         /*
1958          * We got a tag, remove ourselves from the wait queue to ensure
1959          * someone else gets the wakeup.
1960          */
1961         list_del_init(&wait->entry);
1962         atomic_dec(&sbq->ws_active);
1963         spin_unlock(&hctx->dispatch_wait_lock);
1964         spin_unlock_irq(&wq->lock);
1965
1966         return true;
1967 }
1968
1969 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
1970 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
1971 /*
1972  * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1973  * - EWMA is one simple way to compute running average value
1974  * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1975  * - take 4 as factor for avoiding to get too small(0) result, and this
1976  *   factor doesn't matter because EWMA decreases exponentially
1977  */
1978 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1979 {
1980         unsigned int ewma;
1981
1982         ewma = hctx->dispatch_busy;
1983
1984         if (!ewma && !busy)
1985                 return;
1986
1987         ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1988         if (busy)
1989                 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1990         ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1991
1992         hctx->dispatch_busy = ewma;
1993 }
1994
/*
 * Delay that blk_mq_dispatch_rq_list() passes to
 * blk_mq_delay_run_hw_queue() when it schedules a delayed re-run of a
 * hardware queue.
 */
#define BLK_MQ_RESOURCE_DELAY   3               /* ms units */
1996
/*
 * Put @rq back at the front of @list and pass it to
 * __blk_mq_requeue_request(). Used by blk_mq_dispatch_rq_list() when
 * ->queue_rq() returns BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE.
 */
static void blk_mq_handle_dev_resource(struct request *rq,
                                       struct list_head *list)
{
        list_add(&rq->queuelist, list);
        __blk_mq_requeue_request(rq);
}
2003
/* Result of blk_mq_prep_dispatch_rq(). */
enum prep_dispatch {
        PREP_DISPATCH_OK,               /* driver tag (and budget, if requested) obtained */
        PREP_DISPATCH_NO_TAG,           /* no driver tag could be obtained */
        PREP_DISPATCH_NO_BUDGET,        /* blk_mq_get_dispatch_budget() returned a negative token */
};
2009
2010 static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
2011                                                   bool need_budget)
2012 {
2013         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2014         int budget_token = -1;
2015
2016         if (need_budget) {
2017                 budget_token = blk_mq_get_dispatch_budget(rq->q);
2018                 if (budget_token < 0) {
2019                         blk_mq_put_driver_tag(rq);
2020                         return PREP_DISPATCH_NO_BUDGET;
2021                 }
2022                 blk_mq_set_rq_budget_token(rq, budget_token);
2023         }
2024
2025         if (!blk_mq_get_driver_tag(rq)) {
2026                 /*
2027                  * The initial allocation attempt failed, so we need to
2028                  * rerun the hardware queue when a tag is freed. The
2029                  * waitqueue takes care of that. If the queue is run
2030                  * before we add this entry back on the dispatch list,
2031                  * we'll re-run it below.
2032                  */
2033                 if (!blk_mq_mark_tag_wait(hctx, rq)) {
2034                         /*
2035                          * All budgets not got from this function will be put
2036                          * together during handling partial dispatch
2037                          */
2038                         if (need_budget)
2039                                 blk_mq_put_dispatch_budget(rq->q, budget_token);
2040                         return PREP_DISPATCH_NO_TAG;
2041                 }
2042         }
2043
2044         return PREP_DISPATCH_OK;
2045 }
2046
2047 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
2048 static void blk_mq_release_budgets(struct request_queue *q,
2049                 struct list_head *list)
2050 {
2051         struct request *rq;
2052
2053         list_for_each_entry(rq, list, queuelist) {
2054                 int budget_token = blk_mq_get_rq_budget_token(rq);
2055
2056                 if (budget_token >= 0)
2057                         blk_mq_put_dispatch_budget(q, budget_token);
2058         }
2059 }
2060
2061 /*
2062  * blk_mq_commit_rqs will notify driver using bd->last that there is no
2063  * more requests. (See comment in struct blk_mq_ops for commit_rqs for
2064  * details)
2065  * Attention, we should explicitly call this in unusual cases:
2066  *  1) did not queue everything initially scheduled to queue
2067  *  2) the last attempt to queue a request failed
2068  */
2069 static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
2070                               bool from_schedule)
2071 {
2072         if (hctx->queue->mq_ops->commit_rqs && queued) {
2073                 trace_block_unplug(hctx->queue, queued, !from_schedule);
2074                 hctx->queue->mq_ops->commit_rqs(hctx);
2075         }
2076 }
2077
/*
 * Send the requests on @list to the driver through ->queue_rq().
 *
 * @hctx: hardware queue the requests belong to
 * @list: requests to dispatch; whatever could not be dispatched is moved to
 *        hctx->dispatch before returning
 * @nr_budgets: while non-zero, no budget is acquired for a request before
 *        it is dispatched; the count is decremented per request handed to
 *        the driver, and if requests are left over while it is still
 *        non-zero, blk_mq_release_budgets() is called on the remainder
 *
 * Returns true if we did some work AND can potentially do more, i.e. the
 * whole list was consumed. Returns false if @list was empty on entry or if
 * requests had to be pushed back onto hctx->dispatch.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                             unsigned int nr_budgets)
{
        enum prep_dispatch prep;
        struct request_queue *q = hctx->queue;
        struct request *rq;
        int queued;
        blk_status_t ret = BLK_STS_OK;
        bool needs_resource = false;

        if (list_empty(list))
                return false;

        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        do {
                struct blk_mq_queue_data bd;

                rq = list_first_entry(list, struct request, queuelist);

                WARN_ON_ONCE(hctx != rq->mq_hctx);
                /* Only acquire budget here if the caller did not pre-allocate it. */
                prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
                if (prep != PREP_DISPATCH_OK)
                        break;

                list_del_init(&rq->queuelist);

                bd.rq = rq;
                bd.last = list_empty(list);

                /*
                 * once the request is queued to lld, no need to cover the
                 * budget any more
                 */
                if (nr_budgets)
                        nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_STS_OK:
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
                        needs_resource = true;
                        fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        /* Put the request back on the list and stop dispatching. */
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
                default:
                        /* Any other status completes the request with that status. */
                        blk_mq_end_request(rq, ret);
                }
        } while (!list_empty(list));
out:
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
        if (!list_empty(list) || ret != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, queued, false);

        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
                bool needs_restart;
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
                        blk_mq_is_shared_tags(hctx->flags));

                if (nr_budgets)
                        blk_mq_release_budgets(q, list);

                spin_lock(&hctx->lock);
                list_splice_tail_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);

                /*
                 * Order adding requests to hctx->dispatch and checking
                 * SCHED_RESTART flag. The pair of this smp_mb() is the one
                 * in blk_mq_sched_restart(). Avoid restart code path to
                 * miss the new added requests to hctx->dispatch, meantime
                 * SCHED_RESTART is observed here.
                 */
                smp_mb();

                /*
                 * If SCHED_RESTART was set by the caller of this function and
                 * it is no longer set that means that it was cleared by another
                 * thread and hence that a queue rerun is needed.
                 *
                 * If 'no_tag' is set, that means that we failed getting
                 * a driver tag with an I/O scheduler attached. If our dispatch
                 * waitqueue is no longer active, ensure that we run the queue
                 * AFTER adding our entries back to the list.
                 *
                 * If no I/O scheduler has been configured it is possible that
                 * the hardware queue got stopped and restarted before requests
                 * were pushed back onto the dispatch list. Rerun the queue to
                 * avoid starvation. Notes:
                 * - blk_mq_run_hw_queue() checks whether or not a queue has
                 *   been stopped before rerunning a queue.
                 * - Some but not all block drivers stop a queue before
                 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                 *   and dm-rq.
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
                 * similar if we couldn't get budget or couldn't lock a zone
                 * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (prep == PREP_DISPATCH_NO_BUDGET)
                        needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
                else if (needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

                /* Leftover requests count as a busy sample. */
                blk_mq_update_dispatch_busy(hctx, true);
                return false;
        }

        /* Whole list consumed: record a non-busy sample. */
        blk_mq_update_dispatch_busy(hctx, false);
        return true;
}
2210
2211 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
2212 {
2213         int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
2214
2215         if (cpu >= nr_cpu_ids)
2216                 cpu = cpumask_first(hctx->cpumask);
2217         return cpu;
2218 }
2219
2220 /*
2221  * ->next_cpu is always calculated from hctx->cpumask, so simply use
2222  * it for speeding up the check
2223  */
2224 static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
2225 {
2226         return hctx->next_cpu >= nr_cpu_ids;
2227 }
2228
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU on which to queue the run work, or WORK_CPU_UNBOUND when
 * the queue has a single hardware queue, the hctx has no usable CPU, or no
 * online CPU was found after one re-selection. hctx->next_cpu and
 * hctx->next_cpu_batch are updated as a side effect.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
        bool tried = false;
        int next_cpu = hctx->next_cpu;

        /* Switch to unbound if no allowable CPUs in this hctx */
        if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
                return WORK_CPU_UNBOUND;

        /* Current batch used up: advance to the next online CPU in the mask. */
        if (--hctx->next_cpu_batch <= 0) {
select_cpu:
                next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
                                cpu_online_mask);
                /* Ran off the end of the mask: wrap around to the first CPU. */
                if (next_cpu >= nr_cpu_ids)
                        next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }

        /*
         * Do unbound schedule if we can't find a online CPU for this hctx,
         * and it should only happen in the path of handling CPU DEAD.
         */
        if (!cpu_online(next_cpu)) {
                /* Retry the selection exactly once before giving up. */
                if (!tried) {
                        tried = true;
                        goto select_cpu;
                }

                /*
                 * Make sure to re-select CPU next time once after CPUs
                 * in hctx->cpumask become online again.
                 */
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = 1;
                return WORK_CPU_UNBOUND;
        }

        hctx->next_cpu = next_cpu;
        return next_cpu;
}
2275
2276 /**
2277  * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
2278  * @hctx: Pointer to the hardware queue to run.
2279  * @msecs: Milliseconds of delay to wait before running the queue.
2280  *
2281  * Run a hardware queue asynchronously with a delay of @msecs.
2282  */
2283 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
2284 {
2285         if (unlikely(blk_mq_hctx_stopped(hctx)))
2286                 return;
2287         kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
2288                                     msecs_to_jiffies(msecs));
2289 }
2290 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
2291
/*
 * Decide whether @hctx should be run right now: true only if the request
 * queue is not quiesced and the hardware queue has pending work. The check
 * is evaluated inside __blk_mq_run_dispatch_ops().
 */
static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
{
        bool need_run;

        /*
         * When queue is quiesced, we may be switching io scheduler, or
         * updating nr_hw_queues, or other things, and we can't run queue
         * any more, even blk_mq_hctx_has_pending() can't be called safely.
         *
         * And queue will be rerun in blk_mq_unquiesce_queue() if it is
         * quiesced.
         */
        __blk_mq_run_dispatch_ops(hctx->queue, false,
                need_run = !blk_queue_quiesced(hctx->queue) &&
                blk_mq_hctx_has_pending(hctx));
        return need_run;
}
2309
2310 /**
2311  * blk_mq_run_hw_queue - Start to run a hardware queue.
2312  * @hctx: Pointer to the hardware queue to run.
2313  * @async: If we want to run the queue asynchronously.
2314  *
2315  * Check if the request queue is not in a quiesced state and if there are
2316  * pending requests to be sent. If this is true, run the queue to send requests
2317  * to hardware.
2318  */
2319 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
2320 {
2321         bool need_run;
2322
2323         /*
2324          * We can't run the queue inline with interrupts disabled.
2325          */
2326         WARN_ON_ONCE(!async && in_interrupt());
2327
2328         might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
2329
2330         need_run = blk_mq_hw_queue_need_run(hctx);
2331         if (!need_run) {
2332                 unsigned long flags;
2333
2334                 /*
2335                  * Synchronize with blk_mq_unquiesce_queue(), because we check
2336                  * if hw queue is quiesced locklessly above, we need the use
2337                  * ->queue_lock to make sure we see the up-to-date status to
2338                  * not miss rerunning the hw queue.
2339                  */
2340                 spin_lock_irqsave(&hctx->queue->queue_lock, flags);
2341                 need_run = blk_mq_hw_queue_need_run(hctx);
2342                 spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
2343
2344                 if (!need_run)
2345                         return;
2346         }
2347
2348         if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
2349                 blk_mq_delay_run_hw_queue(hctx, 0);
2350                 return;
2351         }
2352
2353         blk_mq_run_dispatch_ops(hctx->queue,
2354                                 blk_mq_sched_dispatch_requests(hctx));
2355 }
2356 EXPORT_SYMBOL(blk_mq_run_hw_queue);
2357
2358 /*
2359  * Return prefered queue to dispatch from (if any) for non-mq aware IO
2360  * scheduler.
2361  */
2362 static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
2363 {
2364         struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
2365         /*
2366          * If the IO scheduler does not respect hardware queues when
2367          * dispatching, we just don't bother with multiple HW queues and
2368          * dispatch from hctx for the current CPU since running multiple queues
2369          * just causes lock contention inside the scheduler and pointless cache
2370          * bouncing.
2371          */
2372         struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
2373
2374         if (!blk_mq_hctx_stopped(hctx))
2375                 return hctx;
2376         return NULL;
2377 }
2378
2379 /**
2380  * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
2381  * @q: Pointer to the request queue to run.
2382  * @async: If we want to run the queue asynchronously.
2383  */
2384 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
2385 {
2386         struct blk_mq_hw_ctx *hctx, *sq_hctx;
2387         unsigned long i;
2388
2389         sq_hctx = NULL;
2390         if (blk_queue_sq_sched(q))
2391                 sq_hctx = blk_mq_get_sq_hctx(q);
2392         queue_for_each_hw_ctx(q, hctx, i) {
2393                 if (blk_mq_hctx_stopped(hctx))
2394                         continue;
2395                 /*
2396                  * Dispatch from this hctx either if there's no hctx preferred
2397                  * by IO scheduler or if it has requests that bypass the
2398                  * scheduler.
2399                  */
2400                 if (!sq_hctx || sq_hctx == hctx ||
2401                     !list_empty_careful(&hctx->dispatch))
2402                         blk_mq_run_hw_queue(hctx, async);
2403         }
2404 }
2405 EXPORT_SYMBOL(blk_mq_run_hw_queues);
2406
2407 /**
2408  * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
2409  * @q: Pointer to the request queue to run.
2410  * @msecs: Milliseconds of delay to wait before running the queues.
2411  */
2412 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
2413 {
2414         struct blk_mq_hw_ctx *hctx, *sq_hctx;
2415         unsigned long i;
2416
2417         sq_hctx = NULL;
2418         if (blk_queue_sq_sched(q))
2419                 sq_hctx = blk_mq_get_sq_hctx(q);
2420         queue_for_each_hw_ctx(q, hctx, i) {
2421                 if (blk_mq_hctx_stopped(hctx))
2422                         continue;
2423                 /*
2424                  * If there is already a run_work pending, leave the
2425                  * pending delay untouched. Otherwise, a hctx can stall
2426                  * if another hctx is re-delaying the other's work
2427                  * before the work executes.
2428                  */
2429                 if (delayed_work_pending(&hctx->run_work))
2430                         continue;
2431                 /*
2432                  * Dispatch from this hctx either if there's no hctx preferred
2433                  * by IO scheduler or if it has requests that bypass the
2434                  * scheduler.
2435                  */
2436                 if (!sq_hctx || sq_hctx == hctx ||
2437                     !list_empty_careful(&hctx->dispatch))
2438                         blk_mq_delay_run_hw_queue(hctx, msecs);
2439         }
2440 }
2441 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
2442
/*
 * Cancel the delayed run work of @hctx and mark the hardware queue as
 * stopped by setting BLK_MQ_S_STOPPED.
 *
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        cancel_delayed_work(&hctx->run_work);

        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
2459
2460 /*
2461  * This function is often used for pausing .queue_rq() by driver when
2462  * there isn't enough resource or some conditions aren't satisfied, and
2463  * BLK_STS_RESOURCE is usually returned.
2464  *
2465  * We do not guarantee that dispatch can be drained or blocked
2466  * after blk_mq_stop_hw_queues() returns. Please use
2467  * blk_mq_quiesce_queue() for that requirement.
2468  */
2469 void blk_mq_stop_hw_queues(struct request_queue *q)
2470 {
2471         struct blk_mq_hw_ctx *hctx;
2472         unsigned long i;
2473
2474         queue_for_each_hw_ctx(q, hctx, i)
2475                 blk_mq_stop_hw_queue(hctx);
2476 }
2477 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
2478
2479 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
2480 {
2481         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
2482
2483         blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
2484 }
2485 EXPORT_SYMBOL(blk_mq_start_hw_queue);
2486
2487 void blk_mq_start_hw_queues(struct request_queue *q)
2488 {
2489         struct blk_mq_hw_ctx *hctx;
2490         unsigned long i;
2491
2492         queue_for_each_hw_ctx(q, hctx, i)
2493                 blk_mq_start_hw_queue(hctx);
2494 }
2495 EXPORT_SYMBOL(blk_mq_start_hw_queues);
2496
/*
 * Restart @hctx only if it is currently stopped: clear BLK_MQ_S_STOPPED and
 * run the hardware queue, asynchronously if @async is true. A hardware
 * queue that is not stopped is left untouched.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        if (!blk_mq_hctx_stopped(hctx))
                return;

        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
        /*
         * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
         * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
         * list in the subsequent routine.
         */
        smp_mb__after_atomic();
        blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
2512
2513 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
2514 {
2515         struct blk_mq_hw_ctx *hctx;
2516         unsigned long i;
2517
2518         queue_for_each_hw_ctx(q, hctx, i)
2519                 blk_mq_start_stopped_hw_queue(hctx, async ||
2520                                         (hctx->flags & BLK_MQ_F_BLOCKING));
2521 }
2522 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
2523
2524 static void blk_mq_run_work_fn(struct work_struct *work)
2525 {
2526         struct blk_mq_hw_ctx *hctx =
2527                 container_of(work, struct blk_mq_hw_ctx, run_work.work);
2528
2529         blk_mq_run_dispatch_ops(hctx->queue,
2530                                 blk_mq_sched_dispatch_requests(hctx));
2531 }
2532
2533 /**
2534  * blk_mq_request_bypass_insert - Insert a request at dispatch list.
2535  * @rq: Pointer to request to be inserted.
2536  * @flags: BLK_MQ_INSERT_*
2537  *
2538  * Should only be used carefully, when the caller knows we want to
2539  * bypass a potential IO scheduler on the target device.
2540  */
2541 static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
2542 {
2543         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2544
2545         spin_lock(&hctx->lock);
2546         if (flags & BLK_MQ_INSERT_AT_HEAD)
2547                 list_add(&rq->queuelist, &hctx->dispatch);
2548         else
2549                 list_add_tail(&rq->queuelist, &hctx->dispatch);
2550         spin_unlock(&hctx->lock);
2551 }
2552
2553 static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
2554                 struct blk_mq_ctx *ctx, struct list_head *list,
2555                 bool run_queue_async)
2556 {
2557         struct request *rq;
2558         enum hctx_type type = hctx->type;
2559
2560         /*
2561          * Try to issue requests directly if the hw queue isn't busy to save an
2562          * extra enqueue & dequeue to the sw queue.
2563          */
2564         if (!hctx->dispatch_busy && !run_queue_async) {
2565                 blk_mq_run_dispatch_ops(hctx->queue,
2566                         blk_mq_try_issue_list_directly(hctx, list));
2567                 if (list_empty(list))
2568                         goto out;
2569         }
2570
2571         /*
2572          * preemption doesn't flush plug list, so it's possible ctx->cpu is
2573          * offline now
2574          */
2575         list_for_each_entry(rq, list, queuelist) {
2576                 BUG_ON(rq->mq_ctx != ctx);
2577                 trace_block_rq_insert(rq);
2578                 if (rq->cmd_flags & REQ_NOWAIT)
2579                         run_queue_async = true;
2580         }
2581
2582         spin_lock(&ctx->lock);
2583         list_splice_tail_init(list, &ctx->rq_lists[type]);
2584         blk_mq_hctx_mark_pending(hctx, ctx);
2585         spin_unlock(&ctx->lock);
2586 out:
2587         blk_mq_run_hw_queue(hctx, run_queue_async);
2588 }
2589
2590 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
2591 {
2592         struct request_queue *q = rq->q;
2593         struct blk_mq_ctx *ctx = rq->mq_ctx;
2594         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2595
2596         if (blk_rq_is_passthrough(rq)) {
2597                 /*
2598                  * Passthrough request have to be added to hctx->dispatch
2599                  * directly.  The device may be in a situation where it can't
2600                  * handle FS request, and always returns BLK_STS_RESOURCE for
2601                  * them, which gets them added to hctx->dispatch.
2602                  *
2603                  * If a passthrough request is required to unblock the queues,
2604                  * and it is added to the scheduler queue, there is no chance to
2605                  * dispatch it given we prioritize requests in hctx->dispatch.
2606                  */
2607                 blk_mq_request_bypass_insert(rq, flags);
2608         } else if (req_op(rq) == REQ_OP_FLUSH) {
2609                 /*
2610                  * Firstly normal IO request is inserted to scheduler queue or
2611                  * sw queue, meantime we add flush request to dispatch queue(
2612                  * hctx->dispatch) directly and there is at most one in-flight
2613                  * flush request for each hw queue, so it doesn't matter to add
2614                  * flush request to tail or front of the dispatch queue.
2615                  *
2616                  * Secondly in case of NCQ, flush request belongs to non-NCQ
2617                  * command, and queueing it will fail when there is any
2618                  * in-flight normal IO request(NCQ command). When adding flush
2619                  * rq to the front of hctx->dispatch, it is easier to introduce
2620                  * extra time to flush rq's latency because of S_SCHED_RESTART
2621                  * compared with adding to the tail of dispatch queue, then
2622                  * chance of flush merge is increased, and less flush requests
2623                  * will be issued to controller. It is observed that ~10% time
2624                  * is saved in blktests block/004 on disk attached to AHCI/NCQ
2625                  * drive when adding flush rq to the front of hctx->dispatch.
2626                  *
2627                  * Simply queue flush rq to the front of hctx->dispatch so that
2628                  * intensive flush workloads can benefit in case of NCQ HW.
2629                  */
2630                 blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
2631         } else if (q->elevator) {
2632                 LIST_HEAD(list);
2633
2634                 WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);
2635
2636                 list_add(&rq->queuelist, &list);
2637                 q->elevator->type->ops.insert_requests(hctx, &list, flags);
2638         } else {
2639                 trace_block_rq_insert(rq);
2640
2641                 spin_lock(&ctx->lock);
2642                 if (flags & BLK_MQ_INSERT_AT_HEAD)
2643                         list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
2644                 else
2645                         list_add_tail(&rq->queuelist,
2646                                       &ctx->rq_lists[hctx->type]);
2647                 blk_mq_hctx_mark_pending(hctx, ctx);
2648                 spin_unlock(&ctx->lock);
2649         }
2650 }
2651
2652 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
2653                 unsigned int nr_segs)
2654 {
2655         int err;
2656
2657         if (bio->bi_opf & REQ_RAHEAD)
2658                 rq->cmd_flags |= REQ_FAILFAST_MASK;
2659
2660         rq->__sector = bio->bi_iter.bi_sector;
2661         blk_rq_bio_prep(rq, bio, nr_segs);
2662         if (bio_integrity(bio))
2663                 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
2664                                                                       bio);
2665
2666         /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2667         err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
2668         WARN_ON_ONCE(err);
2669
2670         blk_account_io_start(rq);
2671 }
2672
2673 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
2674                                             struct request *rq, bool last)
2675 {
2676         struct request_queue *q = rq->q;
2677         struct blk_mq_queue_data bd = {
2678                 .rq = rq,
2679                 .last = last,
2680         };
2681         blk_status_t ret;
2682
2683         /*
2684          * For OK queue, we are done. For error, caller may kill it.
2685          * Any other error (busy), just add it to our list as we
2686          * previously would have done.
2687          */
2688         ret = q->mq_ops->queue_rq(hctx, &bd);
2689         switch (ret) {
2690         case BLK_STS_OK:
2691                 blk_mq_update_dispatch_busy(hctx, false);
2692                 break;
2693         case BLK_STS_RESOURCE:
2694         case BLK_STS_DEV_RESOURCE:
2695                 blk_mq_update_dispatch_busy(hctx, true);
2696                 __blk_mq_requeue_request(rq);
2697                 break;
2698         default:
2699                 blk_mq_update_dispatch_busy(hctx, false);
2700                 break;
2701         }
2702
2703         return ret;
2704 }
2705
2706 static bool blk_mq_get_budget_and_tag(struct request *rq)
2707 {
2708         int budget_token;
2709
2710         budget_token = blk_mq_get_dispatch_budget(rq->q);
2711         if (budget_token < 0)
2712                 return false;
2713         blk_mq_set_rq_budget_token(rq, budget_token);
2714         if (!blk_mq_get_driver_tag(rq)) {
2715                 blk_mq_put_dispatch_budget(rq->q, budget_token);
2716                 return false;
2717         }
2718         return true;
2719 }
2720
2721 /**
2722  * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2723  * @hctx: Pointer of the associated hardware queue.
2724  * @rq: Pointer to request to be sent.
2725  *
2726  * If the device has enough resources to accept a new request now, send the
2727  * request directly to device driver. Else, insert at hctx->dispatch queue, so
2728  * we can try send it another time in the future. Requests inserted at this
2729  * queue have higher priority.
2730  */
2731 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2732                 struct request *rq)
2733 {
2734         blk_status_t ret;
2735
2736         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2737                 blk_mq_insert_request(rq, 0);
2738                 blk_mq_run_hw_queue(hctx, false);
2739                 return;
2740         }
2741
2742         if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
2743                 blk_mq_insert_request(rq, 0);
2744                 blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
2745                 return;
2746         }
2747
2748         ret = __blk_mq_issue_directly(hctx, rq, true);
2749         switch (ret) {
2750         case BLK_STS_OK:
2751                 break;
2752         case BLK_STS_RESOURCE:
2753         case BLK_STS_DEV_RESOURCE:
2754                 blk_mq_request_bypass_insert(rq, 0);
2755                 blk_mq_run_hw_queue(hctx, false);
2756                 break;
2757         default:
2758                 blk_mq_end_request(rq, ret);
2759                 break;
2760         }
2761 }
2762
2763 static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2764 {
2765         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2766
2767         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
2768                 blk_mq_insert_request(rq, 0);
2769                 blk_mq_run_hw_queue(hctx, false);
2770                 return BLK_STS_OK;
2771         }
2772
2773         if (!blk_mq_get_budget_and_tag(rq))
2774                 return BLK_STS_RESOURCE;
2775         return __blk_mq_issue_directly(hctx, rq, last);
2776 }
2777
2778 static void blk_mq_plug_issue_direct(struct blk_plug *plug)
2779 {
2780         struct blk_mq_hw_ctx *hctx = NULL;
2781         struct request *rq;
2782         int queued = 0;
2783         blk_status_t ret = BLK_STS_OK;
2784
2785         while ((rq = rq_list_pop(&plug->mq_list))) {
2786                 bool last = rq_list_empty(&plug->mq_list);
2787
2788                 if (hctx != rq->mq_hctx) {
2789                         if (hctx) {
2790                                 blk_mq_commit_rqs(hctx, queued, false);
2791                                 queued = 0;
2792                         }
2793                         hctx = rq->mq_hctx;
2794                 }
2795
2796                 ret = blk_mq_request_issue_directly(rq, last);
2797                 switch (ret) {
2798                 case BLK_STS_OK:
2799                         queued++;
2800                         break;
2801                 case BLK_STS_RESOURCE:
2802                 case BLK_STS_DEV_RESOURCE:
2803                         blk_mq_request_bypass_insert(rq, 0);
2804                         blk_mq_run_hw_queue(hctx, false);
2805                         goto out;
2806                 default:
2807                         blk_mq_end_request(rq, ret);
2808                         break;
2809                 }
2810         }
2811
2812 out:
2813         if (ret != BLK_STS_OK)
2814                 blk_mq_commit_rqs(hctx, queued, false);
2815 }
2816
2817 static void __blk_mq_flush_plug_list(struct request_queue *q,
2818                                      struct blk_plug *plug)
2819 {
2820         if (blk_queue_quiesced(q))
2821                 return;
2822         q->mq_ops->queue_rqs(&plug->mq_list);
2823 }
2824
2825 static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
2826 {
2827         struct blk_mq_hw_ctx *this_hctx = NULL;
2828         struct blk_mq_ctx *this_ctx = NULL;
2829         struct rq_list requeue_list = {};
2830         unsigned int depth = 0;
2831         bool is_passthrough = false;
2832         LIST_HEAD(list);
2833
2834         do {
2835                 struct request *rq = rq_list_pop(&plug->mq_list);
2836
2837                 if (!this_hctx) {
2838                         this_hctx = rq->mq_hctx;
2839                         this_ctx = rq->mq_ctx;
2840                         is_passthrough = blk_rq_is_passthrough(rq);
2841                 } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
2842                            is_passthrough != blk_rq_is_passthrough(rq)) {
2843                         rq_list_add_tail(&requeue_list, rq);
2844                         continue;
2845                 }
2846                 list_add_tail(&rq->queuelist, &list);
2847                 depth++;
2848         } while (!rq_list_empty(&plug->mq_list));
2849
2850         plug->mq_list = requeue_list;
2851         trace_block_unplug(this_hctx->queue, depth, !from_sched);
2852
2853         percpu_ref_get(&this_hctx->queue->q_usage_counter);
2854         /* passthrough requests should never be issued to the I/O scheduler */
2855         if (is_passthrough) {
2856                 spin_lock(&this_hctx->lock);
2857                 list_splice_tail_init(&list, &this_hctx->dispatch);
2858                 spin_unlock(&this_hctx->lock);
2859                 blk_mq_run_hw_queue(this_hctx, from_sched);
2860         } else if (this_hctx->queue->elevator) {
2861                 this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
2862                                 &list, 0);
2863                 blk_mq_run_hw_queue(this_hctx, from_sched);
2864         } else {
2865                 blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
2866         }
2867         percpu_ref_put(&this_hctx->queue->q_usage_counter);
2868 }
2869
2870 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2871 {
2872         struct request *rq;
2873         unsigned int depth;
2874
2875         /*
2876          * We may have been called recursively midway through handling
2877          * plug->mq_list via a schedule() in the driver's queue_rq() callback.
2878          * To avoid mq_list changing under our feet, clear rq_count early and
2879          * bail out specifically if rq_count is 0 rather than checking
2880          * whether the mq_list is empty.
2881          */
2882         if (plug->rq_count == 0)
2883                 return;
2884         depth = plug->rq_count;
2885         plug->rq_count = 0;
2886
2887         if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
2888                 struct request_queue *q;
2889
2890                 rq = rq_list_peek(&plug->mq_list);
2891                 q = rq->q;
2892                 trace_block_unplug(q, depth, true);
2893
2894                 /*
2895                  * Peek first request and see if we have a ->queue_rqs() hook.
2896                  * If we do, we can dispatch the whole plug list in one go. We
2897                  * already know at this point that all requests belong to the
2898                  * same queue, caller must ensure that's the case.
2899                  */
2900                 if (q->mq_ops->queue_rqs) {
2901                         blk_mq_run_dispatch_ops(q,
2902                                 __blk_mq_flush_plug_list(q, plug));
2903                         if (rq_list_empty(&plug->mq_list))
2904                                 return;
2905                 }
2906
2907                 blk_mq_run_dispatch_ops(q,
2908                                 blk_mq_plug_issue_direct(plug));
2909                 if (rq_list_empty(&plug->mq_list))
2910                         return;
2911         }
2912
2913         do {
2914                 blk_mq_dispatch_plug_list(plug, from_schedule);
2915         } while (!rq_list_empty(&plug->mq_list));
2916 }
2917
2918 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
2919                 struct list_head *list)
2920 {
2921         int queued = 0;
2922         blk_status_t ret = BLK_STS_OK;
2923
2924         while (!list_empty(list)) {
2925                 struct request *rq = list_first_entry(list, struct request,
2926                                 queuelist);
2927
2928                 list_del_init(&rq->queuelist);
2929                 ret = blk_mq_request_issue_directly(rq, list_empty(list));
2930                 switch (ret) {
2931                 case BLK_STS_OK:
2932                         queued++;
2933                         break;
2934                 case BLK_STS_RESOURCE:
2935                 case BLK_STS_DEV_RESOURCE:
2936                         blk_mq_request_bypass_insert(rq, 0);
2937                         if (list_empty(list))
2938                                 blk_mq_run_hw_queue(hctx, false);
2939                         goto out;
2940                 default:
2941                         blk_mq_end_request(rq, ret);
2942                         break;
2943                 }
2944         }
2945
2946 out:
2947         if (ret != BLK_STS_OK)
2948                 blk_mq_commit_rqs(hctx, queued, false);
2949 }
2950
2951 static bool blk_mq_attempt_bio_merge(struct request_queue *q,
2952                                      struct bio *bio, unsigned int nr_segs)
2953 {
2954         if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
2955                 if (blk_attempt_plug_merge(q, bio, nr_segs))
2956                         return true;
2957                 if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2958                         return true;
2959         }
2960         return false;
2961 }
2962
2963 static struct request *blk_mq_get_new_requests(struct request_queue *q,
2964                                                struct blk_plug *plug,
2965                                                struct bio *bio,
2966                                                unsigned int nsegs)
2967 {
2968         struct blk_mq_alloc_data data = {
2969                 .q              = q,
2970                 .nr_tags        = 1,
2971                 .cmd_flags      = bio->bi_opf,
2972         };
2973         struct request *rq;
2974
2975         rq_qos_throttle(q, bio);
2976
2977         if (plug) {
2978                 data.nr_tags = plug->nr_ios;
2979                 plug->nr_ios = 1;
2980                 data.cached_rqs = &plug->cached_rqs;
2981         }
2982
2983         rq = __blk_mq_alloc_requests(&data);
2984         if (rq)
2985                 return rq;
2986         rq_qos_cleanup(q, bio);
2987         if (bio->bi_opf & REQ_NOWAIT)
2988                 bio_wouldblock_error(bio);
2989         return NULL;
2990 }
2991
2992 /*
2993  * Check if there is a suitable cached request and return it.
2994  */
2995 static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
2996                 struct request_queue *q, blk_opf_t opf)
2997 {
2998         enum hctx_type type = blk_mq_get_hctx_type(opf);
2999         struct request *rq;
3000
3001         if (!plug)
3002                 return NULL;
3003         rq = rq_list_peek(&plug->cached_rqs);
3004         if (!rq || rq->q != q)
3005                 return NULL;
3006         if (type != rq->mq_hctx->type &&
3007             (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
3008                 return NULL;
3009         if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
3010                 return NULL;
3011         return rq;
3012 }
3013
3014 static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
3015                 struct bio *bio)
3016 {
3017         if (rq_list_pop(&plug->cached_rqs) != rq)
3018                 WARN_ON_ONCE(1);
3019
3020         /*
3021          * If any qos ->throttle() end up blocking, we will have flushed the
3022          * plug and hence killed the cached_rq list as well. Pop this entry
3023          * before we throttle.
3024          */
3025         rq_qos_throttle(rq->q, bio);
3026
3027         blk_mq_rq_time_init(rq, blk_time_get_ns());
3028         rq->cmd_flags = bio->bi_opf;
3029         INIT_LIST_HEAD(&rq->queuelist);
3030 }
3031
3032 static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
3033 {
3034         unsigned int bs_mask = queue_logical_block_size(q) - 1;
3035
3036         /* .bi_sector of any zero sized bio need to be initialized */
3037         if ((bio->bi_iter.bi_size & bs_mask) ||
3038             ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask))
3039                 return true;
3040         return false;
3041 }
3042
3043 /**
3044  * blk_mq_submit_bio - Create and send a request to block device.
3045  * @bio: Bio pointer.
3046  *
3047  * Builds up a request structure from @q and @bio and send to the device. The
3048  * request may not be queued directly to hardware if:
3049  * * This request can be merged with another one
3050  * * We want to place request at plug queue for possible future merging
3051  * * There is an IO scheduler active at this queue
3052  *
3053  * It will not queue the request if there is an error with the bio, or at the
3054  * request creation.
3055  */
3056 void blk_mq_submit_bio(struct bio *bio)
3057 {
3058         struct request_queue *q = bdev_get_queue(bio->bi_bdev);
3059         struct blk_plug *plug = current->plug;
3060         const int is_sync = op_is_sync(bio->bi_opf);
3061         struct blk_mq_hw_ctx *hctx;
3062         unsigned int nr_segs;
3063         struct request *rq;
3064         blk_status_t ret;
3065
3066         /*
3067          * If the plug has a cached request for this queue, try to use it.
3068          */
3069         rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
3070
3071         /*
3072          * A BIO that was released from a zone write plug has already been
3073          * through the preparation in this function, already holds a reference
3074          * on the queue usage counter, and is the only write BIO in-flight for
3075          * the target zone. Go straight to preparing a request for it.
3076          */
3077         if (bio_zone_write_plugging(bio)) {
3078                 nr_segs = bio->__bi_nr_segments;
3079                 if (rq)
3080                         blk_queue_exit(q);
3081                 goto new_request;
3082         }
3083
3084         bio = blk_queue_bounce(bio, q);
3085
3086         /*
3087          * The cached request already holds a q_usage_counter reference and we
3088          * don't have to acquire a new one if we use it.
3089          */
3090         if (!rq) {
3091                 if (unlikely(bio_queue_enter(bio)))
3092                         return;
3093         }
3094
3095         /*
3096          * Device reconfiguration may change logical block size, so alignment
3097          * check has to be done with queue usage counter held
3098          */
3099         if (unlikely(bio_unaligned(bio, q))) {
3100                 bio_io_error(bio);
3101                 goto queue_exit;
3102         }
3103
3104         bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
3105         if (!bio)
3106                 goto queue_exit;
3107
3108         if (!bio_integrity_prep(bio))
3109                 goto queue_exit;
3110
3111         if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
3112                 goto queue_exit;
3113
3114         if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
3115                 goto queue_exit;
3116
3117 new_request:
3118         if (!rq) {
3119                 rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
3120                 if (unlikely(!rq))
3121                         goto queue_exit;
3122         } else {
3123                 blk_mq_use_cached_rq(rq, plug, bio);
3124         }
3125
3126         trace_block_getrq(bio);
3127
3128         rq_qos_track(q, rq, bio);
3129
3130         blk_mq_bio_to_request(rq, bio, nr_segs);
3131
3132         ret = blk_crypto_rq_get_keyslot(rq);
3133         if (ret != BLK_STS_OK) {
3134                 bio->bi_status = ret;
3135                 bio_endio(bio);
3136                 blk_mq_free_request(rq);
3137                 return;
3138         }
3139
3140         if (bio_zone_write_plugging(bio))
3141                 blk_zone_write_plug_init_request(rq);
3142
3143         if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
3144                 return;
3145
3146         if (plug) {
3147                 blk_add_rq_to_plug(plug, rq);
3148                 return;
3149         }
3150
3151         hctx = rq->mq_hctx;
3152         if ((rq->rq_flags & RQF_USE_SCHED) ||
3153             (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
3154                 blk_mq_insert_request(rq, 0);
3155                 blk_mq_run_hw_queue(hctx, true);
3156         } else {
3157                 blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
3158         }
3159         return;
3160
3161 queue_exit:
3162         /*
3163          * Don't drop the queue reference if we were trying to use a cached
3164          * request and thus didn't acquire one.
3165          */
3166         if (!rq)
3167                 blk_queue_exit(q);
3168 }
3169
3170 #ifdef CONFIG_BLK_MQ_STACKING
3171 /**
3172  * blk_insert_cloned_request - Helper for stacking drivers to submit a request
3173  * @rq: the request being queued
3174  */
3175 blk_status_t blk_insert_cloned_request(struct request *rq)
3176 {
3177         struct request_queue *q = rq->q;
3178         unsigned int max_sectors = blk_queue_get_max_sectors(rq);
3179         unsigned int max_segments = blk_rq_get_max_segments(rq);
3180         blk_status_t ret;
3181
3182         if (blk_rq_sectors(rq) > max_sectors) {
3183                 /*
3184                  * SCSI device does not have a good way to return if
3185                  * Write Same/Zero is actually supported. If a device rejects
3186                  * a non-read/write command (discard, write same,etc.) the
3187                  * low-level device driver will set the relevant queue limit to
3188                  * 0 to prevent blk-lib from issuing more of the offending
3189                  * operations. Commands queued prior to the queue limit being
3190                  * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
3191                  * errors being propagated to upper layers.
3192                  */
3193                 if (max_sectors == 0)
3194                         return BLK_STS_NOTSUPP;
3195
3196                 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
3197                         __func__, blk_rq_sectors(rq), max_sectors);
3198                 return BLK_STS_IOERR;
3199         }
3200
3201         /*
3202          * The queue settings related to segment counting may differ from the
3203          * original queue.
3204          */
3205         rq->nr_phys_segments = blk_recalc_rq_segments(rq);
3206         if (rq->nr_phys_segments > max_segments) {
3207                 printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
3208                         __func__, rq->nr_phys_segments, max_segments);
3209                 return BLK_STS_IOERR;
3210         }
3211
3212         if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
3213                 return BLK_STS_IOERR;
3214
3215         ret = blk_crypto_rq_get_keyslot(rq);
3216         if (ret != BLK_STS_OK)
3217                 return ret;
3218
3219         blk_account_io_start(rq);
3220
3221         /*
3222          * Since we have a scheduler attached on the top device,
3223          * bypass a potential scheduler on the bottom device for
3224          * insert.
3225          */
3226         blk_mq_run_dispatch_ops(q,
3227                         ret = blk_mq_request_issue_directly(rq, true));
3228         if (ret)
3229                 blk_account_io_done(rq, blk_time_get_ns());
3230         return ret;
3231 }
3232 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
3233
3234 /**
3235  * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
3236  * @rq: the clone request to be cleaned up
3237  *
3238  * Description:
3239  *     Free all bios in @rq for a cloned request.
3240  */
3241 void blk_rq_unprep_clone(struct request *rq)
3242 {
3243         struct bio *bio;
3244
3245         while ((bio = rq->bio) != NULL) {
3246                 rq->bio = bio->bi_next;
3247
3248                 bio_put(bio);
3249         }
3250 }
3251 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3252
3253 /**
3254  * blk_rq_prep_clone - Helper function to setup clone request
3255  * @rq: the request to be setup
3256  * @rq_src: original request to be cloned
3257  * @bs: bio_set that bios for clone are allocated from
3258  * @gfp_mask: memory allocation mask for bio
3259  * @bio_ctr: setup function to be called for each clone bio.
3260  *           Returns %0 for success, non %0 for failure.
3261  * @data: private data to be passed to @bio_ctr
3262  *
3263  * Description:
3264  *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3265  *     Also, pages which the original bios are pointing to are not copied
3266  *     and the cloned bios just point same pages.
3267  *     So cloned bios must be completed before original bios, which means
3268  *     the caller must complete @rq before @rq_src.
3269  */
3270 int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
3271                       struct bio_set *bs, gfp_t gfp_mask,
3272                       int (*bio_ctr)(struct bio *, struct bio *, void *),
3273                       void *data)
3274 {
3275         struct bio *bio_src;
3276
3277         if (!bs)
3278                 bs = &fs_bio_set;
3279
3280         __rq_for_each_bio(bio_src, rq_src) {
3281                 struct bio *bio  = bio_alloc_clone(rq->q->disk->part0, bio_src,
3282                                         gfp_mask, bs);
3283                 if (!bio)
3284                         goto free_and_out;
3285
3286                 if (bio_ctr && bio_ctr(bio, bio_src, data)) {
3287                         bio_put(bio);
3288                         goto free_and_out;
3289                 }
3290
3291                 if (rq->bio) {
3292                         rq->biotail->bi_next = bio;
3293                         rq->biotail = bio;
3294                 } else {
3295                         rq->bio = rq->biotail = bio;
3296                 }
3297         }
3298
3299         /* Copy attributes of the original request to the clone request. */
3300         rq->__sector = blk_rq_pos(rq_src);
3301         rq->__data_len = blk_rq_bytes(rq_src);
3302         if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
3303                 rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
3304                 rq->special_vec = rq_src->special_vec;
3305         }
3306         rq->nr_phys_segments = rq_src->nr_phys_segments;
3307
3308         if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
3309                 goto free_and_out;
3310
3311         return 0;
3312
3313 free_and_out:
3314         blk_rq_unprep_clone(rq);
3315
3316         return -ENOMEM;
3317 }
3318 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
3319 #endif /* CONFIG_BLK_MQ_STACKING */
3320
3321 /*
3322  * Steal bios from a request and add them to a bio list.
3323  * The request must not have been partially completed before.
3324  */
3325 void blk_steal_bios(struct bio_list *list, struct request *rq)
3326 {
3327         if (rq->bio) {
3328                 if (list->tail)
3329                         list->tail->bi_next = rq->bio;
3330                 else
3331                         list->head = rq->bio;
3332                 list->tail = rq->biotail;
3333
3334                 rq->bio = NULL;
3335                 rq->biotail = NULL;
3336         }
3337
3338         rq->__data_len = 0;
3339 }
3340 EXPORT_SYMBOL_GPL(blk_steal_bios);
3341
3342 static size_t order_to_size(unsigned int order)
3343 {
3344         return (size_t)PAGE_SIZE << order;
3345 }
3346
3347 /* called before freeing request pool in @tags */
3348 static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
3349                                     struct blk_mq_tags *tags)
3350 {
3351         struct page *page;
3352         unsigned long flags;
3353
3354         /*
3355          * There is no need to clear mapping if driver tags is not initialized
3356          * or the mapping belongs to the driver tags.
3357          */
3358         if (!drv_tags || drv_tags == tags)
3359                 return;
3360
3361         list_for_each_entry(page, &tags->page_list, lru) {
3362                 unsigned long start = (unsigned long)page_address(page);
3363                 unsigned long end = start + order_to_size(page->private);
3364                 int i;
3365
3366                 for (i = 0; i < drv_tags->nr_tags; i++) {
3367                         struct request *rq = drv_tags->rqs[i];
3368                         unsigned long rq_addr = (unsigned long)rq;
3369
3370                         if (rq_addr >= start && rq_addr < end) {
3371                                 WARN_ON_ONCE(req_ref_read(rq) != 0);
3372                                 cmpxchg(&drv_tags->rqs[i], rq, NULL);
3373                         }
3374                 }
3375         }
3376
3377         /*
3378          * Wait until all pending iteration is done.
3379          *
3380          * Request reference is cleared and it is guaranteed to be observed
3381          * after the ->lock is released.
3382          */
3383         spin_lock_irqsave(&drv_tags->lock, flags);
3384         spin_unlock_irqrestore(&drv_tags->lock, flags);
3385 }
3386
3387 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
3388                      unsigned int hctx_idx)
3389 {
3390         struct blk_mq_tags *drv_tags;
3391         struct page *page;
3392
3393         if (list_empty(&tags->page_list))
3394                 return;
3395
3396         if (blk_mq_is_shared_tags(set->flags))
3397                 drv_tags = set->shared_tags;
3398         else
3399                 drv_tags = set->tags[hctx_idx];
3400
3401         if (tags->static_rqs && set->ops->exit_request) {
3402                 int i;
3403
3404                 for (i = 0; i < tags->nr_tags; i++) {
3405                         struct request *rq = tags->static_rqs[i];
3406
3407                         if (!rq)
3408                                 continue;
3409                         set->ops->exit_request(set, rq, hctx_idx);
3410                         tags->static_rqs[i] = NULL;
3411                 }
3412         }
3413
3414         blk_mq_clear_rq_mapping(drv_tags, tags);
3415
3416         while (!list_empty(&tags->page_list)) {
3417                 page = list_first_entry(&tags->page_list, struct page, lru);
3418                 list_del_init(&page->lru);
3419                 /*
3420                  * Remove kmemleak object previously allocated in
3421                  * blk_mq_alloc_rqs().
3422                  */
3423                 kmemleak_free(page_address(page));
3424                 __free_pages(page, page->private);
3425         }
3426 }
3427
3428 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
3429 {
3430         kfree(tags->rqs);
3431         tags->rqs = NULL;
3432         kfree(tags->static_rqs);
3433         tags->static_rqs = NULL;
3434
3435         blk_mq_free_tags(tags);
3436 }
3437
3438 static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
3439                 unsigned int hctx_idx)
3440 {
3441         int i;
3442
3443         for (i = 0; i < set->nr_maps; i++) {
3444                 unsigned int start = set->map[i].queue_offset;
3445                 unsigned int end = start + set->map[i].nr_queues;
3446
3447                 if (hctx_idx >= start && hctx_idx < end)
3448                         break;
3449         }
3450
3451         if (i >= set->nr_maps)
3452                 i = HCTX_TYPE_DEFAULT;
3453
3454         return i;
3455 }
3456
3457 static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
3458                 unsigned int hctx_idx)
3459 {
3460         enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
3461
3462         return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
3463 }
3464
3465 static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
3466                                                unsigned int hctx_idx,
3467                                                unsigned int nr_tags,
3468                                                unsigned int reserved_tags)
3469 {
3470         int node = blk_mq_get_hctx_node(set, hctx_idx);
3471         struct blk_mq_tags *tags;
3472
3473         if (node == NUMA_NO_NODE)
3474                 node = set->numa_node;
3475
3476         tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
3477                                 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
3478         if (!tags)
3479                 return NULL;
3480
3481         tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3482                                  GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3483                                  node);
3484         if (!tags->rqs)
3485                 goto err_free_tags;
3486
3487         tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
3488                                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
3489                                         node);
3490         if (!tags->static_rqs)
3491                 goto err_free_rqs;
3492
3493         return tags;
3494
3495 err_free_rqs:
3496         kfree(tags->rqs);
3497 err_free_tags:
3498         blk_mq_free_tags(tags);
3499         return NULL;
3500 }
3501
3502 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
3503                                unsigned int hctx_idx, int node)
3504 {
3505         int ret;
3506
3507         if (set->ops->init_request) {
3508                 ret = set->ops->init_request(set, rq, hctx_idx, node);
3509                 if (ret)
3510                         return ret;
3511         }
3512
3513         WRITE_ONCE(rq->state, MQ_RQ_IDLE);
3514         return 0;
3515 }
3516
/*
 * Allocate and initialise the @depth static requests of @tags.
 *
 * Pages are allocated on the NUMA node of hardware queue @hctx_idx
 * (set->numa_node when the hctx has none), linked on tags->page_list and
 * carved into cacheline-rounded slots that are recorded in
 * tags->static_rqs[].  Every slot is passed through blk_mq_init_request().
 *
 * Returns 0 on success.  On a failed page allocation or a failed request
 * initialisation, everything set up so far is released through
 * blk_mq_free_rqs() and -ENOMEM is returned (the driver's own error code,
 * if any, is not propagated).
 */
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
                            struct blk_mq_tags *tags,
                            unsigned int hctx_idx, unsigned int depth)
{
        /* max_order: largest page order tried for a single allocation */
        unsigned int i, j, entries_per_page, max_order = 4;
        int node = blk_mq_get_hctx_node(set, hctx_idx);
        size_t rq_size, left;

        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        INIT_LIST_HEAD(&tags->page_list);

        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
        /* left: bytes of request storage still to be backed by pages */
        left = rq_size * depth;

        /* i counts initialised requests; it only advances in the inner loop */
        for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
                void *p;

                /*
                 * Avoid over-allocating: lower the order while the
                 * remaining bytes are smaller than the next lower order.
                 */
                while (this_order && left < order_to_size(this_order - 1))
                        this_order--;

                /*
                 * Try the chosen order and fall back to smaller ones on
                 * failure.  Give up (page stays NULL) once order 0 has
                 * failed or the next order could not hold one request.
                 */
                do {
                        page = alloc_pages_node(node,
                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
                        if (!this_order--)
                                break;
                        if (order_to_size(this_order) < rq_size)
                                break;
                } while (1);

                if (!page)
                        goto fail;

                /* Remember the order; the free path reads it back. */
                page->private = this_order;
                list_add_tail(&page->lru, &tags->page_list);

                p = page_address(page);
                /*
                 * Allow kmemleak to scan these pages as they contain pointers
                 * to additional allocations like via ops->init_request().
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
                /* Fill this block, but never beyond the requested depth. */
                to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
                        struct request *rq = p;

                        tags->static_rqs[i] = rq;
                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
                                /*
                                 * Clear the slot so blk_mq_free_rqs() does
                                 * not call ->exit_request() on a request
                                 * whose initialisation failed.
                                 */
                                tags->static_rqs[i] = NULL;
                                goto fail;
                        }

                        p += rq_size;
                        i++;
                }
        }
        return 0;

fail:
        blk_mq_free_rqs(set, tags, hctx_idx);
        return -ENOMEM;
}
3593
/* Search state handed to blk_mq_has_request() as its @data argument. */
struct rq_iter_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware queue being looked for */
        bool has_rq;                    /* set when a request on @hctx is seen */
};
3598
3599 static bool blk_mq_has_request(struct request *rq, void *data)
3600 {
3601         struct rq_iter_data *iter_data = data;
3602
3603         if (rq->mq_hctx != iter_data->hctx)
3604                 return true;
3605         iter_data->has_rq = true;
3606         return false;
3607 }
3608
3609 static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
3610 {
3611         struct blk_mq_tags *tags = hctx->sched_tags ?
3612                         hctx->sched_tags : hctx->tags;
3613         struct rq_iter_data data = {
3614                 .hctx   = hctx,
3615         };
3616
3617         blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
3618         return data.has_rq;
3619 }
3620
3621 static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
3622                 unsigned int this_cpu)
3623 {
3624         enum hctx_type type = hctx->type;
3625         int cpu;
3626
3627         /*
3628          * hctx->cpumask has to rule out isolated CPUs, but userspace still
3629          * might submit IOs on these isolated CPUs, so use the queue map to
3630          * check if all CPUs mapped to this hctx are offline
3631          */
3632         for_each_online_cpu(cpu) {
3633                 struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
3634                                 type, cpu);
3635
3636                 if (h != hctx)
3637                         continue;
3638
3639                 /* this hctx has at least one online CPU */
3640                 if (this_cpu != cpu)
3641                         return true;
3642         }
3643
3644         return false;
3645 }
3646
/*
 * Handle @cpu going offline for the hctx that embeds @node as its
 * cpuhp_online member.
 *
 * If another online CPU still maps to the hctx nothing is done.
 * Otherwise the hctx is marked inactive and, provided a queue usage
 * reference can be taken, the function sleeps in 5ms steps until the
 * hctx has no requests left.  Always returns 0.
 */
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        /* Some other online CPU keeps this hctx in use. */
        if (blk_mq_hctx_has_online_cpu(hctx, cpu))
                return 0;

        /*
         * Prevent new request from being allocated on the current hctx.
         *
         * The smp_mb__after_atomic() Pairs with the implied barrier in
         * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
         * seen once we return from the tag allocator.
         */
        set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        smp_mb__after_atomic();

        /*
         * Try to grab a reference to the queue and wait for any outstanding
         * requests.  If we could not grab a reference the queue has been
         * frozen and there are no requests.
         */
        if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
                /* Poll rather than block on a completion. */
                while (blk_mq_hctx_has_requests(hctx))
                        msleep(5);
                percpu_ref_put(&hctx->queue->q_usage_counter);
        }

        return 0;
}
3678
3679 /*
3680  * Check if one CPU is mapped to the specified hctx
3681  *
3682  * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed
3683  * to be used for scheduling kworker only. For other usage, please call this
3684  * helper for checking if one CPU belongs to the specified hctx
3685  */
3686 static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
3687                 const struct blk_mq_hw_ctx *hctx)
3688 {
3689         struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue,
3690                         hctx->type, cpu);
3691
3692         return mapped_hctx == hctx;
3693 }
3694
3695 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
3696 {
3697         struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
3698                         struct blk_mq_hw_ctx, cpuhp_online);
3699
3700         if (blk_mq_cpu_mapped_to_hctx(cpu, hctx))
3701                 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
3702         return 0;
3703 }
3704
3705 /*
3706  * 'cpu' is going away. splice any existing rq_list entries from this
3707  * software queue to the hw queue dispatch list, and ensure that it
3708  * gets run.
3709  */
3710 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
3711 {
3712         struct blk_mq_hw_ctx *hctx;
3713         struct blk_mq_ctx *ctx;
3714         LIST_HEAD(tmp);
3715         enum hctx_type type;
3716
3717         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
3718         if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
3719                 return 0;
3720
3721         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
3722         type = hctx->type;
3723
3724         spin_lock(&ctx->lock);
3725         if (!list_empty(&ctx->rq_lists[type])) {
3726                 list_splice_init(&ctx->rq_lists[type], &tmp);
3727                 blk_mq_hctx_clear_pending(hctx, ctx);
3728         }
3729         spin_unlock(&ctx->lock);
3730
3731         if (list_empty(&tmp))
3732                 return 0;
3733
3734         spin_lock(&hctx->lock);
3735         list_splice_tail_init(&tmp, &hctx->dispatch);
3736         spin_unlock(&hctx->lock);
3737
3738         blk_mq_run_hw_queue(hctx, true);
3739         return 0;
3740 }
3741
3742 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
3743 {
3744         if (!(hctx->flags & BLK_MQ_F_STACKING))
3745                 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
3746                                                     &hctx->cpuhp_online);
3747         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
3748                                             &hctx->cpuhp_dead);
3749 }
3750
3751 /*
3752  * Before freeing hw queue, clearing the flush request reference in
3753  * tags->rqs[] for avoiding potential UAF.
3754  */
3755 static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
3756                 unsigned int queue_depth, struct request *flush_rq)
3757 {
3758         int i;
3759         unsigned long flags;
3760
3761         /* The hw queue may not be mapped yet */
3762         if (!tags)
3763                 return;
3764
3765         WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
3766
3767         for (i = 0; i < queue_depth; i++)
3768                 cmpxchg(&tags->rqs[i], flush_rq, NULL);
3769
3770         /*
3771          * Wait until all pending iteration is done.
3772          *
3773          * Request reference is cleared and it is guaranteed to be observed
3774          * after the ->lock is released.
3775          */
3776         spin_lock_irqsave(&tags->lock, flags);
3777         spin_unlock_irqrestore(&tags->lock, flags);
3778 }
3779
/*
 * Tear down hardware queue @hctx (index @hctx_idx) of @q.
 *
 * Runs the driver's optional ->exit_request() for the flush request and
 * ->exit_hctx() for the queue, removes the CPU hotplug instances, erases
 * the hctx from q->hctx_table and parks it on q->unused_hctx_list.  The
 * hctx itself is not freed here.
 *
 * hctx->ctxs will be freed in queue's release handler
 */
static void blk_mq_exit_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct request *flush_rq = hctx->fq->flush_rq;

        if (blk_mq_hw_queue_mapped(hctx))
                blk_mq_tag_idle(hctx);

        /* Drop stale tags->rqs[] pointers to the flush request first. */
        if (blk_queue_init_done(q))
                blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
                                set->queue_depth, flush_rq);
        if (set->ops->exit_request)
                set->ops->exit_request(set, flush_rq, hctx_idx);

        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);

        blk_mq_remove_cpuhp(hctx);

        xa_erase(&q->hctx_table, hctx_idx);

        /* Keep the hctx around on the queue's unused list. */
        spin_lock(&q->unused_hctx_lock);
        list_add(&hctx->hctx_list, &q->unused_hctx_list);
        spin_unlock(&q->unused_hctx_lock);
}
3807
3808 static void blk_mq_exit_hw_queues(struct request_queue *q,
3809                 struct blk_mq_tag_set *set, int nr_queue)
3810 {
3811         struct blk_mq_hw_ctx *hctx;
3812         unsigned long i;
3813
3814         queue_for_each_hw_ctx(q, hctx, i) {
3815                 if (i == nr_queue)
3816                         break;
3817                 blk_mq_exit_hctx(q, set, hctx, i);
3818         }
3819 }
3820
/*
 * Bring up hardware queue @hctx as index @hctx_idx of @q.
 *
 * Registers the CPU hotplug instances, attaches set->tags[hctx_idx], runs
 * the driver's optional ->init_hctx(), initialises the flush request and
 * inserts the hctx into q->hctx_table.
 *
 * Returns 0 on success and -1 on failure; on failure the steps already
 * taken are undone in reverse order.
 */
static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
        hctx->queue_num = hctx_idx;

        /* Mirrors blk_mq_remove_cpuhp(): no ONLINE instance when stacking. */
        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                &hctx->cpuhp_online);
        cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

        hctx->tags = set->tags[hctx_idx];

        if (set->ops->init_hctx &&
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto unregister_cpu_notifier;

        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
                                hctx->numa_node))
                goto exit_hctx;

        if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
                goto exit_flush_rq;

        return 0;

 /* Unwind ladder: each label undoes one step, then falls through. */
 exit_flush_rq:
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
 exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 unregister_cpu_notifier:
        blk_mq_remove_cpuhp(hctx);
        return -1;
}
3857
/*
 * Allocate and set up a hardware queue context for @q.
 *
 * Allocates the hctx, its cpumask, the ctxs[] array (one slot per
 * possible CPU id), the ctx bitmap and the flush queue, and initialises
 * the locks, lists, work item and kobject.  @node is used for the hctx
 * and cpumask allocations as passed in; NUMA_NO_NODE is replaced by
 * set->numa_node for everything that follows.
 *
 * Returns the new hctx, or NULL if any allocation fails (earlier
 * allocations are released).
 */
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
                int node)
{
        struct blk_mq_hw_ctx *hctx;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

        hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
        if (!hctx)
                goto fail_alloc_hctx;

        if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                goto free_hctx;

        atomic_set(&hctx->nr_active, 0);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
        hctx->numa_node = node;

        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
        INIT_LIST_HEAD(&hctx->dispatch);
        hctx->queue = q;
        /* The shared bit is tracked per hctx, so it is not inherited here. */
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;

        INIT_LIST_HEAD(&hctx->hctx_list);

        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
                        gfp, node);
        if (!hctx->ctxs)
                goto free_cpumask;

        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
                                gfp, node, false, false))
                goto free_ctxs;
        hctx->nr_ctx = 0;

        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

        hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
        if (!hctx->fq)
                goto free_bitmap;

        blk_mq_hctx_kobj_init(hctx);

        return hctx;

 /* Unwind ladder: each label frees one allocation, then falls through. */
 free_bitmap:
        sbitmap_free(&hctx->ctx_map);
 free_ctxs:
        kfree(hctx->ctxs);
 free_cpumask:
        free_cpumask_var(hctx->cpumask);
 free_hctx:
        kfree(hctx);
 fail_alloc_hctx:
        return NULL;
}
3922
3923 static void blk_mq_init_cpu_queues(struct request_queue *q,
3924                                    unsigned int nr_hw_queues)
3925 {
3926         struct blk_mq_tag_set *set = q->tag_set;
3927         unsigned int i, j;
3928
3929         for_each_possible_cpu(i) {
3930                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
3931                 struct blk_mq_hw_ctx *hctx;
3932                 int k;
3933
3934                 __ctx->cpu = i;
3935                 spin_lock_init(&__ctx->lock);
3936                 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
3937                         INIT_LIST_HEAD(&__ctx->rq_lists[k]);
3938
3939                 __ctx->queue = q;
3940
3941                 /*
3942                  * Set local node, IFF we have more than one hw queue. If
3943                  * not, we remain on the home node of the device
3944                  */
3945                 for (j = 0; j < set->nr_maps; j++) {
3946                         hctx = blk_mq_map_queue_type(q, j, i);
3947                         if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
3948                                 hctx->numa_node = cpu_to_node(i);
3949                 }
3950         }
3951 }
3952
3953 struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3954                                              unsigned int hctx_idx,
3955                                              unsigned int depth)
3956 {
3957         struct blk_mq_tags *tags;
3958         int ret;
3959
3960         tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
3961         if (!tags)
3962                 return NULL;
3963
3964         ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
3965         if (ret) {
3966                 blk_mq_free_rq_map(tags);
3967                 return NULL;
3968         }
3969
3970         return tags;
3971 }
3972
3973 static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
3974                                        int hctx_idx)
3975 {
3976         if (blk_mq_is_shared_tags(set->flags)) {
3977                 set->tags[hctx_idx] = set->shared_tags;
3978
3979                 return true;
3980         }
3981
3982         set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
3983                                                        set->queue_depth);
3984
3985         return set->tags[hctx_idx];
3986 }
3987
/*
 * Free the requests of @tags and then the tag map itself.  A NULL @tags
 * is accepted and ignored.
 */
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                             struct blk_mq_tags *tags,
                             unsigned int hctx_idx)
{
        if (!tags)
                return;

        blk_mq_free_rqs(set, tags, hctx_idx);
        blk_mq_free_rq_map(tags);
}
3997
3998 static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
3999                                       unsigned int hctx_idx)
4000 {
4001         if (!blk_mq_is_shared_tags(set->flags))
4002                 blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
4003
4004         set->tags[hctx_idx] = NULL;
4005 }
4006
/*
 * (Re)build the mapping between the per-CPU software queues and the
 * hardware queues of @q from the tag set's queue maps.
 *
 * Three passes:
 *  1. reset cpumask, nr_ctx and dispatch_from of every hctx;
 *  2. for every possible CPU and every map type, resolve the hctx
 *     (allocating its tags on demand, falling back to hctx 0 if that
 *     fails) and record the ctx <-> hctx links;
 *  3. for every hctx, either release it (no software queues mapped) or
 *     finish its setup: tags pointer, ctx_map size, cpumask without
 *     isolated CPUs, and round-robin state.
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
        unsigned int j, hctx_idx;
        unsigned long i;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;

        queue_for_each_hw_ctx(q, hctx, i) {
                cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
                hctx->dispatch_from = NULL;
        }

        /*
         * Map software to hardware queues.
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {

                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
                        /* A map without queues borrows the default type. */
                        if (!set->map[j].nr_queues) {
                                ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
                            !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fails for some hctx,
                                 * that hctx won't be brought online.  In this
                                 * case, remap the current ctx to hctx[0] which
                                 * is guaranteed to always have tags allocated
                                 */
                                set->map[j].mq_map[i] = 0;
                        }

                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         * devices share queues across queue maps.
                         */
                        if (cpumask_test_cpu(i, hctx->cpumask))
                                continue;

                        cpumask_set_cpu(i, hctx->cpumask);
                        hctx->type = j;
                        /* Record the ctx's slot in hctx->ctxs[] for this type. */
                        ctx->index_hw[hctx->type] = hctx->nr_ctx;
                        hctx->ctxs[hctx->nr_ctx++] = ctx;

                        /*
                         * If the nr_ctx type overflows, we have exceeded the
                         * amount of sw queues we can support.
                         */
                        BUG_ON(!hctx->nr_ctx);
                }

                /* Types beyond set->nr_maps all use the default type's hctx. */
                for (; j < HCTX_MAX_TYPES; j++)
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
        }

        queue_for_each_hw_ctx(q, hctx, i) {
                int cpu;

                /*
                 * If no software queues are mapped to this hardware queue,
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
                        /*
                         * Never unmap queue 0.  We need it as a fallback
                         * in case a new remap fails allocation.
                         */
                        if (i)
                                __blk_mq_free_map_and_rqs(set, i);

                        hctx->tags = NULL;
                        continue;
                }

                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);

                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
                 * over all possibly mapped software queues.
                 */
                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

                /*
                 * Rule out isolated CPUs from hctx->cpumask to avoid
                 * running block kworker on isolated CPUs
                 */
                for_each_cpu(cpu, hctx->cpumask) {
                        if (cpu_is_isolated(cpu))
                                cpumask_clear_cpu(cpu, hctx->cpumask);
                }

                /*
                 * Initialize batch roundrobin counts
                 */
                hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
}
4120
4121 /*
4122  * Caller needs to ensure that we're either frozen/quiesced, or that
4123  * the queue isn't live yet.
4124  */
4125 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
4126 {
4127         struct blk_mq_hw_ctx *hctx;
4128         unsigned long i;
4129
4130         queue_for_each_hw_ctx(q, hctx, i) {
4131                 if (shared) {
4132                         hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
4133                 } else {
4134                         blk_mq_tag_idle(hctx);
4135                         hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
4136                 }
4137         }
4138 }
4139
4140 static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
4141                                          bool shared)
4142 {
4143         struct request_queue *q;
4144
4145         lockdep_assert_held(&set->tag_list_lock);
4146
4147         list_for_each_entry(q, &set->tag_list, tag_set_list) {
4148                 blk_mq_freeze_queue(q);
4149                 queue_set_hctx_shared(q, shared);
4150                 blk_mq_unfreeze_queue(q);
4151         }
4152 }
4153
4154 static void blk_mq_del_queue_tag_set(struct request_queue *q)
4155 {
4156         struct blk_mq_tag_set *set = q->tag_set;
4157
4158         mutex_lock(&set->tag_list_lock);
4159         list_del(&q->tag_set_list);
4160         if (list_is_singular(&set->tag_list)) {
4161                 /* just transitioned to unshared */
4162                 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
4163                 /* update existing queue */
4164                 blk_mq_update_tag_set_shared(set, false);
4165         }
4166         mutex_unlock(&set->tag_list_lock);
4167         INIT_LIST_HEAD(&q->tag_set_list);
4168 }
4169
4170 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
4171                                      struct request_queue *q)
4172 {
4173         mutex_lock(&set->tag_list_lock);
4174
4175         /*
4176          * Check to see if we're transitioning to shared (from 1 to 2 queues).
4177          */
4178         if (!list_empty(&set->tag_list) &&
4179             !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
4180                 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
4181                 /* update existing queue */
4182                 blk_mq_update_tag_set_shared(set, true);
4183         }
4184         if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
4185                 queue_set_hctx_shared(q, true);
4186         list_add_tail(&q->tag_set_list, &set->tag_list);
4187
4188         mutex_unlock(&set->tag_list_lock);
4189 }
4190
4191 /* All allocations will be freed in release handler of q->mq_kobj */
4192 static int blk_mq_alloc_ctxs(struct request_queue *q)
4193 {
4194         struct blk_mq_ctxs *ctxs;
4195         int cpu;
4196
4197         ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
4198         if (!ctxs)
4199                 return -ENOMEM;
4200
4201         ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
4202         if (!ctxs->queue_ctx)
4203                 goto fail;
4204
4205         for_each_possible_cpu(cpu) {
4206                 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
4207                 ctx->ctxs = ctxs;
4208         }
4209
4210         q->mq_kobj = &ctxs->kobj;
4211         q->queue_ctx = ctxs->queue_ctx;
4212
4213         return 0;
4214  fail:
4215         kfree(ctxs);
4216         return -ENOMEM;
4217 }
4218
4219 /*
4220  * It is the actual release handler for mq, but we do it from
4221  * request queue's release handler for avoiding use-after-free
4222  * and headache because q->mq_kobj shouldn't have been introduced,
4223  * but we can't group ctx/kctx kobj without it.
4224  */
4225 void blk_mq_release(struct request_queue *q)
4226 {
4227         struct blk_mq_hw_ctx *hctx, *next;
4228         unsigned long i;
4229
4230         queue_for_each_hw_ctx(q, hctx, i)
4231                 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
4232
4233         /* all hctx are in .unused_hctx_list now */
4234         list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
4235                 list_del_init(&hctx->hctx_list);
4236                 kobject_put(&hctx->kobj);
4237         }
4238
4239         xa_destroy(&q->hctx_table);
4240
4241         /*
4242          * release .mq_kobj and sw queue's kobject now because
4243          * both share lifetime with request queue.
4244          */
4245         blk_mq_sysfs_deinit(q);
4246 }
4247
4248 static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
4249 {
4250         return set->nr_maps > HCTX_TYPE_POLL &&
4251                 set->map[HCTX_TYPE_POLL].nr_queues;
4252 }
4253
4254 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
4255                 struct queue_limits *lim, void *queuedata)
4256 {
4257         struct queue_limits default_lim = { };
4258         struct request_queue *q;
4259         int ret;
4260
4261         if (!lim)
4262                 lim = &default_lim;
4263         lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
4264         if (blk_mq_can_poll(set))
4265                 lim->features |= BLK_FEAT_POLL;
4266
4267         q = blk_alloc_queue(lim, set->numa_node);
4268         if (IS_ERR(q))
4269                 return q;
4270         q->queuedata = queuedata;
4271         ret = blk_mq_init_allocated_queue(set, q);
4272         if (ret) {
4273                 blk_put_queue(q);
4274                 return ERR_PTR(ret);
4275         }
4276         return q;
4277 }
4278 EXPORT_SYMBOL(blk_mq_alloc_queue);
4279
4280 /**
4281  * blk_mq_destroy_queue - shutdown a request queue
4282  * @q: request queue to shutdown
4283  *
4284  * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
4285  * requests will be failed with -ENODEV. The caller is responsible for dropping
4286  * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
4287  *
4288  * Context: can sleep
4289  */
4290 void blk_mq_destroy_queue(struct request_queue *q)
4291 {
4292         WARN_ON_ONCE(!queue_is_mq(q));
4293         WARN_ON_ONCE(blk_queue_registered(q));
4294
4295         might_sleep();
4296
4297         blk_queue_flag_set(QUEUE_FLAG_DYING, q);
4298         blk_queue_start_drain(q);
4299         blk_mq_freeze_queue_wait(q);
4300
4301         blk_sync_queue(q);
4302         blk_mq_cancel_work_sync(q);
4303         blk_mq_exit_queue(q);
4304 }
4305 EXPORT_SYMBOL(blk_mq_destroy_queue);
4306
4307 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
4308                 struct queue_limits *lim, void *queuedata,
4309                 struct lock_class_key *lkclass)
4310 {
4311         struct request_queue *q;
4312         struct gendisk *disk;
4313
4314         q = blk_mq_alloc_queue(set, lim, queuedata);
4315         if (IS_ERR(q))
4316                 return ERR_CAST(q);
4317
4318         disk = __alloc_disk_node(q, set->numa_node, lkclass);
4319         if (!disk) {
4320                 blk_mq_destroy_queue(q);
4321                 blk_put_queue(q);
4322                 return ERR_PTR(-ENOMEM);
4323         }
4324         set_bit(GD_OWNS_QUEUE, &disk->state);
4325         return disk;
4326 }
4327 EXPORT_SYMBOL(__blk_mq_alloc_disk);
4328
4329 struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
4330                 struct lock_class_key *lkclass)
4331 {
4332         struct gendisk *disk;
4333
4334         if (!blk_get_queue(q))
4335                 return NULL;
4336         disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
4337         if (!disk)
4338                 blk_put_queue(q);
4339         return disk;
4340 }
4341 EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
4342
4343 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
4344                 struct blk_mq_tag_set *set, struct request_queue *q,
4345                 int hctx_idx, int node)
4346 {
4347         struct blk_mq_hw_ctx *hctx = NULL, *tmp;
4348
4349         /* reuse dead hctx first */
4350         spin_lock(&q->unused_hctx_lock);
4351         list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
4352                 if (tmp->numa_node == node) {
4353                         hctx = tmp;
4354                         break;
4355                 }
4356         }
4357         if (hctx)
4358                 list_del_init(&hctx->hctx_list);
4359         spin_unlock(&q->unused_hctx_lock);
4360
4361         if (!hctx)
4362                 hctx = blk_mq_alloc_hctx(q, set, node);
4363         if (!hctx)
4364                 goto fail;
4365
4366         if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
4367                 goto free_hctx;
4368
4369         return hctx;
4370
4371  free_hctx:
4372         kobject_put(&hctx->kobj);
4373  fail:
4374         return NULL;
4375 }
4376
4377 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
4378                                                 struct request_queue *q)
4379 {
4380         struct blk_mq_hw_ctx *hctx;
4381         unsigned long i, j;
4382
4383         /* protect against switching io scheduler  */
4384         mutex_lock(&q->sysfs_lock);
4385         for (i = 0; i < set->nr_hw_queues; i++) {
4386                 int old_node;
4387                 int node = blk_mq_get_hctx_node(set, i);
4388                 struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
4389
4390                 if (old_hctx) {
4391                         old_node = old_hctx->numa_node;
4392                         blk_mq_exit_hctx(q, set, old_hctx, i);
4393                 }
4394
4395                 if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
4396                         if (!old_hctx)
4397                                 break;
4398                         pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
4399                                         node, old_node);
4400                         hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
4401                         WARN_ON_ONCE(!hctx);
4402                 }
4403         }
4404         /*
4405          * Increasing nr_hw_queues fails. Free the newly allocated
4406          * hctxs and keep the previous q->nr_hw_queues.
4407          */
4408         if (i != set->nr_hw_queues) {
4409                 j = q->nr_hw_queues;
4410         } else {
4411                 j = i;
4412                 q->nr_hw_queues = set->nr_hw_queues;
4413         }
4414
4415         xa_for_each_start(&q->hctx_table, j, hctx, j)
4416                 blk_mq_exit_hctx(q, set, hctx, j);
4417         mutex_unlock(&q->sysfs_lock);
4418 }
4419
4420 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
4421                 struct request_queue *q)
4422 {
4423         /* mark the queue as mq asap */
4424         q->mq_ops = set->ops;
4425
4426         /*
4427          * ->tag_set has to be setup before initialize hctx, which cpuphp
4428          * handler needs it for checking queue mapping
4429          */
4430         q->tag_set = set;
4431
4432         if (blk_mq_alloc_ctxs(q))
4433                 goto err_exit;
4434
4435         /* init q->mq_kobj and sw queues' kobjects */
4436         blk_mq_sysfs_init(q);
4437
4438         INIT_LIST_HEAD(&q->unused_hctx_list);
4439         spin_lock_init(&q->unused_hctx_lock);
4440
4441         xa_init(&q->hctx_table);
4442
4443         blk_mq_realloc_hw_ctxs(set, q);
4444         if (!q->nr_hw_queues)
4445                 goto err_hctxs;
4446
4447         INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
4448         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
4449
4450         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
4451
4452         INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
4453         INIT_LIST_HEAD(&q->flush_list);
4454         INIT_LIST_HEAD(&q->requeue_list);
4455         spin_lock_init(&q->requeue_lock);
4456
4457         q->nr_requests = set->queue_depth;
4458
4459         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
4460         blk_mq_add_queue_tag_set(set, q);
4461         blk_mq_map_swqueue(q);
4462         return 0;
4463
4464 err_hctxs:
4465         blk_mq_release(q);
4466 err_exit:
4467         q->mq_ops = NULL;
4468         return -ENOMEM;
4469 }
4470 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
4471
4472 /* tags can _not_ be used after returning from blk_mq_exit_queue */
4473 void blk_mq_exit_queue(struct request_queue *q)
4474 {
4475         struct blk_mq_tag_set *set = q->tag_set;
4476
4477         /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
4478         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
4479         /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
4480         blk_mq_del_queue_tag_set(q);
4481 }
4482
4483 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
4484 {
4485         int i;
4486
4487         if (blk_mq_is_shared_tags(set->flags)) {
4488                 set->shared_tags = blk_mq_alloc_map_and_rqs(set,
4489                                                 BLK_MQ_NO_HCTX_IDX,
4490                                                 set->queue_depth);
4491                 if (!set->shared_tags)
4492                         return -ENOMEM;
4493         }
4494
4495         for (i = 0; i < set->nr_hw_queues; i++) {
4496                 if (!__blk_mq_alloc_map_and_rqs(set, i))
4497                         goto out_unwind;
4498                 cond_resched();
4499         }
4500
4501         return 0;
4502
4503 out_unwind:
4504         while (--i >= 0)
4505                 __blk_mq_free_map_and_rqs(set, i);
4506
4507         if (blk_mq_is_shared_tags(set->flags)) {
4508                 blk_mq_free_map_and_rqs(set, set->shared_tags,
4509                                         BLK_MQ_NO_HCTX_IDX);
4510         }
4511
4512         return -ENOMEM;
4513 }
4514
4515 /*
4516  * Allocate the request maps associated with this tag_set. Note that this
4517  * may reduce the depth asked for, if memory is tight. set->queue_depth
4518  * will be updated to reflect the allocated depth.
4519  */
4520 static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
4521 {
4522         unsigned int depth;
4523         int err;
4524
4525         depth = set->queue_depth;
4526         do {
4527                 err = __blk_mq_alloc_rq_maps(set);
4528                 if (!err)
4529                         break;
4530
4531                 set->queue_depth >>= 1;
4532                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
4533                         err = -ENOMEM;
4534                         break;
4535                 }
4536         } while (set->queue_depth);
4537
4538         if (!set->queue_depth || err) {
4539                 pr_err("blk-mq: failed to allocate request map\n");
4540                 return -ENOMEM;
4541         }
4542
4543         if (depth != set->queue_depth)
4544                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
4545                                                 depth, set->queue_depth);
4546
4547         return 0;
4548 }
4549
4550 static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
4551 {
4552         /*
4553          * blk_mq_map_queues() and multiple .map_queues() implementations
4554          * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
4555          * number of hardware queues.
4556          */
4557         if (set->nr_maps == 1)
4558                 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
4559
4560         if (set->ops->map_queues) {
4561                 int i;
4562
4563                 /*
4564                  * transport .map_queues is usually done in the following
4565                  * way:
4566                  *
4567                  * for (queue = 0; queue < set->nr_hw_queues; queue++) {
4568                  *      mask = get_cpu_mask(queue)
4569                  *      for_each_cpu(cpu, mask)
4570                  *              set->map[x].mq_map[cpu] = queue;
4571                  * }
4572                  *
4573                  * When we need to remap, the table has to be cleared for
4574                  * killing stale mapping since one CPU may not be mapped
4575                  * to any hw queue.
4576                  */
4577                 for (i = 0; i < set->nr_maps; i++)
4578                         blk_mq_clear_mq_map(&set->map[i]);
4579
4580                 set->ops->map_queues(set);
4581         } else {
4582                 BUG_ON(set->nr_maps > 1);
4583                 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
4584         }
4585 }
4586
4587 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
4588                                        int new_nr_hw_queues)
4589 {
4590         struct blk_mq_tags **new_tags;
4591         int i;
4592
4593         if (set->nr_hw_queues >= new_nr_hw_queues)
4594                 goto done;
4595
4596         new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
4597                                 GFP_KERNEL, set->numa_node);
4598         if (!new_tags)
4599                 return -ENOMEM;
4600
4601         if (set->tags)
4602                 memcpy(new_tags, set->tags, set->nr_hw_queues *
4603                        sizeof(*set->tags));
4604         kfree(set->tags);
4605         set->tags = new_tags;
4606
4607         for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
4608                 if (!__blk_mq_alloc_map_and_rqs(set, i)) {
4609                         while (--i >= set->nr_hw_queues)
4610                                 __blk_mq_free_map_and_rqs(set, i);
4611                         return -ENOMEM;
4612                 }
4613                 cond_resched();
4614         }
4615
4616 done:
4617         set->nr_hw_queues = new_nr_hw_queues;
4618         return 0;
4619 }
4620
4621 /*
4622  * Alloc a tag set to be associated with one or more request queues.
4623  * May fail with EINVAL for various error conditions. May adjust the
4624  * requested depth down, if it's too large. In that case, the set
4625  * value will be stored in set->queue_depth.
4626  */
4627 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
4628 {
4629         int i, ret;
4630
4631         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
4632
4633         if (!set->nr_hw_queues)
4634                 return -EINVAL;
4635         if (!set->queue_depth)
4636                 return -EINVAL;
4637         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
4638                 return -EINVAL;
4639
4640         if (!set->ops->queue_rq)
4641                 return -EINVAL;
4642
4643         if (!set->ops->get_budget ^ !set->ops->put_budget)
4644                 return -EINVAL;
4645
4646         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
4647                 pr_info("blk-mq: reduced tag depth to %u\n",
4648                         BLK_MQ_MAX_DEPTH);
4649                 set->queue_depth = BLK_MQ_MAX_DEPTH;
4650         }
4651
4652         if (!set->nr_maps)
4653                 set->nr_maps = 1;
4654         else if (set->nr_maps > HCTX_MAX_TYPES)
4655                 return -EINVAL;
4656
4657         /*
4658          * If a crashdump is active, then we are potentially in a very
4659          * memory constrained environment. Limit us to  64 tags to prevent
4660          * using too much memory.
4661          */
4662         if (is_kdump_kernel())
4663                 set->queue_depth = min(64U, set->queue_depth);
4664
4665         /*
4666          * There is no use for more h/w queues than cpus if we just have
4667          * a single map
4668          */
4669         if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
4670                 set->nr_hw_queues = nr_cpu_ids;
4671
4672         if (set->flags & BLK_MQ_F_BLOCKING) {
4673                 set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
4674                 if (!set->srcu)
4675                         return -ENOMEM;
4676                 ret = init_srcu_struct(set->srcu);
4677                 if (ret)
4678                         goto out_free_srcu;
4679         }
4680
4681         ret = -ENOMEM;
4682         set->tags = kcalloc_node(set->nr_hw_queues,
4683                                  sizeof(struct blk_mq_tags *), GFP_KERNEL,
4684                                  set->numa_node);
4685         if (!set->tags)
4686                 goto out_cleanup_srcu;
4687
4688         for (i = 0; i < set->nr_maps; i++) {
4689                 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
4690                                                   sizeof(set->map[i].mq_map[0]),
4691                                                   GFP_KERNEL, set->numa_node);
4692                 if (!set->map[i].mq_map)
4693                         goto out_free_mq_map;
4694                 set->map[i].nr_queues = set->nr_hw_queues;
4695         }
4696
4697         blk_mq_update_queue_map(set);
4698
4699         ret = blk_mq_alloc_set_map_and_rqs(set);
4700         if (ret)
4701                 goto out_free_mq_map;
4702
4703         mutex_init(&set->tag_list_lock);
4704         INIT_LIST_HEAD(&set->tag_list);
4705
4706         return 0;
4707
4708 out_free_mq_map:
4709         for (i = 0; i < set->nr_maps; i++) {
4710                 kfree(set->map[i].mq_map);
4711                 set->map[i].mq_map = NULL;
4712         }
4713         kfree(set->tags);
4714         set->tags = NULL;
4715 out_cleanup_srcu:
4716         if (set->flags & BLK_MQ_F_BLOCKING)
4717                 cleanup_srcu_struct(set->srcu);
4718 out_free_srcu:
4719         if (set->flags & BLK_MQ_F_BLOCKING)
4720                 kfree(set->srcu);
4721         return ret;
4722 }
4723 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
4724
4725 /* allocate and initialize a tagset for a simple single-queue device */
4726 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
4727                 const struct blk_mq_ops *ops, unsigned int queue_depth,
4728                 unsigned int set_flags)
4729 {
4730         memset(set, 0, sizeof(*set));
4731         set->ops = ops;
4732         set->nr_hw_queues = 1;
4733         set->nr_maps = 1;
4734         set->queue_depth = queue_depth;
4735         set->numa_node = NUMA_NO_NODE;
4736         set->flags = set_flags;
4737         return blk_mq_alloc_tag_set(set);
4738 }
4739 EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
4740
4741 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
4742 {
4743         int i, j;
4744
4745         for (i = 0; i < set->nr_hw_queues; i++)
4746                 __blk_mq_free_map_and_rqs(set, i);
4747
4748         if (blk_mq_is_shared_tags(set->flags)) {
4749                 blk_mq_free_map_and_rqs(set, set->shared_tags,
4750                                         BLK_MQ_NO_HCTX_IDX);
4751         }
4752
4753         for (j = 0; j < set->nr_maps; j++) {
4754                 kfree(set->map[j].mq_map);
4755                 set->map[j].mq_map = NULL;
4756         }
4757
4758         kfree(set->tags);
4759         set->tags = NULL;
4760         if (set->flags & BLK_MQ_F_BLOCKING) {
4761                 cleanup_srcu_struct(set->srcu);
4762                 kfree(set->srcu);
4763         }
4764 }
4765 EXPORT_SYMBOL(blk_mq_free_tag_set);
4766
4767 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
4768 {
4769         struct blk_mq_tag_set *set = q->tag_set;
4770         struct blk_mq_hw_ctx *hctx;
4771         int ret;
4772         unsigned long i;
4773
4774         if (WARN_ON_ONCE(!q->mq_freeze_depth))
4775                 return -EINVAL;
4776
4777         if (!set)
4778                 return -EINVAL;
4779
4780         if (q->nr_requests == nr)
4781                 return 0;
4782
4783         blk_mq_quiesce_queue(q);
4784
4785         ret = 0;
4786         queue_for_each_hw_ctx(q, hctx, i) {
4787                 if (!hctx->tags)
4788                         continue;
4789                 /*
4790                  * If we're using an MQ scheduler, just update the scheduler
4791                  * queue depth. This is similar to what the old code would do.
4792                  */
4793                 if (hctx->sched_tags) {
4794                         ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
4795                                                       nr, true);
4796                 } else {
4797                         ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
4798                                                       false);
4799                 }
4800                 if (ret)
4801                         break;
4802                 if (q->elevator && q->elevator->type->ops.depth_updated)
4803                         q->elevator->type->ops.depth_updated(hctx);
4804         }
4805         if (!ret) {
4806                 q->nr_requests = nr;
4807                 if (blk_mq_is_shared_tags(set->flags)) {
4808                         if (q->elevator)
4809                                 blk_mq_tag_update_sched_shared_tags(q);
4810                         else
4811                                 blk_mq_tag_resize_shared_tags(set, nr);
4812                 }
4813         }
4814
4815         blk_mq_unquiesce_queue(q);
4816
4817         return ret;
4818 }
4819
4820 /*
4821  * request_queue and elevator_type pair.
4822  * It is just used by __blk_mq_update_nr_hw_queues to cache
4823  * the elevator_type associated with a request_queue.
4824  */
4825 struct blk_mq_qe_pair {
4826         struct list_head node;
4827         struct request_queue *q;
4828         struct elevator_type *type;
4829 };
4830
4831 /*
4832  * Cache the elevator_type in qe pair list and switch the
4833  * io scheduler to 'none'
4834  */
4835 static bool blk_mq_elv_switch_none(struct list_head *head,
4836                 struct request_queue *q)
4837 {
4838         struct blk_mq_qe_pair *qe;
4839
4840         qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
4841         if (!qe)
4842                 return false;
4843
4844         /* q->elevator needs protection from ->sysfs_lock */
4845         mutex_lock(&q->sysfs_lock);
4846
4847         /* the check has to be done with holding sysfs_lock */
4848         if (!q->elevator) {
4849                 kfree(qe);
4850                 goto unlock;
4851         }
4852
4853         INIT_LIST_HEAD(&qe->node);
4854         qe->q = q;
4855         qe->type = q->elevator->type;
4856         /* keep a reference to the elevator module as we'll switch back */
4857         __elevator_get(qe->type);
4858         list_add(&qe->node, head);
4859         elevator_disable(q);
4860 unlock:
4861         mutex_unlock(&q->sysfs_lock);
4862
4863         return true;
4864 }
4865
4866 static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
4867                                                 struct request_queue *q)
4868 {
4869         struct blk_mq_qe_pair *qe;
4870
4871         list_for_each_entry(qe, head, node)
4872                 if (qe->q == q)
4873                         return qe;
4874
4875         return NULL;
4876 }
4877
4878 static void blk_mq_elv_switch_back(struct list_head *head,
4879                                   struct request_queue *q)
4880 {
4881         struct blk_mq_qe_pair *qe;
4882         struct elevator_type *t;
4883
4884         qe = blk_lookup_qe_pair(head, q);
4885         if (!qe)
4886                 return;
4887         t = qe->type;
4888         list_del(&qe->node);
4889         kfree(qe);
4890
4891         mutex_lock(&q->sysfs_lock);
4892         elevator_switch(q, t);
4893         /* drop the reference acquired in blk_mq_elv_switch_none */
4894         elevator_put(t);
4895         mutex_unlock(&q->sysfs_lock);
4896 }
4897
4898 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
4899                                                         int nr_hw_queues)
4900 {
4901         struct request_queue *q;
4902         LIST_HEAD(head);
4903         int prev_nr_hw_queues = set->nr_hw_queues;
4904         int i;
4905
4906         lockdep_assert_held(&set->tag_list_lock);
4907
4908         if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
4909                 nr_hw_queues = nr_cpu_ids;
4910         if (nr_hw_queues < 1)
4911                 return;
4912         if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
4913                 return;
4914
4915         list_for_each_entry(q, &set->tag_list, tag_set_list)
4916                 blk_mq_freeze_queue(q);
4917         /*
4918          * Switch IO scheduler to 'none', cleaning up the data associated
4919          * with the previous scheduler. We will switch back once we are done
4920          * updating the new sw to hw queue mappings.
4921          */
4922         list_for_each_entry(q, &set->tag_list, tag_set_list)
4923                 if (!blk_mq_elv_switch_none(&head, q))
4924                         goto switch_back;
4925
4926         list_for_each_entry(q, &set->tag_list, tag_set_list) {
4927                 blk_mq_debugfs_unregister_hctxs(q);
4928                 blk_mq_sysfs_unregister_hctxs(q);
4929         }
4930
4931         if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
4932                 goto reregister;
4933
4934 fallback:
4935         blk_mq_update_queue_map(set);
4936         list_for_each_entry(q, &set->tag_list, tag_set_list) {
4937                 struct queue_limits lim;
4938
4939                 blk_mq_realloc_hw_ctxs(set, q);
4940
4941                 if (q->nr_hw_queues != set->nr_hw_queues) {
4942                         int i = prev_nr_hw_queues;
4943
4944                         pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
4945                                         nr_hw_queues, prev_nr_hw_queues);
4946                         for (; i < set->nr_hw_queues; i++)
4947                                 __blk_mq_free_map_and_rqs(set, i);
4948
4949                         set->nr_hw_queues = prev_nr_hw_queues;
4950                         goto fallback;
4951                 }
4952                 lim = queue_limits_start_update(q);
4953                 if (blk_mq_can_poll(set))
4954                         lim.features |= BLK_FEAT_POLL;
4955                 else
4956                         lim.features &= ~BLK_FEAT_POLL;
4957                 if (queue_limits_commit_update(q, &lim) < 0)
4958                         pr_warn("updating the poll flag failed\n");
4959                 blk_mq_map_swqueue(q);
4960         }
4961
4962 reregister:
4963         list_for_each_entry(q, &set->tag_list, tag_set_list) {
4964                 blk_mq_sysfs_register_hctxs(q);
4965                 blk_mq_debugfs_register_hctxs(q);
4966         }
4967
4968 switch_back:
4969         list_for_each_entry(q, &set->tag_list, tag_set_list)
4970                 blk_mq_elv_switch_back(&head, q);
4971
4972         list_for_each_entry(q, &set->tag_list, tag_set_list)
4973                 blk_mq_unfreeze_queue(q);
4974
4975         /* Free the excess tags when nr_hw_queues shrink. */
4976         for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
4977                 __blk_mq_free_map_and_rqs(set, i);
4978 }
4979
4980 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
4981 {
4982         mutex_lock(&set->tag_list_lock);
4983         __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
4984         mutex_unlock(&set->tag_list_lock);
4985 }
4986 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
4987
4988 static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
4989                          struct io_comp_batch *iob, unsigned int flags)
4990 {
4991         long state = get_current_state();
4992         int ret;
4993
4994         do {
4995                 ret = q->mq_ops->poll(hctx, iob);
4996                 if (ret > 0) {
4997                         __set_current_state(TASK_RUNNING);
4998                         return ret;
4999                 }
5000
5001                 if (signal_pending_state(state, current))
5002                         __set_current_state(TASK_RUNNING);
5003                 if (task_is_running(current))
5004                         return 1;
5005
5006                 if (ret < 0 || (flags & BLK_POLL_ONESHOT))
5007                         break;
5008                 cpu_relax();
5009         } while (!need_resched());
5010
5011         __set_current_state(TASK_RUNNING);
5012         return 0;
5013 }
5014
5015 int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
5016                 struct io_comp_batch *iob, unsigned int flags)
5017 {
5018         struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
5019
5020         return blk_hctx_poll(q, hctx, iob, flags);
5021 }
5022
5023 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
5024                 unsigned int poll_flags)
5025 {
5026         struct request_queue *q = rq->q;
5027         int ret;
5028
5029         if (!blk_rq_is_poll(rq))
5030                 return 0;
5031         if (!percpu_ref_tryget(&q->q_usage_counter))
5032                 return 0;
5033
5034         ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
5035         blk_queue_exit(q);
5036
5037         return ret;
5038 }
5039 EXPORT_SYMBOL_GPL(blk_rq_poll);
5040
5041 unsigned int blk_mq_rq_cpu(struct request *rq)
5042 {
5043         return rq->mq_ctx->cpu;
5044 }
5045 EXPORT_SYMBOL(blk_mq_rq_cpu);
5046
5047 void blk_mq_cancel_work_sync(struct request_queue *q)
5048 {
5049         struct blk_mq_hw_ctx *hctx;
5050         unsigned long i;
5051
5052         cancel_delayed_work_sync(&q->requeue_work);
5053
5054         queue_for_each_hw_ctx(q, hctx, i)
5055                 cancel_delayed_work_sync(&hctx->run_work);
5056 }
5057
5058 static int __init blk_mq_init(void)
5059 {
5060         int i;
5061
5062         for_each_possible_cpu(i)
5063                 init_llist_head(&per_cpu(blk_cpu_done, i));
5064         for_each_possible_cpu(i)
5065                 INIT_CSD(&per_cpu(blk_cpu_csd, i),
5066                          __blk_mq_complete_request_remote, NULL);
5067         open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
5068
5069         cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
5070                                   "block/softirq:dead", NULL,
5071                                   blk_softirq_cpu_dead);
5072         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
5073                                 blk_mq_hctx_notify_dead);
5074         cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
5075                                 blk_mq_hctx_notify_online,
5076                                 blk_mq_hctx_notify_offline);
5077         return 0;
5078 }
5079 subsys_initcall(blk_mq_init);