// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if the first attempt fails to get a tag, the other shared-tag users can
 * still reserve budget for it.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
		struct request_queue *q = hctx->queue;
		struct blk_mq_tag_set *set = q->tag_set;

		if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
		    !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
			atomic_inc(&set->active_queues_shared_sbitmap);
	} else {
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
		    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			atomic_inc(&hctx->tags->active_queues);
	}

	return true;
}
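
/*
 * Hedged sketch, not code from this file: the counters bumped above feed the
 * fairness check in hctx_may_queue() (blk-mq-tag.h), which caps each active
 * shared-tag user at roughly its share of the bitmap, along the lines of:
 *
 *	users = atomic_read(&hctx->tags->active_queues);
 *	depth = max((bt->sb.depth + users - 1) / users, 4U);
 *	allowed = __blk_mq_active_requests(hctx) < depth;
 *
 * See the in-tree header for the exact helpers and rounding.
 */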

/*
 * Wake up everyone potentially sleeping on tags
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
	sbitmap_queue_wake_all(tags->bitmap_tags);
	if (include_reserve)
		sbitmap_queue_wake_all(tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;
	struct request_queue *q = hctx->queue;
	struct blk_mq_tag_set *set = q->tag_set;

	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
		if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
					&q->queue_flags))
			return;
		atomic_dec(&set->active_queues_shared_sbitmap);
	} else {
		if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			return;
		atomic_dec(&tags->active_queues);
	}

	blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
			    struct sbitmap_queue *bt)
{
	if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
			!hctx_may_queue(data->hctx, bt))
		return BLK_MQ_NO_TAG;

	if (data->shallow_depth)
		return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
	else
		return __sbitmap_queue_get(bt);
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct sbitmap_queue *bt;
	struct sbq_wait_state *ws;
	DEFINE_SBQ_WAIT(wait);
	unsigned int tag_offset;
	int tag;

	if (data->flags & BLK_MQ_REQ_RESERVED) {
		if (unlikely(!tags->nr_reserved_tags)) {
			WARN_ON_ONCE(1);
			return BLK_MQ_NO_TAG;
		}
		bt = tags->breserved_tags;
		tag_offset = 0;
	} else {
		bt = tags->bitmap_tags;
		tag_offset = tags->nr_reserved_tags;
	}

	tag = __blk_mq_get_tag(data, bt);
	if (tag != BLK_MQ_NO_TAG)
		goto found_tag;

	if (data->flags & BLK_MQ_REQ_NOWAIT)
		return BLK_MQ_NO_TAG;

	ws = bt_wait_ptr(bt, data->hctx);
	do {
		struct sbitmap_queue *bt_prev;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(data->hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

		tag = __blk_mq_get_tag(data, bt);
		if (tag != BLK_MQ_NO_TAG)
			break;

		bt_prev = bt;
		io_schedule();

		sbitmap_finish_wait(bt, ws, &wait);

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
						data->ctx);
		tags = blk_mq_tags_from_data(data);
		if (data->flags & BLK_MQ_REQ_RESERVED)
			bt = tags->breserved_tags;
		else
			bt = tags->bitmap_tags;

		/*
		 * If the destination hw queue changed, fake a wake-up on the
		 * previous queue to compensate for the missed wake-up, so
		 * other allocations on the previous queue won't be starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);

	sbitmap_finish_wait(bt, ws, &wait);

found_tag:
	/*
	 * Give up this allocation if the hctx is inactive. The caller will
	 * retry on an active hctx.
	 */
	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
		return BLK_MQ_NO_TAG;
	}
	return tag + tag_offset;
}

void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
		    unsigned int tag)
{
	if (!blk_mq_tag_is_reserved(tags, tag)) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
	}
}

struct bt_iter_data {
	struct blk_mq_hw_ctx *hctx;
	busy_iter_fn *fn;
	void *data;
	bool reserved;
};

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_iter_data *iter_data = data;
	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
	struct blk_mq_tags *tags = hctx->tags;
	bool reserved = iter_data->reserved;
	struct request *rq;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;
	rq = tags->rqs[bitnr];

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
		return iter_data->fn(hctx, rq, iter_data->data, reserved);
	return true;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:	Hardware queue to examine.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each request
 *		associated with @hctx that has been assigned a driver tag.
 *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *		where rq is a pointer to a request. Return true to continue
 *		iterating tags, false to stop.
 * @data:	Will be passed as third argument to @fn.
 * @reserved:	Indicates whether @bt is the breserved_tags member or the
 *		bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
			busy_iter_fn *fn, void *data, bool reserved)
{
	struct bt_iter_data iter_data = {
		.hctx = hctx,
		.fn = fn,
		.data = data,
		.reserved = reserved,
	};

	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
	struct blk_mq_tags *tags;
	busy_tag_iter_fn *fn;
	void *data;
	unsigned int flags;
};

#define BT_TAG_ITER_RESERVED		(1 << 0)
#define BT_TAG_ITER_STARTED		(1 << 1)
#define BT_TAG_ITER_STATIC_RQS		(1 << 2)

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
	struct bt_tags_iter_data *iter_data = data;
	struct blk_mq_tags *tags = iter_data->tags;
	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
	struct request *rq;

	if (!reserved)
		bitnr += tags->nr_reserved_tags;

	/*
	 * We can hit rq == NULL here, because the tagging functions
	 * test and set the bit before assigning ->rqs[].
	 */
	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
		rq = tags->static_rqs[bitnr];
	else
		rq = tags->rqs[bitnr];
	if (!rq)
		return true;
	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
	    !blk_mq_request_started(rq))
		return true;
	return iter_data->fn(rq, iter_data->data, reserved);
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:	Tag map to iterate over.
 * @bt:		sbitmap to examine. This is either the breserved_tags member
 *		or the bitmap_tags member of struct blk_mq_tags.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @data,
 *		@reserved) where rq is a pointer to a request. Return true
 *		to continue iterating tags, false to stop.
 * @data:	Will be passed as second argument to @fn.
 * @flags:	BT_TAG_ITER_*
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
			     busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
	struct bt_tags_iter_data iter_data = {
		.tags = tags,
		.fn = fn,
		.data = data,
		.flags = flags,
	};

	if (tags->rqs)
		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
		busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
	WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

	if (tags->nr_reserved_tags)
		bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
				 flags | BT_TAG_ITER_RESERVED);
	bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:	Tag map to iterate over.
 * @fn:		Pointer to the function that will be called for each
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 *
 * Caller has to pass the tag map from which requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv)
{
	__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:	Tag set to iterate over.
 * @fn:		Pointer to the function that will be called for each started
 *		request. @fn will be called as follows: @fn(rq, @priv,
 *		reserved) where rq is a pointer to a request. 'reserved'
 *		indicates whether or not @rq is a reserved request. Return
 *		true to continue iterating tags, false to stop.
 * @priv:	Will be passed as second argument to @fn.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv)
{
	int i;

	for (i = 0; i < tagset->nr_hw_queues; i++) {
		if (tagset->tags && tagset->tags[i])
			__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
					      BT_TAG_ITER_STARTED);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
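
/*
 * Illustrative usage sketch (hypothetical driver code, not an in-tree
 * caller): counting in-flight requests with a busy_tag_iter_fn callback;
 * returning true from the callback keeps the iteration going.
 *
 *	static bool my_count_inflight(struct request *rq, void *data,
 *				      bool reserved)
 *	{
 *		unsigned int *inflight = data;
 *
 *		(*inflight)++;
 *		return true;
 *	}
 *
 *	unsigned int inflight = 0;
 *
 *	blk_mq_tagset_busy_iter(&my_tag_set, my_count_inflight, &inflight);
 */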

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
		void *data, bool reserved)
{
	unsigned *count = data;

	if (blk_mq_request_completed(rq))
		(*count)++;

	return true;
}

/**
 * blk_mq_tagset_wait_completed_request - wait until the completion function
 * of every completed request has run
 * @tagset:	Tag set to drain completed requests from
 *
 * Note: This function has to be run after all IO queues are shut down.
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
	while (true) {
		unsigned count = 0;

		blk_mq_tagset_busy_iter(tagset,
				blk_mq_tagset_count_completed_rqs, &count);
		if (!count)
			break;
		msleep(5);
	}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
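
/*
 * Illustrative usage sketch (hypothetical driver code, not taken from any
 * in-tree driver): on controller shutdown, a driver typically cancels all
 * outstanding requests first and then waits for their completion handlers
 * to finish:
 *
 *	blk_mq_tagset_busy_iter(&my_tag_set, my_cancel_request, my_ctrl);
 *	blk_mq_tagset_wait_completed_request(&my_tag_set);
 *
 * my_cancel_request() and my_ctrl are placeholders for driver-specific code.
 */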

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:		Request queue to examine.
 * @fn:		Pointer to the function that will be called for each request
 *		on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *		reserved) where rq is a pointer to a request and hctx points
 *		to the hardware queue associated with the request. 'reserved'
 *		indicates whether or not @rq is a reserved request.
 * @priv:	Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	/*
	 * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
	 * while the queue is frozen. So we can use q_usage_counter to avoid
	 * racing with it.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	queue_for_each_hw_ctx(q, hctx, i) {
		struct blk_mq_tags *tags = hctx->tags;

		/*
		 * If no software queues are currently mapped to this
		 * hardware queue, there's nothing to check
		 */
		if (!blk_mq_hw_queue_mapped(hctx))
			continue;

		if (tags->nr_reserved_tags)
			bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
		bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
	}
	blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
		    bool round_robin, int node)
{
	return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
				       node);
}

static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
				   int node, int alloc_policy)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

	if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
		return -ENOMEM;
	if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
		     round_robin, node))
		goto free_bitmap_tags;

	tags->bitmap_tags = &tags->__bitmap_tags;
	tags->breserved_tags = &tags->__breserved_tags;

	return 0;
free_bitmap_tags:
	sbitmap_queue_free(&tags->__bitmap_tags);
	return -ENOMEM;
}

int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
{
	unsigned int depth = set->queue_depth - set->reserved_tags;
	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
	int i, node = set->numa_node;

	if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
		return -ENOMEM;
	if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
		     round_robin, node))
		goto free_bitmap_tags;

	for (i = 0; i < set->nr_hw_queues; i++) {
		struct blk_mq_tags *tags = set->tags[i];

		tags->bitmap_tags = &set->__bitmap_tags;
		tags->breserved_tags = &set->__breserved_tags;
	}

	return 0;
free_bitmap_tags:
	sbitmap_queue_free(&set->__bitmap_tags);
	return -ENOMEM;
}

void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
	sbitmap_queue_free(&set->__bitmap_tags);
	sbitmap_queue_free(&set->__breserved_tags);
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags,
				     int node, unsigned int flags)
{
	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;

	/* Shared sbitmaps are set up later by blk_mq_init_shared_sbitmap() */
	if (flags & BLK_MQ_F_TAG_HCTX_SHARED)
		return tags;

	if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
		kfree(tags);
		return NULL;
	}
	return tags;
}

void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
	if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
		sbitmap_queue_free(tags->bitmap_tags);
		sbitmap_queue_free(tags->breserved_tags);
	}
	kfree(tags);
}

int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
			    struct blk_mq_tags **tagsptr, unsigned int tdepth,
			    bool can_grow)
{
	struct blk_mq_tags *tags = *tagsptr;

	if (tdepth <= tags->nr_reserved_tags)
		return -EINVAL;

	/*
	 * If we are allowed to grow beyond the original size, allocate
	 * a new set of tags before freeing the old one.
	 */
	if (tdepth > tags->nr_tags) {
		struct blk_mq_tag_set *set = hctx->queue->tag_set;
		/* Only sched tags can grow, so clear HCTX_SHARED flag */
		unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
		struct blk_mq_tags *new;
		bool ret;

		if (!can_grow)
			return -EINVAL;

		/*
		 * We need some sort of upper limit, set it high enough that
		 * no valid use cases should require more.
		 */
		if (tdepth > 16 * BLKDEV_MAX_RQ)
			return -EINVAL;

		new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
				tags->nr_reserved_tags, flags);
		if (!new)
			return -ENOMEM;
		ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
		if (ret) {
			blk_mq_free_rq_map(new, flags);
			return -ENOMEM;
		}

		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
		blk_mq_free_rq_map(*tagsptr, flags);
		*tagsptr = new;
	} else {
		/*
		 * Don't need (or can't) update reserved tags here, they
		 * remain static and should never need resizing.
		 */
		sbitmap_queue_resize(tags->bitmap_tags,
				tdepth - tags->nr_reserved_tags);
	}

	return 0;
}

void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
	sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function, which returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
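
/*
 * Illustrative decode (hypothetical driver code): the encoding above pairs
 * with the blk_mq_unique_tag_to_hwq() and blk_mq_unique_tag_to_tag() helpers
 * from <linux/blk-mq.h>:
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);
 */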