1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (c) 2015 HGST, a Western Digital Company.
5 #include <linux/module.h>
7 #include <linux/slab.h>
8 #include <rdma/ib_verbs.h>
10 #include "core_priv.h"
12 #include <trace/events/rdma_core.h>
/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ		4096U

/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH			16
#define IB_POLL_BATCH_DIRECT		8

/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ		256
#define IB_POLL_BUDGET_WORKQUEUE	65536

/* Flags handed to ib_req_notify_cq() when re-arming a CQ after a poll pass */
#define IB_POLL_FLAGS \
	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
27 static const struct dim_cq_moder
28 rdma_dim_prof
[RDMA_DIM_PARAMS_NUM_PROFILES
] = {
40 static void ib_cq_rdma_dim_work(struct work_struct
*w
)
42 struct dim
*dim
= container_of(w
, struct dim
, work
);
43 struct ib_cq
*cq
= dim
->priv
;
45 u16 usec
= rdma_dim_prof
[dim
->profile_ix
].usec
;
46 u16 comps
= rdma_dim_prof
[dim
->profile_ix
].comps
;
48 dim
->state
= DIM_START_MEASURE
;
50 trace_cq_modify(cq
, comps
, usec
);
51 cq
->device
->ops
.modify_cq(cq
, comps
, usec
);
54 static void rdma_dim_init(struct ib_cq
*cq
)
58 if (!cq
->device
->ops
.modify_cq
|| !cq
->device
->use_cq_dim
||
59 cq
->poll_ctx
== IB_POLL_DIRECT
)
62 dim
= kzalloc(sizeof(struct dim
), GFP_KERNEL
);
66 dim
->state
= DIM_START_MEASURE
;
67 dim
->tune_state
= DIM_GOING_RIGHT
;
68 dim
->profile_ix
= RDMA_DIM_START_PROFILE
;
72 INIT_WORK(&dim
->work
, ib_cq_rdma_dim_work
);
75 static void rdma_dim_destroy(struct ib_cq
*cq
)
80 cancel_work_sync(&cq
->dim
->work
);
/*
 * Poll up to @num_entries work completions from @cq into @wc, emit the
 * cq_poll tracepoint, and hand back whatever ib_poll_cq() returned.
 */
static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
	int rc;

	rc = ib_poll_cq(cq, num_entries, wc);
	trace_cq_poll(cq, num_entries, rc);
	return rc;
}
93 static int __ib_process_cq(struct ib_cq
*cq
, int budget
, struct ib_wc
*wcs
,
96 int i
, n
, completed
= 0;
101 * budget might be (-1) if the caller does not
102 * want to bound this call, thus we need unsigned
105 while ((n
= __poll_cq(cq
, min_t(u32
, batch
,
106 budget
- completed
), wcs
)) > 0) {
107 for (i
= 0; i
< n
; i
++) {
108 struct ib_wc
*wc
= &wcs
[i
];
111 wc
->wr_cqe
->done(cq
, wc
);
113 WARN_ON_ONCE(wc
->status
== IB_WC_SUCCESS
);
118 if (n
!= batch
|| (budget
!= -1 && completed
>= budget
))
126 * ib_process_cq_direct - process a CQ in caller context
128 * @budget: number of CQEs to poll for
130 * This function is used to process all outstanding CQ entries.
131 * It does not offload CQ processing to a different context and does
132 * not ask for completion interrupts from the HCA.
133 * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger
134 * concurrent processing.
136 * Note: do not pass -1 as %budget unless it is guaranteed that the number
137 * of completions that will be processed is small.
139 int ib_process_cq_direct(struct ib_cq
*cq
, int budget
)
141 struct ib_wc wcs
[IB_POLL_BATCH_DIRECT
];
143 return __ib_process_cq(cq
, budget
, wcs
, IB_POLL_BATCH_DIRECT
);
145 EXPORT_SYMBOL(ib_process_cq_direct
);
/*
 * Completion handler installed for IB_POLL_DIRECT CQs. Such CQs are polled
 * by the caller and never armed by this file, so reaching this handler is
 * reported once as a warning.
 */
static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}
152 static int ib_poll_handler(struct irq_poll
*iop
, int budget
)
154 struct ib_cq
*cq
= container_of(iop
, struct ib_cq
, iop
);
155 struct dim
*dim
= cq
->dim
;
158 completed
= __ib_process_cq(cq
, budget
, cq
->wc
, IB_POLL_BATCH
);
159 if (completed
< budget
) {
160 irq_poll_complete(&cq
->iop
);
161 if (ib_req_notify_cq(cq
, IB_POLL_FLAGS
) > 0) {
162 trace_cq_reschedule(cq
);
163 irq_poll_sched(&cq
->iop
);
168 rdma_dim(dim
, completed
);
173 static void ib_cq_completion_softirq(struct ib_cq
*cq
, void *private)
175 trace_cq_schedule(cq
);
176 irq_poll_sched(&cq
->iop
);
179 static void ib_cq_poll_work(struct work_struct
*work
)
181 struct ib_cq
*cq
= container_of(work
, struct ib_cq
, work
);
184 completed
= __ib_process_cq(cq
, IB_POLL_BUDGET_WORKQUEUE
, cq
->wc
,
186 if (completed
>= IB_POLL_BUDGET_WORKQUEUE
||
187 ib_req_notify_cq(cq
, IB_POLL_FLAGS
) > 0)
188 queue_work(cq
->comp_wq
, &cq
->work
);
190 rdma_dim(cq
->dim
, completed
);
193 static void ib_cq_completion_workqueue(struct ib_cq
*cq
, void *private)
195 trace_cq_schedule(cq
);
196 queue_work(cq
->comp_wq
, &cq
->work
);
200 * __ib_alloc_cq - allocate a completion queue
201 * @dev: device to allocate the CQ for
202 * @private: driver private data, accessible from cq->cq_context
203 * @nr_cqe: number of CQEs to allocate
204 * @comp_vector: HCA completion vectors for this CQ
205 * @poll_ctx: context to poll the CQ from.
206 * @caller: module owner name.
208 * This is the proper interface to allocate a CQ for in-kernel users. A
209 * CQ allocated with this interface will automatically be polled from the
210 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
211 * to use this CQ abstraction.
213 struct ib_cq
*__ib_alloc_cq(struct ib_device
*dev
, void *private, int nr_cqe
,
214 int comp_vector
, enum ib_poll_context poll_ctx
,
217 struct ib_cq_init_attr cq_attr
= {
219 .comp_vector
= comp_vector
,
224 cq
= rdma_zalloc_drv_obj(dev
, ib_cq
);
229 cq
->cq_context
= private;
230 cq
->poll_ctx
= poll_ctx
;
231 atomic_set(&cq
->usecnt
, 0);
232 cq
->comp_vector
= comp_vector
;
234 cq
->wc
= kmalloc_array(IB_POLL_BATCH
, sizeof(*cq
->wc
), GFP_KERNEL
);
238 rdma_restrack_new(&cq
->res
, RDMA_RESTRACK_CQ
);
239 rdma_restrack_set_name(&cq
->res
, caller
);
241 ret
= dev
->ops
.create_cq(cq
, &cq_attr
, NULL
);
247 switch (cq
->poll_ctx
) {
249 cq
->comp_handler
= ib_cq_completion_direct
;
251 case IB_POLL_SOFTIRQ
:
252 cq
->comp_handler
= ib_cq_completion_softirq
;
254 irq_poll_init(&cq
->iop
, IB_POLL_BUDGET_IRQ
, ib_poll_handler
);
255 ib_req_notify_cq(cq
, IB_CQ_NEXT_COMP
);
257 case IB_POLL_WORKQUEUE
:
258 case IB_POLL_UNBOUND_WORKQUEUE
:
259 cq
->comp_handler
= ib_cq_completion_workqueue
;
260 INIT_WORK(&cq
->work
, ib_cq_poll_work
);
261 ib_req_notify_cq(cq
, IB_CQ_NEXT_COMP
);
262 cq
->comp_wq
= (cq
->poll_ctx
== IB_POLL_WORKQUEUE
) ?
263 ib_comp_wq
: ib_comp_unbound_wq
;
270 rdma_restrack_add(&cq
->res
);
271 trace_cq_alloc(cq
, nr_cqe
, comp_vector
, poll_ctx
);
275 rdma_dim_destroy(cq
);
276 cq
->device
->ops
.destroy_cq(cq
, NULL
);
278 rdma_restrack_put(&cq
->res
);
282 trace_cq_alloc_error(nr_cqe
, comp_vector
, poll_ctx
, ret
);
285 EXPORT_SYMBOL(__ib_alloc_cq
);
288 * __ib_alloc_cq_any - allocate a completion queue
289 * @dev: device to allocate the CQ for
290 * @private: driver private data, accessible from cq->cq_context
291 * @nr_cqe: number of CQEs to allocate
292 * @poll_ctx: context to poll the CQ from
293 * @caller: module owner name
295 * Attempt to spread ULP Completion Queues over each device's interrupt
296 * vectors. A simple best-effort mechanism is used.
298 struct ib_cq
*__ib_alloc_cq_any(struct ib_device
*dev
, void *private,
299 int nr_cqe
, enum ib_poll_context poll_ctx
,
302 static atomic_t counter
;
305 if (dev
->num_comp_vectors
> 1)
307 atomic_inc_return(&counter
) %
308 min_t(int, dev
->num_comp_vectors
, num_online_cpus());
310 return __ib_alloc_cq(dev
, private, nr_cqe
, comp_vector
, poll_ctx
,
313 EXPORT_SYMBOL(__ib_alloc_cq_any
);
316 * ib_free_cq - free a completion queue
317 * @cq: completion queue to free.
319 void ib_free_cq(struct ib_cq
*cq
)
323 if (WARN_ON_ONCE(atomic_read(&cq
->usecnt
)))
325 if (WARN_ON_ONCE(cq
->cqe_used
))
328 switch (cq
->poll_ctx
) {
331 case IB_POLL_SOFTIRQ
:
332 irq_poll_disable(&cq
->iop
);
334 case IB_POLL_WORKQUEUE
:
335 case IB_POLL_UNBOUND_WORKQUEUE
:
336 cancel_work_sync(&cq
->work
);
342 rdma_dim_destroy(cq
);
344 ret
= cq
->device
->ops
.destroy_cq(cq
, NULL
);
345 WARN_ONCE(ret
, "Destroy of kernel CQ shouldn't fail");
346 rdma_restrack_del(&cq
->res
);
350 EXPORT_SYMBOL(ib_free_cq
);
352 void ib_cq_pool_cleanup(struct ib_device
*dev
)
354 struct ib_cq
*cq
, *n
;
357 for (i
= 0; i
< ARRAY_SIZE(dev
->cq_pools
); i
++) {
358 list_for_each_entry_safe(cq
, n
, &dev
->cq_pools
[i
],
360 WARN_ON(cq
->cqe_used
);
361 list_del(&cq
->pool_entry
);
368 static int ib_alloc_cqs(struct ib_device
*dev
, unsigned int nr_cqes
,
369 enum ib_poll_context poll_ctx
)
372 unsigned int nr_cqs
, i
;
373 struct ib_cq
*cq
, *n
;
376 if (poll_ctx
> IB_POLL_LAST_POOL_TYPE
) {
377 WARN_ON_ONCE(poll_ctx
> IB_POLL_LAST_POOL_TYPE
);
382 * Allocate at least as many CQEs as requested, and otherwise
383 * a reasonable batch size so that we can share CQs between
384 * multiple users instead of allocating a larger number of CQs.
386 nr_cqes
= min_t(unsigned int, dev
->attrs
.max_cqe
,
387 max(nr_cqes
, IB_MAX_SHARED_CQ_SZ
));
388 nr_cqs
= min_t(unsigned int, dev
->num_comp_vectors
, num_online_cpus());
389 for (i
= 0; i
< nr_cqs
; i
++) {
390 cq
= ib_alloc_cq(dev
, NULL
, nr_cqes
, i
, poll_ctx
);
396 list_add_tail(&cq
->pool_entry
, &tmp_list
);
399 spin_lock_irq(&dev
->cq_pools_lock
);
400 list_splice(&tmp_list
, &dev
->cq_pools
[poll_ctx
]);
401 spin_unlock_irq(&dev
->cq_pools_lock
);
406 list_for_each_entry_safe(cq
, n
, &tmp_list
, pool_entry
) {
414 * ib_cq_pool_get() - Find the least used completion queue that matches
415 * a given cpu hint (or least used for wild card affinity) and fits
418 * @nr_cqe: number of needed cqe entries
419 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
420 * a comp vector based on internal counter
421 * @poll_ctx: cq polling context
423 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
424 * claim entries in it for us. In case there is no available cq, allocate
425 * a new cq with the requirements and add it to the device pool.
426 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
429 struct ib_cq
*ib_cq_pool_get(struct ib_device
*dev
, unsigned int nr_cqe
,
430 int comp_vector_hint
,
431 enum ib_poll_context poll_ctx
)
433 static unsigned int default_comp_vector
;
434 unsigned int vector
, num_comp_vectors
;
435 struct ib_cq
*cq
, *found
= NULL
;
438 if (poll_ctx
> IB_POLL_LAST_POOL_TYPE
) {
439 WARN_ON_ONCE(poll_ctx
> IB_POLL_LAST_POOL_TYPE
);
440 return ERR_PTR(-EINVAL
);
444 min_t(unsigned int, dev
->num_comp_vectors
, num_online_cpus());
445 /* Project the affinty to the device completion vector range */
446 if (comp_vector_hint
< 0) {
448 (READ_ONCE(default_comp_vector
) + 1) % num_comp_vectors
;
449 WRITE_ONCE(default_comp_vector
, comp_vector_hint
);
451 vector
= comp_vector_hint
% num_comp_vectors
;
454 * Find the least used CQ with correct affinity and
455 * enough free CQ entries
458 spin_lock_irq(&dev
->cq_pools_lock
);
459 list_for_each_entry(cq
, &dev
->cq_pools
[poll_ctx
],
462 * Check to see if we have found a CQ with the
463 * correct completion vector
465 if (vector
!= cq
->comp_vector
)
467 if (cq
->cqe_used
+ nr_cqe
> cq
->cqe
)
474 found
->cqe_used
+= nr_cqe
;
475 spin_unlock_irq(&dev
->cq_pools_lock
);
479 spin_unlock_irq(&dev
->cq_pools_lock
);
482 * Didn't find a match or ran out of CQs in the device
483 * pool, allocate a new array of CQs.
485 ret
= ib_alloc_cqs(dev
, nr_cqe
, poll_ctx
);
492 EXPORT_SYMBOL(ib_cq_pool_get
);
495 * ib_cq_pool_put - Return a CQ taken from a shared pool.
496 * @cq: The CQ to return.
497 * @nr_cqe: The max number of cqes that the user had requested.
499 void ib_cq_pool_put(struct ib_cq
*cq
, unsigned int nr_cqe
)
501 if (WARN_ON_ONCE(nr_cqe
> cq
->cqe_used
))
504 spin_lock_irq(&cq
->device
->cq_pools_lock
);
505 cq
->cqe_used
-= nr_cqe
;
506 spin_unlock_irq(&cq
->device
->cq_pools_lock
);
508 EXPORT_SYMBOL(ib_cq_pool_put
);