// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 HGST, a Western Digital Company.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

#include "core_priv.h"

#include <trace/events/rdma_core.h>

/* Max size for shared CQ, may require tuning */
#define IB_MAX_SHARED_CQ_SZ		4096U

/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH			16
#define IB_POLL_BATCH_DIRECT		8

/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ		256
#define IB_POLL_BUDGET_WORKQUEUE	65536

#define IB_POLL_FLAGS \
	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)

static const struct dim_cq_moder
rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
	{1, 0, 1, 0},  {1, 0, 4, 0},   {2, 0, 4, 0},
	{2, 0, 8, 0},  {4, 0, 8, 0},   {16, 0, 8, 0},
	{16, 0, 16, 0}, {32, 0, 16, 0}, {32, 0, 32, 0},
};

static void ib_cq_rdma_dim_work(struct work_struct *w)
{
	struct dim *dim = container_of(w, struct dim, work);
	struct ib_cq *cq = dim->priv;

	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
	u16 comps = rdma_dim_prof[dim->profile_ix].comps;

	dim->state = DIM_START_MEASURE;

	trace_cq_modify(cq, comps, usec);
	cq->device->ops.modify_cq(cq, comps, usec);
}

static void rdma_dim_init(struct ib_cq *cq)
{
	struct dim *dim;

	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
	    cq->poll_ctx == IB_POLL_DIRECT)
		return;

	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
	if (!dim)
		return;

	dim->state = DIM_START_MEASURE;
	dim->tune_state = DIM_GOING_RIGHT;
	dim->profile_ix = RDMA_DIM_START_PROFILE;
	dim->priv = cq;
	cq->dim = dim;

	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
}

static void rdma_dim_destroy(struct ib_cq *cq)
{
	if (!cq->dim)
		return;

	cancel_work_sync(&cq->dim->work);
	kfree(cq->dim);
}

static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
{
	int rc;

	rc = ib_poll_cq(cq, num_entries, wc);
	trace_cq_poll(cq, num_entries, rc);
	return rc;
}

static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
			   int batch)
{
	int i, n, completed = 0;

	/*
	 * budget might be (-1) if the caller does not
	 * want to bound this call, thus we need unsigned
	 * minimum here.
	 */
	while ((n = __poll_cq(cq, min_t(u32, batch,
					budget - completed), wcs)) > 0) {
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = &wcs[i];

			if (wc->wr_cqe)
				wc->wr_cqe->done(cq, wc);
			else
				WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
		}

		completed += n;

		if (n != batch || (budget != -1 && completed >= budget))
			break;
	}

	return completed;
}

/**
 * ib_process_cq_direct - process a CQ in caller context
 * @cq: CQ to process
 * @budget: number of CQEs to poll for
 *
 * This function is used to process all outstanding CQ entries.
 * It does not offload CQ processing to a different context and does
 * not ask for completion interrupts from the HCA.
 * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger
 * concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
	struct ib_wc wcs[IB_POLL_BATCH_DIRECT];

	return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
}
EXPORT_SYMBOL(ib_process_cq_direct);

static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
	WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
	struct dim *dim = cq->dim;
	int completed;

	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
	if (completed < budget) {
		irq_poll_complete(&cq->iop);
		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
			trace_cq_reschedule(cq);
			irq_poll_sched(&cq->iop);
		}
	}

	if (dim)
		rdma_dim(dim, completed);

	return completed;
}

static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
	trace_cq_schedule(cq);
	irq_poll_sched(&cq->iop);
}

static void ib_cq_poll_work(struct work_struct *work)
{
	struct ib_cq *cq = container_of(work, struct ib_cq, work);
	int completed;

	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
				    IB_POLL_BATCH);
	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
		queue_work(cq->comp_wq, &cq->work);
	else if (cq->dim)
		rdma_dim(cq->dim, completed);
}

static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
	trace_cq_schedule(cq);
	queue_work(cq->comp_wq, &cq->work);
}

/**
 * __ib_alloc_cq - allocate a completion queue
 * @dev: device to allocate the CQ for
 * @private: driver private data, accessible from cq->cq_context
 * @nr_cqe: number of CQEs to allocate
 * @comp_vector: HCA completion vectors for this CQ
 * @poll_ctx: context to poll the CQ from.
 * @caller: module owner name.
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
			    int comp_vector, enum ib_poll_context poll_ctx,
			    const char *caller)
{
	struct ib_cq_init_attr cq_attr = {
		.cqe		= nr_cqe,
		.comp_vector	= comp_vector,
	};
	struct ib_cq *cq;
	int ret = -ENOMEM;

	cq = rdma_zalloc_drv_obj(dev, ib_cq);
	if (!cq)
		return ERR_PTR(ret);

	cq->device = dev;
	cq->cq_context = private;
	cq->poll_ctx = poll_ctx;
	atomic_set(&cq->usecnt, 0);
	cq->comp_vector = comp_vector;

	cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
	if (!cq->wc)
		goto out_free_cq;

	rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
	rdma_restrack_set_name(&cq->res, caller);

	ret = dev->ops.create_cq(cq, &cq_attr, NULL);
	if (ret)
		goto out_free_wc;

	rdma_dim_init(cq);

	switch (cq->poll_ctx) {
	case IB_POLL_DIRECT:
		cq->comp_handler = ib_cq_completion_direct;
		break;
	case IB_POLL_SOFTIRQ:
		cq->comp_handler = ib_cq_completion_softirq;

		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
		break;
	case IB_POLL_WORKQUEUE:
	case IB_POLL_UNBOUND_WORKQUEUE:
		cq->comp_handler = ib_cq_completion_workqueue;
		INIT_WORK(&cq->work, ib_cq_poll_work);
		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
		cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
				ib_comp_wq : ib_comp_unbound_wq;
		break;
	default:
		ret = -EINVAL;
		goto out_destroy_cq;
	}

	rdma_restrack_add(&cq->res);
	trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
	return cq;

out_destroy_cq:
	rdma_dim_destroy(cq);
	cq->device->ops.destroy_cq(cq, NULL);
out_free_wc:
	rdma_restrack_put(&cq->res);
	kfree(cq->wc);
out_free_cq:
	kfree(cq);
	trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(__ib_alloc_cq);

/**
 * __ib_alloc_cq_any - allocate a completion queue
 * @dev: device to allocate the CQ for
 * @private: driver private data, accessible from cq->cq_context
 * @nr_cqe: number of CQEs to allocate
 * @poll_ctx: context to poll the CQ from
 * @caller: module owner name
 *
 * Attempt to spread ULP Completion Queues over each device's interrupt
 * vectors. A simple best-effort mechanism is used.
 */
struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
				int nr_cqe, enum ib_poll_context poll_ctx,
				const char *caller)
{
	static atomic_t counter;
	int comp_vector = 0;

	if (dev->num_comp_vectors > 1)
		comp_vector =
			atomic_inc_return(&counter) %
			min_t(int, dev->num_comp_vectors, num_online_cpus());

	return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
			     caller);
}
EXPORT_SYMBOL(__ib_alloc_cq_any);

/**
 * ib_free_cq - free a completion queue
 * @cq: completion queue to free.
 */
void ib_free_cq(struct ib_cq *cq)
{
	int ret;

	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
		return;
	if (WARN_ON_ONCE(cq->cqe_used))
		return;

	switch (cq->poll_ctx) {
	case IB_POLL_DIRECT:
		break;
	case IB_POLL_SOFTIRQ:
		irq_poll_disable(&cq->iop);
		break;
	case IB_POLL_WORKQUEUE:
	case IB_POLL_UNBOUND_WORKQUEUE:
		cancel_work_sync(&cq->work);
		break;
	default:
		WARN_ON_ONCE(1);
	}

	rdma_dim_destroy(cq);
	trace_cq_free(cq);
	ret = cq->device->ops.destroy_cq(cq, NULL);
	WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
	rdma_restrack_del(&cq->res);
	kfree(cq->wc);
	kfree(cq);
}
EXPORT_SYMBOL(ib_free_cq);

void ib_cq_pool_cleanup(struct ib_device *dev)
{
	struct ib_cq *cq, *n;
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
		list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
					 pool_entry) {
			WARN_ON(cq->cqe_used);
			list_del(&cq->pool_entry);
			cq->shared = false;
			ib_free_cq(cq);
		}
	}
}

static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
			enum ib_poll_context poll_ctx)
{
	LIST_HEAD(tmp_list);
	unsigned int nr_cqs, i;
	struct ib_cq *cq, *n;
	int ret;

	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
		return -EINVAL;
	}

	/*
	 * Allocate at least as many CQEs as requested, and otherwise
	 * a reasonable batch size so that we can share CQs between
	 * multiple users instead of allocating a larger number of CQs.
	 */
	nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
			max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
	nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
	for (i = 0; i < nr_cqs; i++) {
		cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
		if (IS_ERR(cq)) {
			ret = PTR_ERR(cq);
			goto out_free_cqs;
		}
		cq->shared = true;
		list_add_tail(&cq->pool_entry, &tmp_list);
	}

	spin_lock_irq(&dev->cq_pools_lock);
	list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
	spin_unlock_irq(&dev->cq_pools_lock);

	return 0;

out_free_cqs:
	list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
		cq->shared = false;
		ib_free_cq(cq);
	}
	return ret;
}

/**
 * ib_cq_pool_get() - Find the least used completion queue that matches
 *   a given cpu hint (or least used for wild card affinity) and fits
 *   nr_cqe.
 * @dev: rdma device
 * @nr_cqe: number of needed cqe entries
 * @comp_vector_hint: completion vector hint (-1) for the driver to assign
 *   a comp vector based on internal counter
 * @poll_ctx: cq polling context
 *
 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
 * claims entries in it for us. In case there is no available cq, allocate
 * a new cq with the requirements and add it to the device pool.
 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
 * for @poll_ctx.
 */
struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
			     int comp_vector_hint,
			     enum ib_poll_context poll_ctx)
{
	static unsigned int default_comp_vector;
	unsigned int vector, num_comp_vectors;
	struct ib_cq *cq, *found = NULL;
	int ret;

	if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
		WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
		return ERR_PTR(-EINVAL);
	}

	num_comp_vectors =
		min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
	/* Project the affinity to the device completion vector range */
	if (comp_vector_hint < 0) {
		comp_vector_hint =
			(READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
		WRITE_ONCE(default_comp_vector, comp_vector_hint);
	}
	vector = comp_vector_hint % num_comp_vectors;

	/*
	 * Find the least used CQ with correct affinity and
	 * enough free CQ entries
	 */
	while (!found) {
		spin_lock_irq(&dev->cq_pools_lock);
		list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
				    pool_entry) {
			/*
			 * Check to see if we have found a CQ with the
			 * correct completion vector
			 */
			if (vector != cq->comp_vector)
				continue;
			if (cq->cqe_used + nr_cqe > cq->cqe)
				continue;
			found = cq;
			break;
		}

		if (found) {
			found->cqe_used += nr_cqe;
			spin_unlock_irq(&dev->cq_pools_lock);

			return found;
		}
		spin_unlock_irq(&dev->cq_pools_lock);

		/*
		 * Didn't find a match or ran out of CQs in the device
		 * pool, allocate a new array of CQs.
		 */
		ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
		if (ret)
			return ERR_PTR(ret);
	}

	return found;
}
EXPORT_SYMBOL(ib_cq_pool_get);

/**
 * ib_cq_pool_put - Return a CQ taken from a shared pool.
 * @cq: The CQ to return.
 * @nr_cqe: The max number of cqes that the user had requested.
 */
void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
{
	if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
		return;

	spin_lock_irq(&cq->device->cq_pools_lock);
	cq->cqe_used -= nr_cqe;
	spin_unlock_irq(&cq->device->cq_pools_lock);
}
EXPORT_SYMBOL(ib_cq_pool_put);
