/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
 * are submitted to either RC SQ or RC RQ respectively
 * (reliably connected send/receive queue)
 * and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ) respectively,
 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
 */
24 #include <linux/atomic.h>
25 #include <linux/hashtable.h>
26 #include <linux/wait.h>
27 #include <rdma/ib_verbs.h>
28 #include <asm/div64.h>
33 #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
35 #define SMC_WR_RX_HASH_BITS 4
36 static DEFINE_HASHTABLE(smc_wr_rx_hash
, SMC_WR_RX_HASH_BITS
);
37 static DEFINE_SPINLOCK(smc_wr_rx_hash_lock
);
39 struct smc_wr_tx_pend
{ /* control data for a pending send request */
40 u64 wr_id
; /* work request id sent */
41 smc_wr_tx_handler handler
;
42 enum ib_wc_status wc_status
; /* CQE status */
43 struct smc_link
*link
;
45 struct smc_wr_tx_pend_priv priv
;
48 /******************************** send queue *********************************/
50 /*------------------------------- completion --------------------------------*/
52 static inline int smc_wr_tx_find_pending_index(struct smc_link
*link
, u64 wr_id
)
56 for (i
= 0; i
< link
->wr_tx_cnt
; i
++) {
57 if (link
->wr_tx_pends
[i
].wr_id
== wr_id
)
60 return link
->wr_tx_cnt
;
/* Process one send-queue work completion.
 * Takes the link from wc->qp->qp_context and looks up the pending-send slot
 * whose wr_id matches wc->wr_id. For that slot it records wc->status, keeps a
 * local copy of the tracking entry, then zeroes the entry and its send buffer
 * and clears the slot's bit in link->wr_tx_mask. The visible code also has a
 * path that wipes every slot still marked in wr_tx_mask and terminates the
 * link group, and it ends by calling the saved handler with the saved priv,
 * the link and wc->status, followed by a wake-up on link->wr_tx_wait.
 * NOTE(review): the statements guarding the early exits, the terminate path
 * and the handler call are not visible in this view - confirm the exact
 * conditions before changing this function.
 */
63 static inline void smc_wr_tx_process_cqe(struct ib_wc
*wc
)
65 struct smc_wr_tx_pend pnd_snd
;
66 struct smc_link
*link
;
70 link
= wc
->qp
->qp_context
;
71 pnd_snd_idx
= smc_wr_tx_find_pending_index(link
, wc
->wr_id
);
72 if (pnd_snd_idx
== link
->wr_tx_cnt
)
74 link
->wr_tx_pends
[pnd_snd_idx
].wc_status
= wc
->status
;
75 memcpy(&pnd_snd
, &link
->wr_tx_pends
[pnd_snd_idx
], sizeof(pnd_snd
));
76 /* clear the full struct smc_wr_tx_pend including .priv */
77 memset(&link
->wr_tx_pends
[pnd_snd_idx
], 0,
78 sizeof(link
->wr_tx_pends
[pnd_snd_idx
]));
79 memset(&link
->wr_tx_bufs
[pnd_snd_idx
], 0,
80 sizeof(link
->wr_tx_bufs
[pnd_snd_idx
]));
81 if (!test_and_clear_bit(pnd_snd_idx
, link
->wr_tx_mask
))
84 struct smc_link_group
*lgr
;
86 for_each_set_bit(i
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
87 /* clear full struct smc_wr_tx_pend including .priv */
88 memset(&link
->wr_tx_pends
[i
], 0,
89 sizeof(link
->wr_tx_pends
[i
]));
90 memset(&link
->wr_tx_bufs
[i
], 0,
91 sizeof(link
->wr_tx_bufs
[i
]));
92 clear_bit(i
, link
->wr_tx_mask
);
94 /* terminate connections of this link group abnormally */
95 lgr
= container_of(link
, struct smc_link_group
,
96 lnk
[SMC_SINGLE_LINK
]);
97 smc_lgr_terminate(lgr
);
100 pnd_snd
.handler(&pnd_snd
.priv
, link
, wc
->status
);
101 wake_up(&link
->wr_tx_wait
);
/* Bottom half for the send completion queue of an SMC IB device.
 * @data: the struct smc_ib_device, passed as unsigned long.
 * Polls up to SMC_WR_MAX_POLL_CQE completions from dev->roce_cq_send into a
 * local array, requests CQ notification with IB_CQ_REPORT_MISSED_EVENTS set,
 * and hands each polled completion to smc_wr_tx_process_cqe().
 * NOTE(review): the surrounding loop / re-arm control flow and the remaining
 * notification flags are not visible in this view - confirm them.
 */
104 static void smc_wr_tx_tasklet_fn(unsigned long data
)
106 struct smc_ib_device
*dev
= (struct smc_ib_device
*)data
;
107 struct ib_wc wc
[SMC_WR_MAX_POLL_CQE
];
114 rc
= ib_poll_cq(dev
->roce_cq_send
, SMC_WR_MAX_POLL_CQE
, wc
);
116 ib_req_notify_cq(dev
->roce_cq_send
,
118 IB_CQ_REPORT_MISSED_EVENTS
);
122 for (i
= 0; i
< rc
; i
++)
123 smc_wr_tx_process_cqe(&wc
[i
]);
129 void smc_wr_tx_cq_handler(struct ib_cq
*ib_cq
, void *cq_context
)
131 struct smc_ib_device
*dev
= (struct smc_ib_device
*)cq_context
;
133 tasklet_schedule(&dev
->send_tasklet
);
136 /*---------------------------- request submission ---------------------------*/
138 static inline int smc_wr_tx_get_free_slot_index(struct smc_link
*link
, u32
*idx
)
140 *idx
= link
->wr_tx_cnt
;
141 for_each_clear_bit(*idx
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
142 if (!test_and_set_bit(*idx
, link
->wr_tx_mask
))
145 *idx
= link
->wr_tx_cnt
;
/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * Return: 0 on success, or -errno on error.
 */
159 int smc_wr_tx_get_free_slot(struct smc_link
*link
,
160 smc_wr_tx_handler handler
,
161 struct smc_wr_buf
**wr_buf
,
162 struct smc_wr_tx_pend_priv
**wr_pend_priv
)
/* Clears *wr_pend_priv, then obtains a free send slot index. The visible code
 * has a direct attempt via smc_wr_tx_get_free_slot_index() and an
 * interruptible wait, bounded by SMC_WR_TX_WAIT_FREE_SLOT_TIME, on that helper
 * no longer returning -EBUSY; a path marked "timeout" terminates the link
 * group. On success the slot's tracking entry (wr_id, handler, link) and its
 * ib_send_wr are stamped with a fresh wr_id from smc_wr_tx_get_next_wr_id(),
 * and the slot's buffer and priv are returned through the out parameters.
 * NOTE(review): the branch conditions and return statements are not visible
 * in this view - confirm them before relying on specific error codes.
 */
164 struct smc_wr_tx_pend
*wr_pend
;
165 struct ib_send_wr
*wr_ib
;
171 *wr_pend_priv
= NULL
;
173 rc
= smc_wr_tx_get_free_slot_index(link
, &idx
);
177 rc
= wait_event_interruptible_timeout(
179 (smc_wr_tx_get_free_slot_index(link
, &idx
) != -EBUSY
),
180 SMC_WR_TX_WAIT_FREE_SLOT_TIME
);
182 /* timeout - terminate connections */
183 struct smc_link_group
*lgr
;
185 lgr
= container_of(link
, struct smc_link_group
,
186 lnk
[SMC_SINGLE_LINK
]);
187 smc_lgr_terminate(lgr
);
190 if (rc
== -ERESTARTSYS
)
192 if (idx
== link
->wr_tx_cnt
)
195 wr_id
= smc_wr_tx_get_next_wr_id(link
);
196 wr_pend
= &link
->wr_tx_pends
[idx
];
197 wr_pend
->wr_id
= wr_id
;
198 wr_pend
->handler
= handler
;
199 wr_pend
->link
= link
;
201 wr_ib
= &link
->wr_tx_ibs
[idx
];
202 wr_ib
->wr_id
= wr_id
;
203 *wr_buf
= &link
->wr_tx_bufs
[idx
];
204 *wr_pend_priv
= &wr_pend
->priv
;
208 int smc_wr_tx_put_slot(struct smc_link
*link
,
209 struct smc_wr_tx_pend_priv
*wr_pend_priv
)
211 struct smc_wr_tx_pend
*pend
;
213 pend
= container_of(wr_pend_priv
, struct smc_wr_tx_pend
, priv
);
214 if (pend
->idx
< link
->wr_tx_cnt
) {
215 /* clear the full struct smc_wr_tx_pend including .priv */
216 memset(&link
->wr_tx_pends
[pend
->idx
], 0,
217 sizeof(link
->wr_tx_pends
[pend
->idx
]));
218 memset(&link
->wr_tx_bufs
[pend
->idx
], 0,
219 sizeof(link
->wr_tx_bufs
[pend
->idx
]));
220 test_and_clear_bit(pend
->idx
, link
->wr_tx_mask
);
/* Post a previously prepared send slot.
 * Requests completion notification on the send CQ (IB_CQ_SOLICITED_MASK |
 * IB_CQ_REPORT_MISSED_EVENTS), recovers the tracking entry from @priv via
 * container_of() and posts link->wr_tx_ibs[pend->idx] on link->roce_qp with
 * ib_post_send(). The visible code also contains a call that hands the slot
 * back through smc_wr_tx_put_slot().
 * NOTE(review): the condition guarding smc_wr_tx_put_slot() and the return
 * statement are not visible here - presumably the slot is released only when
 * posting fails; confirm.
 */
227 /* Send prepared WR slot via ib_post_send.
228 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
230 int smc_wr_tx_send(struct smc_link
*link
, struct smc_wr_tx_pend_priv
*priv
)
232 struct ib_send_wr
*failed_wr
= NULL
;
233 struct smc_wr_tx_pend
*pend
;
236 ib_req_notify_cq(link
->smcibdev
->roce_cq_send
,
237 IB_CQ_SOLICITED_MASK
| IB_CQ_REPORT_MISSED_EVENTS
);
238 pend
= container_of(priv
, struct smc_wr_tx_pend
, priv
);
239 rc
= ib_post_send(link
->roce_qp
, &link
->wr_tx_ibs
[pend
->idx
],
242 smc_wr_tx_put_slot(link
, priv
);
246 void smc_wr_tx_dismiss_slots(struct smc_link
*link
, u8 wr_rx_hdr_type
,
247 smc_wr_tx_filter filter
,
248 smc_wr_tx_dismisser dismisser
,
251 struct smc_wr_tx_pend_priv
*tx_pend
;
252 struct smc_wr_rx_hdr
*wr_rx
;
255 for_each_set_bit(i
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
256 wr_rx
= (struct smc_wr_rx_hdr
*)&link
->wr_rx_bufs
[i
];
257 if (wr_rx
->type
!= wr_rx_hdr_type
)
259 tx_pend
= &link
->wr_tx_pends
[i
].priv
;
260 if (filter(tx_pend
, data
))
265 bool smc_wr_tx_has_pending(struct smc_link
*link
, u8 wr_rx_hdr_type
,
266 smc_wr_tx_filter filter
, unsigned long data
)
268 struct smc_wr_tx_pend_priv
*tx_pend
;
269 struct smc_wr_rx_hdr
*wr_rx
;
272 for_each_set_bit(i
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
273 wr_rx
= (struct smc_wr_rx_hdr
*)&link
->wr_rx_bufs
[i
];
274 if (wr_rx
->type
!= wr_rx_hdr_type
)
276 tx_pend
= &link
->wr_tx_pends
[i
].priv
;
277 if (filter(tx_pend
, data
))
283 /****************************** receive queue ********************************/
/* Register a receive handler for its message type.
 * Under smc_wr_rx_hash_lock, scans the hash bucket selected by handler->type
 * for an entry with the same type and adds @handler to smc_wr_rx_hash keyed
 * by handler->type.
 * NOTE(review): what happens when a handler of the same type is already
 * registered, and the return value, are not visible in this view - confirm.
 */
285 int smc_wr_rx_register_handler(struct smc_wr_rx_handler
*handler
)
287 struct smc_wr_rx_handler
*h_iter
;
290 spin_lock(&smc_wr_rx_hash_lock
);
291 hash_for_each_possible(smc_wr_rx_hash
, h_iter
, list
, handler
->type
) {
292 if (h_iter
->type
== handler
->type
) {
297 hash_add(smc_wr_rx_hash
, &handler
->list
, handler
->type
);
299 spin_unlock(&smc_wr_rx_hash_lock
);
303 /* Demultiplex a received work request based on the message type to its handler.
304 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
305 * and not being modified any more afterwards so we don't need to lock it.
307 static inline void smc_wr_rx_demultiplex(struct ib_wc
*wc
)
309 struct smc_link
*link
= (struct smc_link
*)wc
->qp
->qp_context
;
310 struct smc_wr_rx_handler
*handler
;
311 struct smc_wr_rx_hdr
*wr_rx
;
315 if (wc
->byte_len
< sizeof(*wr_rx
))
316 return; /* short message */
317 temp_wr_id
= wc
->wr_id
;
318 index
= do_div(temp_wr_id
, link
->wr_rx_cnt
);
319 wr_rx
= (struct smc_wr_rx_hdr
*)&link
->wr_rx_bufs
[index
];
320 hash_for_each_possible(smc_wr_rx_hash
, handler
, list
, wr_rx
->type
) {
321 if (handler
->type
== wr_rx
->type
)
322 handler
->handler(wc
, wr_rx
);
/* Process @num receive completions from @wc.
 * Successful completions are demultiplexed to their handler and a receive WR
 * is posted again on the link. For failed completions the status is
 * inspected: IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR and
 * IB_WC_WR_FLUSH_ERR lead to smc_lgr_terminate() on the owning link group.
 * NOTE(review): the else/case/default boundaries are not visible in this
 * view - presumably the trailing smc_wr_rx_post() handles the remaining
 * statuses; confirm.
 */
326 static inline void smc_wr_rx_process_cqes(struct ib_wc wc
[], int num
)
328 struct smc_link
*link
;
331 for (i
= 0; i
< num
; i
++) {
332 link
= wc
[i
].qp
->qp_context
;
333 if (wc
[i
].status
== IB_WC_SUCCESS
) {
334 smc_wr_rx_demultiplex(&wc
[i
]);
335 smc_wr_rx_post(link
); /* refill WR RX */
337 struct smc_link_group
*lgr
;
339 /* handle status errors */
340 switch (wc
[i
].status
) {
341 case IB_WC_RETRY_EXC_ERR
:
342 case IB_WC_RNR_RETRY_EXC_ERR
:
343 case IB_WC_WR_FLUSH_ERR
:
344 /* terminate connections of this link group
347 lgr
= container_of(link
, struct smc_link_group
,
348 lnk
[SMC_SINGLE_LINK
]);
349 smc_lgr_terminate(lgr
);
352 smc_wr_rx_post(link
); /* refill WR RX */
/* Bottom half for the receive completion queue of an SMC IB device.
 * @data: the struct smc_ib_device, passed as unsigned long.
 * Zeroes the local completion array, polls up to SMC_WR_MAX_POLL_CQE
 * completions from dev->roce_cq_recv, requests CQ notification with
 * IB_CQ_REPORT_MISSED_EVENTS set, and passes the polled completions to
 * smc_wr_rx_process_cqes().
 * NOTE(review): the surrounding loop / re-arm control flow and the remaining
 * notification flags are not visible in this view - confirm them.
 */
359 static void smc_wr_rx_tasklet_fn(unsigned long data
)
361 struct smc_ib_device
*dev
= (struct smc_ib_device
*)data
;
362 struct ib_wc wc
[SMC_WR_MAX_POLL_CQE
];
369 memset(&wc
, 0, sizeof(wc
));
370 rc
= ib_poll_cq(dev
->roce_cq_recv
, SMC_WR_MAX_POLL_CQE
, wc
);
372 ib_req_notify_cq(dev
->roce_cq_recv
,
374 | IB_CQ_REPORT_MISSED_EVENTS
);
378 smc_wr_rx_process_cqes(&wc
[0], rc
);
384 void smc_wr_rx_cq_handler(struct ib_cq
*ib_cq
, void *cq_context
)
386 struct smc_ib_device
*dev
= (struct smc_ib_device
*)cq_context
;
388 tasklet_schedule(&dev
->recv_tasklet
);
391 int smc_wr_rx_post_init(struct smc_link
*link
)
396 for (i
= 0; i
< link
->wr_rx_cnt
; i
++)
397 rc
= smc_wr_rx_post(link
);
401 /***************************** init, exit, misc ******************************/
/* Cache the queue pair attributes of @lnk and size the WR rings from them.
 * Zeroes lnk->qp_attr and a scratch ib_qp_init_attr, queries lnk->roce_qp
 * into lnk->qp_attr, then sets wr_tx_cnt to the smaller of SMC_WR_BUF_CNT and
 * cap.max_send_wr, and wr_rx_cnt to the smaller of SMC_WR_BUF_CNT * 3 and
 * cap.max_recv_wr.
 * NOTE(review): only part of the attribute mask passed to ib_query_qp() is
 * visible in this view, and its return value does not appear to be checked -
 * confirm both.
 */
403 void smc_wr_remember_qp_attr(struct smc_link
*lnk
)
405 struct ib_qp_attr
*attr
= &lnk
->qp_attr
;
406 struct ib_qp_init_attr init_attr
;
408 memset(attr
, 0, sizeof(*attr
));
409 memset(&init_attr
, 0, sizeof(init_attr
));
410 ib_query_qp(lnk
->roce_qp
, attr
,
423 IB_QP_MIN_RNR_TIMER
|
425 IB_QP_PATH_MIG_STATE
|
430 lnk
->wr_tx_cnt
= min_t(size_t, SMC_WR_BUF_CNT
,
431 lnk
->qp_attr
.cap
.max_send_wr
);
432 lnk
->wr_rx_cnt
= min_t(size_t, SMC_WR_BUF_CNT
* 3,
433 lnk
->qp_attr
.cap
.max_recv_wr
);
436 static void smc_wr_init_sge(struct smc_link
*lnk
)
440 for (i
= 0; i
< lnk
->wr_tx_cnt
; i
++) {
441 lnk
->wr_tx_sges
[i
].addr
=
442 lnk
->wr_tx_dma_addr
+ i
* SMC_WR_BUF_SIZE
;
443 lnk
->wr_tx_sges
[i
].length
= SMC_WR_TX_SIZE
;
444 lnk
->wr_tx_sges
[i
].lkey
= lnk
->roce_pd
->local_dma_lkey
;
445 lnk
->wr_tx_ibs
[i
].next
= NULL
;
446 lnk
->wr_tx_ibs
[i
].sg_list
= &lnk
->wr_tx_sges
[i
];
447 lnk
->wr_tx_ibs
[i
].num_sge
= 1;
448 lnk
->wr_tx_ibs
[i
].opcode
= IB_WR_SEND
;
449 lnk
->wr_tx_ibs
[i
].send_flags
=
450 IB_SEND_SIGNALED
| IB_SEND_SOLICITED
;
452 for (i
= 0; i
< lnk
->wr_rx_cnt
; i
++) {
453 lnk
->wr_rx_sges
[i
].addr
=
454 lnk
->wr_rx_dma_addr
+ i
* SMC_WR_BUF_SIZE
;
455 lnk
->wr_rx_sges
[i
].length
= SMC_WR_BUF_SIZE
;
456 lnk
->wr_rx_sges
[i
].lkey
= lnk
->roce_pd
->local_dma_lkey
;
457 lnk
->wr_rx_ibs
[i
].next
= NULL
;
458 lnk
->wr_rx_ibs
[i
].sg_list
= &lnk
->wr_rx_sges
[i
];
459 lnk
->wr_rx_ibs
[i
].num_sge
= 1;
/* Undo the DMA-related per-link WR setup of @lnk.
 * Clears the whole wr_tx_mask bitmap, then, for the receive and the send
 * buffer area in turn, unmaps the DMA mapping (SMC_WR_BUF_SIZE times the
 * respective WR count) if a DMA address is stored and resets that address
 * to 0.
 * NOTE(review): some lines (for example the DMA direction arguments) are not
 * visible in this view - confirm them against the full file.
 */
463 void smc_wr_free_link(struct smc_link
*lnk
)
465 struct ib_device
*ibdev
;
467 memset(lnk
->wr_tx_mask
, 0,
468 BITS_TO_LONGS(SMC_WR_BUF_CNT
) * sizeof(*lnk
->wr_tx_mask
));
472 ibdev
= lnk
->smcibdev
->ibdev
;
474 if (lnk
->wr_rx_dma_addr
) {
475 ib_dma_unmap_single(ibdev
, lnk
->wr_rx_dma_addr
,
476 SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
478 lnk
->wr_rx_dma_addr
= 0;
480 if (lnk
->wr_tx_dma_addr
) {
481 ib_dma_unmap_single(ibdev
, lnk
->wr_tx_dma_addr
,
482 SMC_WR_BUF_SIZE
* lnk
->wr_tx_cnt
,
484 lnk
->wr_tx_dma_addr
= 0;
488 void smc_wr_free_link_mem(struct smc_link
*lnk
)
490 kfree(lnk
->wr_tx_pends
);
491 lnk
->wr_tx_pends
= NULL
;
492 kfree(lnk
->wr_tx_mask
);
493 lnk
->wr_tx_mask
= NULL
;
494 kfree(lnk
->wr_tx_sges
);
495 lnk
->wr_tx_sges
= NULL
;
496 kfree(lnk
->wr_rx_sges
);
497 lnk
->wr_rx_sges
= NULL
;
498 kfree(lnk
->wr_rx_ibs
);
499 lnk
->wr_rx_ibs
= NULL
;
500 kfree(lnk
->wr_tx_ibs
);
501 lnk
->wr_tx_ibs
= NULL
;
502 kfree(lnk
->wr_tx_bufs
);
503 lnk
->wr_tx_bufs
= NULL
;
504 kfree(lnk
->wr_rx_bufs
);
505 lnk
->wr_rx_bufs
= NULL
;
/* Allocate all per-link work request memory for @link: send and receive
 * message buffers, the ib send/recv WR arrays, the scatter-gather entries,
 * the send-slot bitmap and the pending-send tracking array. Send-side arrays
 * hold SMC_WR_BUF_CNT entries, receive-side arrays SMC_WR_BUF_CNT * 3.
 * A failed allocation jumps into a chain of kfree() calls that releases what
 * was allocated before it, in reverse order of allocation.
 * NOTE(review): the labels and return statements are not visible in this
 * view - confirm the error code returned on failure.
 */
508 int smc_wr_alloc_link_mem(struct smc_link
*link
)
510 /* allocate link related memory */
511 link
->wr_tx_bufs
= kcalloc(SMC_WR_BUF_CNT
, SMC_WR_BUF_SIZE
, GFP_KERNEL
);
512 if (!link
->wr_tx_bufs
)
514 link
->wr_rx_bufs
= kcalloc(SMC_WR_BUF_CNT
* 3, SMC_WR_BUF_SIZE
,
516 if (!link
->wr_rx_bufs
)
517 goto no_mem_wr_tx_bufs
;
518 link
->wr_tx_ibs
= kcalloc(SMC_WR_BUF_CNT
, sizeof(link
->wr_tx_ibs
[0]),
520 if (!link
->wr_tx_ibs
)
521 goto no_mem_wr_rx_bufs
;
522 link
->wr_rx_ibs
= kcalloc(SMC_WR_BUF_CNT
* 3,
523 sizeof(link
->wr_rx_ibs
[0]),
525 if (!link
->wr_rx_ibs
)
526 goto no_mem_wr_tx_ibs
;
527 link
->wr_tx_sges
= kcalloc(SMC_WR_BUF_CNT
, sizeof(link
->wr_tx_sges
[0]),
529 if (!link
->wr_tx_sges
)
530 goto no_mem_wr_rx_ibs
;
531 link
->wr_rx_sges
= kcalloc(SMC_WR_BUF_CNT
* 3,
532 sizeof(link
->wr_rx_sges
[0]),
534 if (!link
->wr_rx_sges
)
535 goto no_mem_wr_tx_sges
;
536 link
->wr_tx_mask
= kzalloc(
537 BITS_TO_LONGS(SMC_WR_BUF_CNT
) * sizeof(*link
->wr_tx_mask
),
539 if (!link
->wr_tx_mask
)
540 goto no_mem_wr_rx_sges
;
541 link
->wr_tx_pends
= kcalloc(SMC_WR_BUF_CNT
,
542 sizeof(link
->wr_tx_pends
[0]),
544 if (!link
->wr_tx_pends
)
545 goto no_mem_wr_tx_mask
;
549 kfree(link
->wr_tx_mask
);
551 kfree(link
->wr_rx_sges
);
553 kfree(link
->wr_tx_sges
);
555 kfree(link
->wr_rx_ibs
);
557 kfree(link
->wr_tx_ibs
);
559 kfree(link
->wr_rx_bufs
);
561 kfree(link
->wr_tx_bufs
);
566 void smc_wr_remove_dev(struct smc_ib_device
*smcibdev
)
568 tasklet_kill(&smcibdev
->recv_tasklet
);
569 tasklet_kill(&smcibdev
->send_tasklet
);
572 void smc_wr_add_dev(struct smc_ib_device
*smcibdev
)
574 tasklet_init(&smcibdev
->recv_tasklet
, smc_wr_rx_tasklet_fn
,
575 (unsigned long)smcibdev
);
576 tasklet_init(&smcibdev
->send_tasklet
, smc_wr_tx_tasklet_fn
,
577 (unsigned long)smcibdev
);
580 int smc_wr_create_link(struct smc_link
*lnk
)
582 struct ib_device
*ibdev
= lnk
->smcibdev
->ibdev
;
585 smc_wr_tx_set_wr_id(&lnk
->wr_tx_id
, 0);
587 lnk
->wr_rx_dma_addr
= ib_dma_map_single(
588 ibdev
, lnk
->wr_rx_bufs
, SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
590 if (ib_dma_mapping_error(ibdev
, lnk
->wr_rx_dma_addr
)) {
591 lnk
->wr_rx_dma_addr
= 0;
595 lnk
->wr_tx_dma_addr
= ib_dma_map_single(
596 ibdev
, lnk
->wr_tx_bufs
, SMC_WR_BUF_SIZE
* lnk
->wr_tx_cnt
,
598 if (ib_dma_mapping_error(ibdev
, lnk
->wr_tx_dma_addr
)) {
602 smc_wr_init_sge(lnk
);
603 memset(lnk
->wr_tx_mask
, 0,
604 BITS_TO_LONGS(SMC_WR_BUF_CNT
) * sizeof(*lnk
->wr_tx_mask
));
608 ib_dma_unmap_single(ibdev
, lnk
->wr_rx_dma_addr
,
609 SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
611 lnk
->wr_rx_dma_addr
= 0;