// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
 * are submitted to either RC SQ or RC RQ respectively
 * (reliably connected send/receive queue)
 * and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ) respectively,
 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
 */
25 #include <linux/atomic.h>
26 #include <linux/hashtable.h>
27 #include <linux/wait.h>
28 #include <rdma/ib_verbs.h>
29 #include <asm/div64.h>
34 #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
36 #define SMC_WR_RX_HASH_BITS 4
37 static DEFINE_HASHTABLE(smc_wr_rx_hash
, SMC_WR_RX_HASH_BITS
);
38 static DEFINE_SPINLOCK(smc_wr_rx_hash_lock
);
40 struct smc_wr_tx_pend
{ /* control data for a pending send request */
41 u64 wr_id
; /* work request id sent */
42 smc_wr_tx_handler handler
;
43 enum ib_wc_status wc_status
; /* CQE status */
44 struct smc_link
*link
;
46 struct smc_wr_tx_pend_priv priv
;
/******************************** send queue *********************************/

/*------------------------------- completion --------------------------------*/
53 static inline int smc_wr_tx_find_pending_index(struct smc_link
*link
, u64 wr_id
)
57 for (i
= 0; i
< link
->wr_tx_cnt
; i
++) {
58 if (link
->wr_tx_pends
[i
].wr_id
== wr_id
)
61 return link
->wr_tx_cnt
;
64 static inline void smc_wr_tx_process_cqe(struct ib_wc
*wc
)
66 struct smc_wr_tx_pend pnd_snd
;
67 struct smc_link
*link
;
71 link
= wc
->qp
->qp_context
;
73 if (wc
->opcode
== IB_WC_REG_MR
) {
75 link
->wr_reg_state
= FAILED
;
77 link
->wr_reg_state
= CONFIRMED
;
78 wake_up(&link
->wr_reg_wait
);
82 pnd_snd_idx
= smc_wr_tx_find_pending_index(link
, wc
->wr_id
);
83 if (pnd_snd_idx
== link
->wr_tx_cnt
)
85 link
->wr_tx_pends
[pnd_snd_idx
].wc_status
= wc
->status
;
86 memcpy(&pnd_snd
, &link
->wr_tx_pends
[pnd_snd_idx
], sizeof(pnd_snd
));
87 /* clear the full struct smc_wr_tx_pend including .priv */
88 memset(&link
->wr_tx_pends
[pnd_snd_idx
], 0,
89 sizeof(link
->wr_tx_pends
[pnd_snd_idx
]));
90 memset(&link
->wr_tx_bufs
[pnd_snd_idx
], 0,
91 sizeof(link
->wr_tx_bufs
[pnd_snd_idx
]));
92 if (!test_and_clear_bit(pnd_snd_idx
, link
->wr_tx_mask
))
95 for_each_set_bit(i
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
96 /* clear full struct smc_wr_tx_pend including .priv */
97 memset(&link
->wr_tx_pends
[i
], 0,
98 sizeof(link
->wr_tx_pends
[i
]));
99 memset(&link
->wr_tx_bufs
[i
], 0,
100 sizeof(link
->wr_tx_bufs
[i
]));
101 clear_bit(i
, link
->wr_tx_mask
);
103 /* terminate connections of this link group abnormally */
104 smc_lgr_terminate(smc_get_lgr(link
));
107 pnd_snd
.handler(&pnd_snd
.priv
, link
, wc
->status
);
108 wake_up(&link
->wr_tx_wait
);
111 static void smc_wr_tx_tasklet_fn(unsigned long data
)
113 struct smc_ib_device
*dev
= (struct smc_ib_device
*)data
;
114 struct ib_wc wc
[SMC_WR_MAX_POLL_CQE
];
121 memset(&wc
, 0, sizeof(wc
));
122 rc
= ib_poll_cq(dev
->roce_cq_send
, SMC_WR_MAX_POLL_CQE
, wc
);
124 ib_req_notify_cq(dev
->roce_cq_send
,
126 IB_CQ_REPORT_MISSED_EVENTS
);
130 for (i
= 0; i
< rc
; i
++)
131 smc_wr_tx_process_cqe(&wc
[i
]);
137 void smc_wr_tx_cq_handler(struct ib_cq
*ib_cq
, void *cq_context
)
139 struct smc_ib_device
*dev
= (struct smc_ib_device
*)cq_context
;
141 tasklet_schedule(&dev
->send_tasklet
);
/*---------------------------- request submission ---------------------------*/
146 static inline int smc_wr_tx_get_free_slot_index(struct smc_link
*link
, u32
*idx
)
148 *idx
= link
->wr_tx_cnt
;
149 for_each_clear_bit(*idx
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
150 if (!test_and_set_bit(*idx
, link
->wr_tx_mask
))
153 *idx
= link
->wr_tx_cnt
;
158 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
159 * and sets info for pending transmit tracking
160 * @link: Pointer to smc_link used to later send the message.
161 * @handler: Send completion handler function pointer.
162 * @wr_buf: Out value returns pointer to message buffer.
163 * @wr_pend_priv: Out value returns pointer serving as handler context.
165 * Return: 0 on success, or -errno on error.
167 int smc_wr_tx_get_free_slot(struct smc_link
*link
,
168 smc_wr_tx_handler handler
,
169 struct smc_wr_buf
**wr_buf
,
170 struct smc_wr_tx_pend_priv
**wr_pend_priv
)
172 struct smc_wr_tx_pend
*wr_pend
;
173 u32 idx
= link
->wr_tx_cnt
;
174 struct ib_send_wr
*wr_ib
;
179 *wr_pend_priv
= NULL
;
181 rc
= smc_wr_tx_get_free_slot_index(link
, &idx
);
185 rc
= wait_event_timeout(
187 link
->state
== SMC_LNK_INACTIVE
||
188 (smc_wr_tx_get_free_slot_index(link
, &idx
) != -EBUSY
),
189 SMC_WR_TX_WAIT_FREE_SLOT_TIME
);
191 /* timeout - terminate connections */
192 smc_lgr_terminate(smc_get_lgr(link
));
195 if (idx
== link
->wr_tx_cnt
)
198 wr_id
= smc_wr_tx_get_next_wr_id(link
);
199 wr_pend
= &link
->wr_tx_pends
[idx
];
200 wr_pend
->wr_id
= wr_id
;
201 wr_pend
->handler
= handler
;
202 wr_pend
->link
= link
;
204 wr_ib
= &link
->wr_tx_ibs
[idx
];
205 wr_ib
->wr_id
= wr_id
;
206 *wr_buf
= &link
->wr_tx_bufs
[idx
];
207 *wr_pend_priv
= &wr_pend
->priv
;
211 int smc_wr_tx_put_slot(struct smc_link
*link
,
212 struct smc_wr_tx_pend_priv
*wr_pend_priv
)
214 struct smc_wr_tx_pend
*pend
;
216 pend
= container_of(wr_pend_priv
, struct smc_wr_tx_pend
, priv
);
217 if (pend
->idx
< link
->wr_tx_cnt
) {
220 /* clear the full struct smc_wr_tx_pend including .priv */
221 memset(&link
->wr_tx_pends
[pend
->idx
], 0,
222 sizeof(link
->wr_tx_pends
[pend
->idx
]));
223 memset(&link
->wr_tx_bufs
[pend
->idx
], 0,
224 sizeof(link
->wr_tx_bufs
[pend
->idx
]));
225 test_and_clear_bit(idx
, link
->wr_tx_mask
);
232 /* Send prepared WR slot via ib_post_send.
233 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
235 int smc_wr_tx_send(struct smc_link
*link
, struct smc_wr_tx_pend_priv
*priv
)
237 struct smc_wr_tx_pend
*pend
;
240 ib_req_notify_cq(link
->smcibdev
->roce_cq_send
,
241 IB_CQ_NEXT_COMP
| IB_CQ_REPORT_MISSED_EVENTS
);
242 pend
= container_of(priv
, struct smc_wr_tx_pend
, priv
);
243 rc
= ib_post_send(link
->roce_qp
, &link
->wr_tx_ibs
[pend
->idx
], NULL
);
245 smc_wr_tx_put_slot(link
, priv
);
246 smc_lgr_terminate(smc_get_lgr(link
));
251 /* Register a memory region and wait for result. */
252 int smc_wr_reg_send(struct smc_link
*link
, struct ib_mr
*mr
)
256 ib_req_notify_cq(link
->smcibdev
->roce_cq_send
,
257 IB_CQ_NEXT_COMP
| IB_CQ_REPORT_MISSED_EVENTS
);
258 link
->wr_reg_state
= POSTED
;
259 link
->wr_reg
.wr
.wr_id
= (u64
)(uintptr_t)mr
;
260 link
->wr_reg
.mr
= mr
;
261 link
->wr_reg
.key
= mr
->rkey
;
262 rc
= ib_post_send(link
->roce_qp
, &link
->wr_reg
.wr
, NULL
);
266 rc
= wait_event_interruptible_timeout(link
->wr_reg_wait
,
267 (link
->wr_reg_state
!= POSTED
),
268 SMC_WR_REG_MR_WAIT_TIME
);
270 /* timeout - terminate connections */
271 smc_lgr_terminate(smc_get_lgr(link
));
274 if (rc
== -ERESTARTSYS
)
276 switch (link
->wr_reg_state
) {
290 void smc_wr_tx_dismiss_slots(struct smc_link
*link
, u8 wr_tx_hdr_type
,
291 smc_wr_tx_filter filter
,
292 smc_wr_tx_dismisser dismisser
,
295 struct smc_wr_tx_pend_priv
*tx_pend
;
296 struct smc_wr_rx_hdr
*wr_tx
;
299 for_each_set_bit(i
, link
->wr_tx_mask
, link
->wr_tx_cnt
) {
300 wr_tx
= (struct smc_wr_rx_hdr
*)&link
->wr_tx_bufs
[i
];
301 if (wr_tx
->type
!= wr_tx_hdr_type
)
303 tx_pend
= &link
->wr_tx_pends
[i
].priv
;
304 if (filter(tx_pend
, data
))
/****************************** receive queue ********************************/
311 int smc_wr_rx_register_handler(struct smc_wr_rx_handler
*handler
)
313 struct smc_wr_rx_handler
*h_iter
;
316 spin_lock(&smc_wr_rx_hash_lock
);
317 hash_for_each_possible(smc_wr_rx_hash
, h_iter
, list
, handler
->type
) {
318 if (h_iter
->type
== handler
->type
) {
323 hash_add(smc_wr_rx_hash
, &handler
->list
, handler
->type
);
325 spin_unlock(&smc_wr_rx_hash_lock
);
329 /* Demultiplex a received work request based on the message type to its handler.
330 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
331 * and not being modified any more afterwards so we don't need to lock it.
333 static inline void smc_wr_rx_demultiplex(struct ib_wc
*wc
)
335 struct smc_link
*link
= (struct smc_link
*)wc
->qp
->qp_context
;
336 struct smc_wr_rx_handler
*handler
;
337 struct smc_wr_rx_hdr
*wr_rx
;
341 if (wc
->byte_len
< sizeof(*wr_rx
))
342 return; /* short message */
343 temp_wr_id
= wc
->wr_id
;
344 index
= do_div(temp_wr_id
, link
->wr_rx_cnt
);
345 wr_rx
= (struct smc_wr_rx_hdr
*)&link
->wr_rx_bufs
[index
];
346 hash_for_each_possible(smc_wr_rx_hash
, handler
, list
, wr_rx
->type
) {
347 if (handler
->type
== wr_rx
->type
)
348 handler
->handler(wc
, wr_rx
);
352 static inline void smc_wr_rx_process_cqes(struct ib_wc wc
[], int num
)
354 struct smc_link
*link
;
357 for (i
= 0; i
< num
; i
++) {
358 link
= wc
[i
].qp
->qp_context
;
359 if (wc
[i
].status
== IB_WC_SUCCESS
) {
360 link
->wr_rx_tstamp
= jiffies
;
361 smc_wr_rx_demultiplex(&wc
[i
]);
362 smc_wr_rx_post(link
); /* refill WR RX */
364 /* handle status errors */
365 switch (wc
[i
].status
) {
366 case IB_WC_RETRY_EXC_ERR
:
367 case IB_WC_RNR_RETRY_EXC_ERR
:
368 case IB_WC_WR_FLUSH_ERR
:
369 /* terminate connections of this link group
372 smc_lgr_terminate(smc_get_lgr(link
));
375 smc_wr_rx_post(link
); /* refill WR RX */
382 static void smc_wr_rx_tasklet_fn(unsigned long data
)
384 struct smc_ib_device
*dev
= (struct smc_ib_device
*)data
;
385 struct ib_wc wc
[SMC_WR_MAX_POLL_CQE
];
392 memset(&wc
, 0, sizeof(wc
));
393 rc
= ib_poll_cq(dev
->roce_cq_recv
, SMC_WR_MAX_POLL_CQE
, wc
);
395 ib_req_notify_cq(dev
->roce_cq_recv
,
397 | IB_CQ_REPORT_MISSED_EVENTS
);
401 smc_wr_rx_process_cqes(&wc
[0], rc
);
407 void smc_wr_rx_cq_handler(struct ib_cq
*ib_cq
, void *cq_context
)
409 struct smc_ib_device
*dev
= (struct smc_ib_device
*)cq_context
;
411 tasklet_schedule(&dev
->recv_tasklet
);
414 int smc_wr_rx_post_init(struct smc_link
*link
)
419 for (i
= 0; i
< link
->wr_rx_cnt
; i
++)
420 rc
= smc_wr_rx_post(link
);
/***************************** init, exit, misc ******************************/
426 void smc_wr_remember_qp_attr(struct smc_link
*lnk
)
428 struct ib_qp_attr
*attr
= &lnk
->qp_attr
;
429 struct ib_qp_init_attr init_attr
;
431 memset(attr
, 0, sizeof(*attr
));
432 memset(&init_attr
, 0, sizeof(init_attr
));
433 ib_query_qp(lnk
->roce_qp
, attr
,
446 IB_QP_MIN_RNR_TIMER
|
448 IB_QP_PATH_MIG_STATE
|
453 lnk
->wr_tx_cnt
= min_t(size_t, SMC_WR_BUF_CNT
,
454 lnk
->qp_attr
.cap
.max_send_wr
);
455 lnk
->wr_rx_cnt
= min_t(size_t, SMC_WR_BUF_CNT
* 3,
456 lnk
->qp_attr
.cap
.max_recv_wr
);
459 static void smc_wr_init_sge(struct smc_link
*lnk
)
463 for (i
= 0; i
< lnk
->wr_tx_cnt
; i
++) {
464 lnk
->wr_tx_sges
[i
].addr
=
465 lnk
->wr_tx_dma_addr
+ i
* SMC_WR_BUF_SIZE
;
466 lnk
->wr_tx_sges
[i
].length
= SMC_WR_TX_SIZE
;
467 lnk
->wr_tx_sges
[i
].lkey
= lnk
->roce_pd
->local_dma_lkey
;
468 lnk
->wr_tx_ibs
[i
].next
= NULL
;
469 lnk
->wr_tx_ibs
[i
].sg_list
= &lnk
->wr_tx_sges
[i
];
470 lnk
->wr_tx_ibs
[i
].num_sge
= 1;
471 lnk
->wr_tx_ibs
[i
].opcode
= IB_WR_SEND
;
472 lnk
->wr_tx_ibs
[i
].send_flags
=
473 IB_SEND_SIGNALED
| IB_SEND_SOLICITED
;
475 for (i
= 0; i
< lnk
->wr_rx_cnt
; i
++) {
476 lnk
->wr_rx_sges
[i
].addr
=
477 lnk
->wr_rx_dma_addr
+ i
* SMC_WR_BUF_SIZE
;
478 lnk
->wr_rx_sges
[i
].length
= SMC_WR_BUF_SIZE
;
479 lnk
->wr_rx_sges
[i
].lkey
= lnk
->roce_pd
->local_dma_lkey
;
480 lnk
->wr_rx_ibs
[i
].next
= NULL
;
481 lnk
->wr_rx_ibs
[i
].sg_list
= &lnk
->wr_rx_sges
[i
];
482 lnk
->wr_rx_ibs
[i
].num_sge
= 1;
484 lnk
->wr_reg
.wr
.next
= NULL
;
485 lnk
->wr_reg
.wr
.num_sge
= 0;
486 lnk
->wr_reg
.wr
.send_flags
= IB_SEND_SIGNALED
;
487 lnk
->wr_reg
.wr
.opcode
= IB_WR_REG_MR
;
488 lnk
->wr_reg
.access
= IB_ACCESS_LOCAL_WRITE
| IB_ACCESS_REMOTE_WRITE
;
491 void smc_wr_free_link(struct smc_link
*lnk
)
493 struct ib_device
*ibdev
;
495 memset(lnk
->wr_tx_mask
, 0,
496 BITS_TO_LONGS(SMC_WR_BUF_CNT
) * sizeof(*lnk
->wr_tx_mask
));
500 ibdev
= lnk
->smcibdev
->ibdev
;
502 if (lnk
->wr_rx_dma_addr
) {
503 ib_dma_unmap_single(ibdev
, lnk
->wr_rx_dma_addr
,
504 SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
506 lnk
->wr_rx_dma_addr
= 0;
508 if (lnk
->wr_tx_dma_addr
) {
509 ib_dma_unmap_single(ibdev
, lnk
->wr_tx_dma_addr
,
510 SMC_WR_BUF_SIZE
* lnk
->wr_tx_cnt
,
512 lnk
->wr_tx_dma_addr
= 0;
516 void smc_wr_free_link_mem(struct smc_link
*lnk
)
518 kfree(lnk
->wr_tx_pends
);
519 lnk
->wr_tx_pends
= NULL
;
520 kfree(lnk
->wr_tx_mask
);
521 lnk
->wr_tx_mask
= NULL
;
522 kfree(lnk
->wr_tx_sges
);
523 lnk
->wr_tx_sges
= NULL
;
524 kfree(lnk
->wr_rx_sges
);
525 lnk
->wr_rx_sges
= NULL
;
526 kfree(lnk
->wr_rx_ibs
);
527 lnk
->wr_rx_ibs
= NULL
;
528 kfree(lnk
->wr_tx_ibs
);
529 lnk
->wr_tx_ibs
= NULL
;
530 kfree(lnk
->wr_tx_bufs
);
531 lnk
->wr_tx_bufs
= NULL
;
532 kfree(lnk
->wr_rx_bufs
);
533 lnk
->wr_rx_bufs
= NULL
;
536 int smc_wr_alloc_link_mem(struct smc_link
*link
)
538 /* allocate link related memory */
539 link
->wr_tx_bufs
= kcalloc(SMC_WR_BUF_CNT
, SMC_WR_BUF_SIZE
, GFP_KERNEL
);
540 if (!link
->wr_tx_bufs
)
542 link
->wr_rx_bufs
= kcalloc(SMC_WR_BUF_CNT
* 3, SMC_WR_BUF_SIZE
,
544 if (!link
->wr_rx_bufs
)
545 goto no_mem_wr_tx_bufs
;
546 link
->wr_tx_ibs
= kcalloc(SMC_WR_BUF_CNT
, sizeof(link
->wr_tx_ibs
[0]),
548 if (!link
->wr_tx_ibs
)
549 goto no_mem_wr_rx_bufs
;
550 link
->wr_rx_ibs
= kcalloc(SMC_WR_BUF_CNT
* 3,
551 sizeof(link
->wr_rx_ibs
[0]),
553 if (!link
->wr_rx_ibs
)
554 goto no_mem_wr_tx_ibs
;
555 link
->wr_tx_sges
= kcalloc(SMC_WR_BUF_CNT
, sizeof(link
->wr_tx_sges
[0]),
557 if (!link
->wr_tx_sges
)
558 goto no_mem_wr_rx_ibs
;
559 link
->wr_rx_sges
= kcalloc(SMC_WR_BUF_CNT
* 3,
560 sizeof(link
->wr_rx_sges
[0]),
562 if (!link
->wr_rx_sges
)
563 goto no_mem_wr_tx_sges
;
564 link
->wr_tx_mask
= kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT
),
565 sizeof(*link
->wr_tx_mask
),
567 if (!link
->wr_tx_mask
)
568 goto no_mem_wr_rx_sges
;
569 link
->wr_tx_pends
= kcalloc(SMC_WR_BUF_CNT
,
570 sizeof(link
->wr_tx_pends
[0]),
572 if (!link
->wr_tx_pends
)
573 goto no_mem_wr_tx_mask
;
577 kfree(link
->wr_tx_mask
);
579 kfree(link
->wr_rx_sges
);
581 kfree(link
->wr_tx_sges
);
583 kfree(link
->wr_rx_ibs
);
585 kfree(link
->wr_tx_ibs
);
587 kfree(link
->wr_rx_bufs
);
589 kfree(link
->wr_tx_bufs
);
594 void smc_wr_remove_dev(struct smc_ib_device
*smcibdev
)
596 tasklet_kill(&smcibdev
->recv_tasklet
);
597 tasklet_kill(&smcibdev
->send_tasklet
);
600 void smc_wr_add_dev(struct smc_ib_device
*smcibdev
)
602 tasklet_init(&smcibdev
->recv_tasklet
, smc_wr_rx_tasklet_fn
,
603 (unsigned long)smcibdev
);
604 tasklet_init(&smcibdev
->send_tasklet
, smc_wr_tx_tasklet_fn
,
605 (unsigned long)smcibdev
);
608 int smc_wr_create_link(struct smc_link
*lnk
)
610 struct ib_device
*ibdev
= lnk
->smcibdev
->ibdev
;
613 smc_wr_tx_set_wr_id(&lnk
->wr_tx_id
, 0);
615 lnk
->wr_rx_dma_addr
= ib_dma_map_single(
616 ibdev
, lnk
->wr_rx_bufs
, SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
618 if (ib_dma_mapping_error(ibdev
, lnk
->wr_rx_dma_addr
)) {
619 lnk
->wr_rx_dma_addr
= 0;
623 lnk
->wr_tx_dma_addr
= ib_dma_map_single(
624 ibdev
, lnk
->wr_tx_bufs
, SMC_WR_BUF_SIZE
* lnk
->wr_tx_cnt
,
626 if (ib_dma_mapping_error(ibdev
, lnk
->wr_tx_dma_addr
)) {
630 smc_wr_init_sge(lnk
);
631 memset(lnk
->wr_tx_mask
, 0,
632 BITS_TO_LONGS(SMC_WR_BUF_CNT
) * sizeof(*lnk
->wr_tx_mask
));
633 init_waitqueue_head(&lnk
->wr_tx_wait
);
634 init_waitqueue_head(&lnk
->wr_reg_wait
);
638 ib_dma_unmap_single(ibdev
, lnk
->wr_rx_dma_addr
,
639 SMC_WR_BUF_SIZE
* lnk
->wr_rx_cnt
,
641 lnk
->wr_rx_dma_addr
= 0;