// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
 * are submitted to either RC SQ or RC RQ respectively
 * (reliably connected send/receive queue)
 * and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through a send or receive completion queue (CQ) respectively,
 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split work by using
 * bottom halves implemented by tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
 */

#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <asm/div64.h>

#include "smc.h"
#include "smc_wr.h"

#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */

#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
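
/* Typical send-side usage of this API (overview, see the functions below):
 * smc_wr_tx_get_free_slot() reserves a send buffer plus pending-request slot,
 * the caller fills the returned buffer, smc_wr_tx_send() posts it to the
 * RC QP, and smc_wr_tx_process_cqe() invokes the registered completion
 * handler once the corresponding CQE arrives in the send CQ.
 */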

struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;
	u32			idx;
	struct smc_wr_tx_pend_priv priv;
	u8			compl_requested;
};

/******************************** send queue *********************************/

/*------------------------------- completion --------------------------------*/

/* returns true if at least one tx work request is pending on the given link */
static inline bool smc_wr_is_tx_pend(struct smc_link *link)
{
	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
}

/* wait till all pending tx work requests on the given link are completed */
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}

static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
	u32 i;

	for (i = 0; i < link->wr_tx_cnt; i++) {
		if (link->wr_tx_pends[i].wr_id == wr_id)
			return i;
	}
	return link->wr_tx_cnt;
}
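
/* Process one send completion: handle IB_WC_REG_MR completions for memory
 * registration, otherwise look up the pending send (regular slot or the
 * single SMC-Rv2 slot), clear it, and call its completion handler.
 */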
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}
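
/* Tasklet (bottom half) for the send CQ: polls up to SMC_WR_MAX_POLL_CQE
 * completions per iteration and feeds each one to smc_wr_tx_process_cqe().
 */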
static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int i, rc;
	int polled = 0;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_send,
					 IB_CQ_NEXT_COMP |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		for (i = 0; i < rc; i++)
			smc_wr_tx_process_cqe(&wc[i]);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->send_tasklet);
}

/*---------------------------- request submission ---------------------------*/

static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_sendable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}

/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * Return: 0 on success, or -errno on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_sendable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
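
/* Reserve the single large SMC-Rv2 send buffer and its pending slot;
 * returns -EBUSY while a previous v2 send is still outstanding.
 */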
int smc_wr_tx_get_v2_slot(struct smc_link *link,
			  smc_wr_tx_handler handler,
			  struct smc_wr_v2_buf **wr_buf,
			  struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_wr_tx_pend *wr_pend;
	struct ib_send_wr *wr_ib;
	u64 wr_id;

	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
		return -EBUSY;

	*wr_pend_priv = NULL;
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = link->wr_tx_v2_pend;
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = link->wr_tx_cnt;
	wr_ib = link->wr_tx_v2_ib;
	wr_ib->wr_id = wr_id;
	*wr_buf = link->lgr->wr_tx_buf_v2;
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
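
/* Return an unused slot: clear the pending data and send buffer and, for a
 * regular slot, release its bit in wr_tx_mask. Returns 1 if a slot was
 * freed, 0 otherwise.
 */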
int smc_wr_tx_put_slot(struct smc_link *link,
		       struct smc_wr_tx_pend_priv *wr_pend_priv)
{
	struct smc_wr_tx_pend *pend;

	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
	if (pend->idx < link->wr_tx_cnt) {
		u32 idx = pend->idx;

		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[idx], 0,
		       sizeof(link->wr_tx_pends[idx]));
		memset(&link->wr_tx_bufs[idx], 0,
		       sizeof(link->wr_tx_bufs[idx]));
		test_and_clear_bit(idx, link->wr_tx_mask);
		wake_up(&link->wr_tx_wait);
		return 1;
	} else if (link->lgr->smc_version == SMC_V2 &&
		   pend->idx == link->wr_tx_cnt) {
		/* Large v2 buffer */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
		return 1;
	}

	return 0;
}

/* Send prepared WR slot via ib_post_send.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}
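
/* Send the prepared SMC-Rv2 slot via ib_post_send; @len is the actual
 * message length within the large v2 send buffer.
 */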
int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
		      int len)
{
	int rc;

	link->wr_tx_v2_ib->sg_list[0].length = len;
	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

/* Send prepared WR slot via ib_post_send and wait for send completion
 * notification.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
			unsigned long timeout)
{
	struct smc_wr_tx_pend *pend;
	u32 pnd_idx;
	int rc;

	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	pend->compl_requested = 1;
	pnd_idx = pend->idx;
	init_completion(&link->wr_tx_compl[pnd_idx]);

	rc = smc_wr_tx_send(link, priv);
	if (rc)
		return rc;
	/* wait for completion by smc_wr_tx_process_cqe() */
	rc = wait_for_completion_interruptible_timeout(
					&link->wr_tx_compl[pnd_idx], timeout);
	if (rc <= 0)
		rc = -ENODATA;
	if (rc > 0)
		rc = 0;
	return rc;
}

/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	link->wr_reg_state = POSTED;
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
	if (rc)
		return rc;

	percpu_ref_get(&link->wr_reg_refs);
	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	percpu_ref_put(&link->wr_reg_refs);
	if (!rc) {
		/* timeout - terminate link */
		smcr_link_down_cond_sched(link);
		return -EPIPE;
	}
	if (rc == -ERESTARTSYS)
		return -EINTR;
	switch (link->wr_reg_state) {
	case CONFIRMED:
		rc = 0;
		break;
	case FAILED:
		rc = -EIO;
		break;
	case POSTED:
		rc = -EPIPE;
		break;
	}
	return rc;
}

/****************************** receive queue ********************************/
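
/* Register a receive handler for a given message type; only one handler may
 * exist per type, and registration must be complete before any receive WRs
 * are posted (the hash is read lock-free afterwards).
 */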
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
{
	struct smc_wr_rx_handler *h_iter;
	int rc = 0;

	spin_lock(&smc_wr_rx_hash_lock);
	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
		if (h_iter->type == handler->type) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}
	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
out_unlock:
	spin_unlock(&smc_wr_rx_hash_lock);
	return rc;
}

/* Demultiplex a received work request based on the message type to its handler.
 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
 * and not being modified any more afterwards so we don't need to lock it.
 */
static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
	struct smc_wr_rx_handler *handler;
	struct smc_wr_rx_hdr *wr_rx;
	u64 temp_wr_id;
	u32 index;

	if (wc->byte_len < sizeof(*wr_rx))
		return; /* short message */
	temp_wr_id = wc->wr_id;
	index = do_div(temp_wr_id, link->wr_rx_cnt);
	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
		if (handler->type == wr_rx->type)
			handler->handler(wc, wr_rx);
	}
}
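
/* Process a batch of receive completions: on success demultiplex the message
 * to its handler and repost the receive WR; on fatal errors schedule link
 * termination and wake up waiters once the receive queue has drained.
 */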
static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
{
	struct smc_link *link;
	int i;

	for (i = 0; i < num; i++) {
		link = wc[i].qp->qp_context;
		link->wr_rx_id_compl = wc[i].wr_id;
		if (wc[i].status == IB_WC_SUCCESS) {
			link->wr_rx_tstamp = jiffies;
			smc_wr_rx_demultiplex(&wc[i]);
			smc_wr_rx_post(link); /* refill WR RX */
		} else {
			/* handle status errors */
			switch (wc[i].status) {
			case IB_WC_RETRY_EXC_ERR:
			case IB_WC_RNR_RETRY_EXC_ERR:
			case IB_WC_WR_FLUSH_ERR:
				smcr_link_down_cond_sched(link);
				if (link->wr_rx_id_compl == link->wr_rx_id)
					wake_up(&link->wr_rx_empty_wait);
				break;
			default:
				smc_wr_rx_post(link); /* refill WR RX */
				break;
			}
		}
	}
}

static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int polled = 0;
	int rc;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_recv,
					 IB_CQ_SOLICITED_MASK
					 | IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		smc_wr_rx_process_cqes(&wc[0], rc);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->recv_tasklet);
}
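
/* Post the initial set of receive work requests, one per receive buffer. */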
int smc_wr_rx_post_init(struct smc_link *link)
{
	u32 i;
	int rc = 0;

	for (i = 0; i < link->wr_rx_cnt; i++)
		rc = smc_wr_rx_post(link);
	return rc;
}

/***************************** init, exit, misc ******************************/

void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
	struct ib_qp_attr *attr = &lnk->qp_attr;
	struct ib_qp_init_attr init_attr;

	memset(attr, 0, sizeof(*attr));
	memset(&init_attr, 0, sizeof(init_attr));
	ib_query_qp(lnk->roce_qp, attr,
		    IB_QP_STATE |
		    IB_QP_CUR_STATE |
		    IB_QP_PKEY_INDEX |
		    IB_QP_PORT |
		    IB_QP_QKEY |
		    IB_QP_AV |
		    IB_QP_PATH_MTU |
		    IB_QP_TIMEOUT |
		    IB_QP_RETRY_CNT |
		    IB_QP_RNR_RETRY |
		    IB_QP_RQ_PSN |
		    IB_QP_ALT_PATH |
		    IB_QP_MIN_RNR_TIMER |
		    IB_QP_SQ_PSN |
		    IB_QP_PATH_MIG_STATE |
		    IB_QP_CAP |
		    IB_QP_DEST_QPN,
		    &init_attr);

	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
			       lnk->qp_attr.cap.max_send_wr);
	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
			       lnk->qp_attr.cap.max_recv_wr);
}
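
/* Initialize the scatter/gather elements and work request templates for all
 * send and receive buffers of a link, including the SMC-Rv2 send WR and the
 * memory registration WR.
 */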
static void smc_wr_init_sge(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
	u32 i;

	for (i = 0; i < lnk->wr_tx_cnt; i++) {
		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_ibs[i].next = NULL;
		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
		lnk->wr_tx_ibs[i].num_sge = 1;
		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
		lnk->wr_tx_ibs[i].send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		if (send_inline)
			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
	}

	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;

		lnk->wr_tx_v2_ib->next = NULL;
		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
		lnk->wr_tx_v2_ib->num_sge = 1;
		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
		lnk->wr_tx_v2_ib->send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	}

	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
	 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
	 * that is shared by all receive WRs. When a larger message arrives,
	 * the content of the first small sge is copied to the beginning of
	 * the larger spillover buffer, allowing easy data mapping.
	 */
	for (i = 0; i < lnk->wr_rx_cnt; i++) {
		int x = i * sges_per_buf;

		lnk->wr_rx_sges[x].addr =
			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
		if (lnk->lgr->smc_version == SMC_V2) {
			lnk->wr_rx_sges[x + 1].addr =
				lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].length =
				SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].lkey =
				lnk->roce_pd->local_dma_lkey;
		}
		lnk->wr_rx_ibs[i].next = NULL;
		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
	}
	lnk->wr_reg.wr.next = NULL;
	lnk->wr_reg.wr.num_sge = 0;
	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
}
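
/* Release the work request resources of a link: wake up waiters, wait for
 * pending sends and memory registrations to finish, and unmap the DMA
 * mappings of the send and receive buffers.
 */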
void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;
	ibdev = lnk->smcibdev->ibdev;

	smc_wr_drain_cq(lnk);
	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);

	smc_wr_tx_wait_no_pending_sends(lnk);
	percpu_ref_kill(&lnk->wr_reg_refs);
	wait_for_completion(&lnk->reg_ref_comp);
	percpu_ref_exit(&lnk->wr_reg_refs);
	percpu_ref_kill(&lnk->wr_tx_refs);
	wait_for_completion(&lnk->tx_ref_comp);
	percpu_ref_exit(&lnk->wr_tx_refs);

	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}

void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return;

	kfree(lgr->wr_rx_buf_v2);
	lgr->wr_rx_buf_v2 = NULL;
	kfree(lgr->wr_tx_buf_v2);
	lgr->wr_tx_buf_v2 = NULL;
}

void smc_wr_free_link_mem(struct smc_link *lnk)
{
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	bitmap_free(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}

int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return 0;

	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_rx_buf_v2)
		return -ENOMEM;
	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_tx_buf_v2) {
		kfree(lgr->wr_rx_buf_v2);
		return -ENOMEM;
	}
	return 0;
}
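
/* Allocate all per-link work request arrays (send/receive buffers, WRs,
 * SGEs, pending slots and completions); on failure, free everything
 * allocated so far in reverse order and return -ENOMEM.
 */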
int smc_wr_alloc_link_mem(struct smc_link *link)
{
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}

void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}

void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}

static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);

	complete(&lnk->tx_ref_comp);
}

static void smcr_wr_reg_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs);

	complete(&lnk->reg_ref_comp);
}
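
/* Map the link's send and receive buffers for DMA, initialize the work
 * request templates and the percpu reference counters used to track
 * outstanding sends and memory registrations.
 */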
int smc_wr_create_link(struct smc_link *lnk)
{
	struct ib_device *ibdev = lnk->smcibdev->ibdev;
	int rc = 0;

	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
	lnk->wr_rx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
		DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
		lnk->wr_rx_dma_addr = 0;
		rc = -EIO;
		goto out;
	}
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
			lnk->wr_rx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_TO_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
			lnk->wr_tx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
	}
	lnk->wr_tx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
		DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
		rc = -EIO;
		goto dma_unmap;
	}
	smc_wr_init_sge(lnk);
	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
	init_waitqueue_head(&lnk->wr_tx_wait);
	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto dma_unmap;
	init_completion(&lnk->tx_ref_comp);
	init_waitqueue_head(&lnk->wr_reg_wait);
	rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto cancel_ref;
	init_completion(&lnk->reg_ref_comp);
	init_waitqueue_head(&lnk->wr_rx_empty_wait);
	return rc;

cancel_ref:
	percpu_ref_exit(&lnk->wr_tx_refs);
dma_unmap:
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
			    DMA_FROM_DEVICE);
	lnk->wr_rx_dma_addr = 0;
out:
	return rc;
}