/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_cm.c
 * Oracle elects to have and use the contents of ib_cm.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/ib/clients/of/ofed_kernel.h>
#include <sys/ib/clients/of/rdma/ib_addr.h>
#include <sys/ib/clients/of/rdma/rdma_cm.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

extern int rdsv3_enable_snd_cq;
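
/*
 * Note on rdsv3_enable_snd_cq (the tunable declared above): when it is
 * nonzero, rdsv3_ib_setup_qp() below creates a dedicated send CQ
 * (ic->i_snd_cq) plus a send soft-interrupt thread for each connection;
 * when it is zero, send and receive completions share ic->i_cq.
 */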

/*
 * Set the selected protocol version
 */
void
rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
	    conn, version);
	conn->c_version = version;
}

void
rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
	    "Enter: conn: %p credits: %d", conn, credits);

	if (rdsv3_ib_sysctl_flow_control && credits != 0) {
		/* We're doing flow control */
		ic->i_flowctl = 1;
		rdsv3_ib_send_add_credits(conn, credits);
	} else {
		ic->i_flowctl = 0;
	}

	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
	    "Return: conn: %p credits: %d",
	    conn, credits);
}

/*
 * Tune RNR behavior. Without flow control, we use a rather
 * low timeout, but not the absolute minimum - this should
 * be tunable.
 *
 * We already set the RNR retry count to 7 (which is the
 * smallest infinite number :-) above.
 * If flow control is off, we want to change this back to 0
 * so that we learn quickly when our credit accounting is
 * buggy.
 *
 * Caller passes in a qp_attr pointer - don't waste stack space
 * by allocating this twice.
 */
void
rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
{
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
	    ic, attr);

	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
	if (ret)
		RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
		    "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
}
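
/*
 * For reference (from the IB spec's RNR NAK timer encoding, not stated in
 * this file): IB_RNR_TIMER_000_32 corresponds to roughly 0.32 ms, i.e. the
 * "rather low timeout, but not the absolute minimum" described in the
 * comment above.
 */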

/*
 * Connection established.
 * We get here for both outgoing and incoming connection.
 */
void
rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp = NULL;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev =
	    ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
	struct ib_qp_attr qp_attr;
	int err;

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Enter conn: %p event: %p", conn, event);

	if (event->param.conn.private_data_len >= sizeof (*dp)) {
		dp = event->param.conn.private_data;

		/* make sure it isn't empty data */
		if (dp->dp_protocol_major) {
			rdsv3_ib_set_protocol(conn,
			    RDS_PROTOCOL(dp->dp_protocol_major,
			    dp->dp_protocol_minor));
			rdsv3_ib_set_flow_control(conn,
			    ntohl(dp->dp_credit));
		}
	}

	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed",
		    NIPQUAD(conn->c_faddr),
		    RDS_PROTOCOL_MAJOR(conn->c_version),
		    RDS_PROTOCOL_MINOR(conn->c_version));
		rdsv3_conn_destroy(conn);
		return;
	} else {
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
		    NIPQUAD(conn->c_faddr),
		    RDS_PROTOCOL_MAJOR(conn->c_version),
		    RDS_PROTOCOL_MINOR(conn->c_version),
		    ic->i_flowctl ? ", flow control" : "");
	}

	ASSERT(ic->i_soft_cq == NULL);
	ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn,
	    (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
	    ic->i_cq->ibt_cq);
	if (rdsv3_enable_snd_cq) {
		ic->i_snd_soft_cq = rdsv3_af_intr_thr_create(
		    rdsv3_ib_snd_tasklet_fn,
		    (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
		    ic->i_snd_cq->ibt_cq);
	}

	/* rdsv3_ib_refill_fn is expecting i_max_recv_alloc set */
	ic->i_max_recv_alloc = rdsv3_ib_sysctl_max_recv_allocation;
	ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn,
	    SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp);
	rdsv3_af_grp_draw(rds_ibdev->aft_hcagp);

	(void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED);
	if (rdsv3_enable_snd_cq) {
		(void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP);
	}

	/*
	 * Init rings and fill recv. This needs to wait until protocol
	 * negotiation is complete, since ring layout is different
	 * from 3.0 to 3.1.
	 */
	rdsv3_ib_send_init_ring(ic);
	rdsv3_ib_recv_init_ring(ic);

	/*
	 * Post receive buffers - as a side effect, this will update
	 * the posted credit count.
	 */
	(void) rdsv3_ib_recv_refill(conn, 1);

	/* Tune RNR behavior */
	rdsv3_ib_tune_rnr(ic, &qp_attr);

	qp_attr.qp_state = IB_QPS_RTS;
	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);

	/* update ib_device with this local ipaddr & conn */
	err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "rdsv3_ib_update_ipaddr failed (%d)", err);
	rdsv3_ib_add_conn(rds_ibdev, conn);

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp && dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	rdsv3_connect_complete(conn);

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Return conn: %p event: %p",
	    conn, event);
}

void
rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
    struct rdma_conn_param *conn_param,
    struct rdsv3_ib_connect_private *dp,
    uint32_t protocol_version,
    uint32_t max_responder_resources,
    uint32_t max_initiator_depth)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev;

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Enter conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);

	(void) memset(conn_param, 0, sizeof (struct rdma_conn_param));

	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);

	conn_param->responder_resources =
	    MIN(rds_ibdev->max_responder_resources, max_responder_resources);
	conn_param->initiator_depth =
	    MIN(rds_ibdev->max_initiator_depth, max_initiator_depth);
	conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
	conn_param->rnr_retry_count = 7;
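
	/*
	 * Both retry counts are 3-bit CM fields, so 7 is the largest value
	 * that can be advertised; for the RNR retry count the IB spec
	 * additionally defines 7 as "retry indefinitely", which is what the
	 * "smallest infinite number" comment above rdsv3_ib_tune_rnr()
	 * alludes to.
	 */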

	if (dp) {
		(void) memset(dp, 0, sizeof (*dp));
		dp->dp_saddr = conn->c_laddr;
		dp->dp_daddr = conn->c_faddr;
		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
		dp->dp_protocol_minor_mask =
		    htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
		dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);

		/* Advertise flow control */
		if (ic->i_flowctl) {
			unsigned int credits;

			credits = IB_GET_POST_CREDITS(
			    atomic_get(&ic->i_credits));
			dp->dp_credit = htonl(credits);
			atomic_add_32(&ic->i_credits,
			    -IB_SET_POST_CREDITS(credits));
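
			/*
			 * Assumed layout of ic->i_credits (see the
			 * IB_*_POST_CREDITS macros in the RDS headers): the
			 * low 16 bits count send credits and the high 16
			 * bits count newly posted receive buffers, so the
			 * two statements above move the accumulated "post"
			 * credits into dp_credit for the peer without
			 * disturbing the send-credit half.
			 */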
		}

		conn_param->private_data = dp;
		conn_param->private_data_len = sizeof (*dp);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Return conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);
}

void
rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
{
	RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
	    event->event, data);
}

void
rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rdsv3_connection *conn = context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler",
	    "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq);

	rdsv3_af_thr_fire(ic->i_snd_soft_cq);
}

void
rdsv3_ib_snd_tasklet_fn(void *data)
{
	struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_ack_state ack_state = { 0, };
	ibt_wc_t wc;
	uint_t polled;

	RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
	    "Enter(conn: %p ic: %p)", conn, ic);

	/*
	 * Poll in a loop before and after enabling the next event
	 */
	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
	    IBT_SUCCESS) {
		RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
		    "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
		    (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
		    wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));

		ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
		rdsv3_ib_send_cqe_handler(ic, &wc);
	}
	(void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq),
	    IBT_NEXT_COMPLETION);
	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
	    IBT_SUCCESS) {
		RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
		    "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
		    (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
		    wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));

		ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
		rdsv3_ib_send_cqe_handler(ic, &wc);
	}
}

void
rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rdsv3_connection *conn = context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler",
	    "Enter(conn: %p cq: %p)", conn, cq);

	rdsv3_ib_stats_inc(s_ib_evt_handler_call);

	rdsv3_af_thr_fire(ic->i_soft_cq);
}

void
rdsv3_ib_refill_fn(void *data)
{
	struct rdsv3_connection *conn = (struct rdsv3_connection *)data;

	(void) rdsv3_ib_recv_refill(conn, 0);
}

void
rdsv3_ib_tasklet_fn(void *data)
{
	struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_ack_state ack_state = { 0, };
	ibt_wc_t wc[RDSV3_IB_WC_POLL_SIZE];
	uint_t polled;
	int i;

	RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
	    "Enter(conn: %p ic: %p)", conn, ic);

	rdsv3_ib_stats_inc(s_ib_tasklet_call);

	/*
	 * Poll in a loop before and after enabling the next event
	 */
	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
	    RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
		for (i = 0; i < polled; i++) {
			RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
			    "wc_id 0x%llx type %d status %u byte_len %u "
			    "imm_data %u\n",
			    (unsigned long long)wc[i].wc_id, wc[i].wc_type,
			    wc[i].wc_status, wc[i].wc_bytes_xfer,
			    ntohl(wc[i].wc_immed_data));

			if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
				rdsv3_ib_send_cqe_handler(ic, &wc[i]);
			} else {
				rdsv3_ib_recv_cqe_handler(ic, &wc[i],
				    &ack_state);
			}
		}
	}
	(void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq),
	    IBT_NEXT_SOLICITED);
	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
	    RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
		for (i = 0; i < polled; i++) {
			RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
			    "wc_id 0x%llx type %d status %u byte_len %u "
			    "imm_data %u\n",
			    (unsigned long long)wc[i].wc_id, wc[i].wc_type,
			    wc[i].wc_status, wc[i].wc_bytes_xfer,
			    ntohl(wc[i].wc_immed_data));

			if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
				rdsv3_ib_send_cqe_handler(ic, &wc[i]);
			} else {
				rdsv3_ib_recv_cqe_handler(ic, &wc[i],
				    &ack_state);
			}
		}
	}

	if (ack_state.ack_next_valid) {
		rdsv3_ib_set_ack(ic, ack_state.ack_next,
		    ack_state.ack_required);
	}
	if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) {
		rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL);
		ic->i_ack_recv = ack_state.ack_recv;
	}
	if (rdsv3_conn_up(conn)) {
		if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
			(void) rdsv3_send_xmit(ic->conn);
		rdsv3_ib_attempt_ack(ic);
	}
}

void
rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rdsv3_connection *conn = data;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
	    conn, ic, event->event);

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		(void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		if (conn) {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - "
			    "connection %u.%u.%u.%u ->%u.%u.%u.%u "
			    "...reconnecting",
			    event->event, NIPQUAD(conn->c_laddr),
			    NIPQUAD(conn->c_faddr));
			rdsv3_conn_drop(conn);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
			    "RDS/IB: Fatal QP Event %u - connection"
			    "...reconnecting", event->event);
		}
		break;
	}

	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
	    conn, event);
}

extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);
extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
    struct rdsv3_ib_connection *ic);

/*
 * This needs to be very careful to not leave IS_ERR pointers around for
 * cleanup to trip over.
 */
int
rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct ib_qp_init_attr attr;
	struct rdsv3_ib_device *rds_ibdev;
	ibt_send_wr_t *wrp;
	ibt_wr_ds_t *sgl;
	int ret, i;

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);

	/*
	 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
	 * and allocates a protection domain, memory range and FMR pool
	 * for each. If that fails for any reason, it will not register
	 * the rds_ibdev at all.
	 */
	rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
	if (rds_ibdev == NULL) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "RDS/IB: No client_data for device %s", dev->name);
		return (-EOPNOTSUPP);
	}
	ic->rds_ibdev = rds_ibdev;

	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);

	/* Protection domain and memory range */
	ic->i_pd = rds_ibdev->pd;

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 */
	ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1,
	    rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
	if (IS_ERR(ic->i_cq)) {
		ret = PTR_ERR(ic->i_cq);
		ic->i_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq failed: %d", ret);
		goto out;
	}
	if (rdsv3_enable_snd_cq) {
		ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler,
		    rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1,
		    rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
		if (IS_ERR(ic->i_snd_cq)) {
			ret = PTR_ERR(ic->i_snd_cq);
			(void) ib_destroy_cq(ic->i_cq);
			ic->i_cq = NULL;
			ic->i_snd_cq = NULL;
			RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
			    "ib_create_cq send cq failed: %d", ret);
			goto out;
		}
	}

	/* XXX negotiate max send/recv with remote? */
	(void) memset(&attr, 0, sizeof (attr));
	attr.event_handler = rdsv3_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = rds_ibdev->max_sge;
	attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
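
	/*
	 * IB_SIGNAL_REQ_WR: only send WRs posted with an explicit "signaled"
	 * flag generate send-CQ entries; receive completions are always
	 * reported. This keeps CQ traffic limited to the entries the send
	 * path actually asks to see.
	 */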
	attr.qp_type = IB_QPT_RC;
	if (rdsv3_enable_snd_cq) {
		attr.send_cq = ic->i_snd_cq;
	} else {
		attr.send_cq = ic->i_cq;
	}
	attr.recv_cq = ic->i_cq;

	/*
	 * XXX this can fail if max_*_wr is too large? Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdma_create_qp failed: %d", ret);
		goto out;
	}

	ret = rdsv3_ib_alloc_hdrs(dev, ic);
	if (ret != 0) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdsv3_ib_alloc_hdrs failed: %d", ret);
		goto out;
	}

	ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
	if (ic->i_sends == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "send allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work));

	ic->i_send_wrs =
	    kmem_alloc(ic->i_send_ring.w_nr * (sizeof (ibt_send_wr_t) +
	    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
	if (ic->i_send_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Send WR allocation failed: %d", ret);
		goto out;
	}
	sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
	    (ic->i_send_ring.w_nr * sizeof (ibt_send_wr_t)));
	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
		wrp = &ic->i_send_wrs[i];
		wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
	}

	ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
	if (ic->i_recvs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "recv allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work));

	ic->i_recv_wrs =
	    kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t),
	    KM_NOSLEEP);
	if (ic->i_recv_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Recv WR allocation failed: %d", ret);
		goto out;
	}

	rdsv3_ib_recv_init_ack(ic);

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p",
	    conn, ic->i_pd, ic->i_mr, ic->i_cq);

out:
	return (ret);
}

uint32_t
rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	uint16_t common;
	uint32_t version = 0;

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
	    event);

	/*
	 * rdma_cm private data is odd - when there is any private data in the
	 * request, we will be given a pretty large buffer without telling us
	 * the original size. The only way to tell the difference is by looking
	 * at the contents, which are initialized to zero.
	 * If the protocol version fields aren't set, this is a connection
	 * attempt from an older version. This could be 3.0 or 2.0 - we
	 * can't tell.
	 * We really should have changed this for OFED 1.3 :-(
	 */
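
	/*
	 * A sketch of the assumed version encoding (see the RDS_PROTOCOL*
	 * macros in the headers): the major number lives in the high byte
	 * and the minor in the low byte, so RDS_PROTOCOL(3, 1) orders after
	 * RDS_PROTOCOL(3, 0), while dp_protocol_minor_mask carries one bit
	 * per minor version the sender supports; the loop below picks the
	 * highest minor common to both sides.
	 */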

	/* Be paranoid. RDS always has privdata */
	if (!event->param.conn.private_data_len) {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS incoming connection has no private data, rejecting");
		return (0);
	}

	/* Even if len is crap *now* I still want to check it. -ASG */
	if (event->param.conn.private_data_len < sizeof (*dp) ||
	    dp->dp_protocol_major == 0)
		return (RDS_PROTOCOL_3_0);

	common = ntohs(dp->dp_protocol_minor_mask) &
	    RDSV3_IB_SUPPORTED_PROTOCOLS;
	if (dp->dp_protocol_major == 3 && common) {
		version = RDS_PROTOCOL_3_0;
		while ((common >>= 1) != 0)
			version++;
	} else {
		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
		    "RDS: Connection from %u.%u.%u.%u using "
		    "incompatible protocol version %u.%u\n",
		    NIPQUAD(dp->dp_saddr),
		    dp->dp_protocol_major,
		    dp->dp_protocol_minor);
	}

	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
	    event);

	return (version);
}

int
rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event)
{
	uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
	uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	struct rdsv3_ib_connect_private dp_rep;
	struct rdsv3_connection *conn = NULL;
	struct rdsv3_ib_connection *ic = NULL;
	struct rdma_conn_param conn_param;
	uint32_t version;
	int err, destroy = 1;
	boolean_t conn_created = B_FALSE;

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Enter cm_id: %p event: %p", cm_id, event);

	/* Check whether the remote protocol version matches ours. */
	version = rdsv3_ib_protocol_compatible(event);
	if (!version) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "version mismatch");
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
	    "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
	    RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
	    (unsigned long long)ntohll(lguid),
	    (unsigned long long)ntohll(fguid));

	conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
	    &rdsv3_ib_transport, KM_NOSLEEP);
	if (IS_ERR(conn)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
		conn = NULL;
		goto out;
	}

	/*
	 * The connection request may occur while the
	 * previous connection exists, e.g. in case of failover.
	 * But as connections may be initiated simultaneously
	 * by both hosts, we have a random backoff mechanism -
	 * see the comment above rdsv3_queue_reconnect()
	 */
	mutex_enter(&conn->c_cm_lock);
	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "incoming connect when connected: %p",
			    conn);
			rdsv3_conn_drop(conn);
			rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
			mutex_exit(&conn->c_cm_lock);
			goto out;
		} else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
			/* Wait and see - our connect may still be succeeding */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "peer-to-peer connection request: %p, "
			    "lguid: 0x%llx fguid: 0x%llx",
			    conn, lguid, fguid);
			rdsv3_ib_stats_inc(s_ib_connect_raced);
			mutex_exit(&conn->c_cm_lock);
			goto out;
		}
	}

	ic = conn->c_transport_data;

	rdsv3_ib_set_protocol(conn, version);
	rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	ASSERT(!cm_id->context);
	ASSERT(!ic->i_cm_id);

	if (ic->i_cm_id != NULL)
		RDSV3_PANIC();

	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	/*
	 * We got halfway through setting up the ib_connection, if we
	 * fail now, we have to take the long route out of this mess.
	 */
	destroy = 0;

	err = rdsv3_ib_setup_qp(conn);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_ib_setup_qp failed (%d)", err);
		mutex_exit(&conn->c_cm_lock);
		rdsv3_conn_drop(conn);
		goto out;
	}

	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
	    event->param.conn.responder_resources,
	    event->param.conn.initiator_depth);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	err = rdma_accept(cm_id, &conn_param);
	mutex_exit(&conn->c_cm_lock);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdma_accept failed (%d)", err);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Return cm_id: %p event: %p", cm_id, event);

	return (0);

out:
	(void) rdma_reject(cm_id, NULL, 0);
	return (destroy);
}

int
rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
	struct rdsv3_connection *conn = cm_id->context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdma_conn_param conn_param;
	struct rdsv3_ib_connect_private dp;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
	    cm_id);

	/*
	 * If the peer doesn't do protocol negotiation, we must
	 * default to RDSv3.0
	 */
	rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
	ic->i_flowctl =
	    rdsv3_ib_sysctl_flow_control;	/* advertise flow control */

	ret = rdsv3_ib_setup_qp(conn);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdsv3_ib_setup_qp failed (%d)", ret);
		rdsv3_conn_drop(conn);
		goto out;
	}

	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
	    RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX);

	ret = rdma_connect(cm_id, &conn_param);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdma_connect failed (%d)", ret);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
	    "Return: cm_id: %p", cm_id);

out:
	/*
	 * Beware - returning non-zero tells the rdma_cm to destroy
	 * the cm_id. We should certainly not do it as long as we still
	 * have an outstanding connect request, as the cm_id will surely
	 * be destroyed when the disconnect event arrives.
	 */
	if (ic->i_cm_id == cm_id)
		ret = 0;

	return (ret);
}

int
rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct sockaddr_in src, dest;
	ipaddr_t laddr, faddr;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);

	/*
	 * XXX I wonder what effect the port space has
	 */
	/* delegate cm event handler to rdma_transport */
	ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
	    RDMA_PS_TCP);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		ic->i_cm_id = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "rdma_create_id() failed: %d", ret);
		goto out;
	}

	RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
	    "created cm id %p for conn %p", ic->i_cm_id, conn);

	/* The ipaddr should be in the network order */
	laddr = conn->c_laddr;
	faddr = conn->c_faddr;
	ret = rdsv3_sc_path_lookup(&laddr, &faddr);
	if (ret == 0) {
		RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
		    ntohl(laddr), ntohl(faddr));
	}

	src.sin_family = AF_INET;
	src.sin_addr.s_addr = (uint32_t)laddr;
	src.sin_port = (uint16_t)htons(0);

	dest.sin_family = AF_INET;
	dest.sin_addr.s_addr = (uint32_t)faddr;
	dest.sin_port = (uint16_t)htons(RDSV3_PORT);

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
	    (struct sockaddr *)&dest,
	    RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
		rdma_destroy_id(ic->i_cm_id);
		ic->i_cm_id = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);

out:
	return (ret);
}

/*
 * This is so careful about only cleaning up resources that were built up
 * so that it can be called at any point during startup. In fact it
 * can be called multiple times for a given connection.
 */
void
rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int err = 0;

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
	    "cm %p pd %p cq %p qp %p", ic->i_cm_id,
	    ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	if (ic->i_cm_id) {
		struct ib_device *dev = ic->i_cm_id->device;

		RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
		    "disconnecting cm %p", ic->i_cm_id);
		err = rdma_disconnect(ic->i_cm_id);
		if (err) {
			/*
			 * Actually this may happen quite frequently, when
			 * an outgoing connect raced with an incoming connect.
			 */
			RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
			    "failed to disconnect, cm: %p err %d",
			    ic->i_cm_id, err);
		}

		if (ic->i_cm_id->qp) {
			(void) ibt_flush_qp(
			    ib_get_ibt_channel_hdl(ic->i_cm_id));
			/*
			 * Don't wait for the send ring to be empty -- there
			 * may be completed non-signaled entries sitting on
			 * there. We unmap these below.
			 */
			rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait,
			    rdsv3_ib_ring_empty(&ic->i_recv_ring));
			/*
			 * Note that Linux original code calls
			 * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic).
			 */
			rdma_destroy_qp(ic->i_cm_id);
		}

		if (rdsv3_enable_snd_cq) {
			if (ic->i_snd_soft_cq) {
				rdsv3_af_thr_destroy(ic->i_snd_soft_cq);
				ic->i_snd_soft_cq = NULL;
			}
			(void) ib_destroy_cq(ic->i_snd_cq);
		}
		if (ic->i_soft_cq) {
			rdsv3_af_thr_destroy(ic->i_soft_cq);
			ic->i_soft_cq = NULL;
		}
		if (ic->i_refill_rq) {
			rdsv3_af_thr_destroy(ic->i_refill_rq);
			ic->i_refill_rq = NULL;
		}
		(void) ib_destroy_cq(ic->i_cq);

		rdsv3_ib_free_hdrs(dev, ic);

		if (ic->i_sends)
			rdsv3_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rdsv3_ib_recv_clear_ring(ic);

		rdma_destroy_id(ic->i_cm_id);

		/*
		 * Move connection back to the nodev list.
		 */
		if (ic->i_on_dev_list)
			rdsv3_ib_remove_conn(ic->rds_ibdev, conn);

		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_mr = NULL;
		ic->i_cq = NULL;
		ic->i_snd_cq = NULL;
		ic->i_send_hdrs = NULL;
		ic->i_recv_hdrs = NULL;
	}

	ASSERT(!ic->i_on_dev_list);

	/* Clear pending transmit */
	if (ic->i_rm) {
		rdsv3_message_put(ic->i_rm);
		ic->i_rm = NULL;
	}

	/* Clear the ACK state */
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);

	/* Clear flow control state */
	ic->i_flowctl = 0;

	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	if (ic->i_ibinc) {
		rdsv3_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	if (ic->i_sends) {
		kmem_free(ic->i_sends,
		    ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
		ic->i_sends = NULL;
	}
	if (ic->i_send_wrs) {
		kmem_free(ic->i_send_wrs, ic->i_send_ring.w_nr *
		    (sizeof (ibt_send_wr_t) +
		    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
		ic->i_send_wrs = NULL;
	}
	if (ic->i_recvs) {
		kmem_free(ic->i_recvs,
		    ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
		ic->i_recvs = NULL;
	}
	if (ic->i_recv_wrs) {
		kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr *
		    (sizeof (ibt_recv_wr_t)));
		ic->i_recv_wrs = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
}

int
rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
{
	struct rdsv3_ib_connection *ic;

	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);

	ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
	if (ic == NULL)
		return (-ENOMEM);

	list_link_init(&ic->ib_node);

	mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
	 * must be initialized before it can be called.
	 */
	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	ic->conn = conn;
	conn->c_transport_data = ic;

	mutex_enter(&ib_nodev_conns_lock);
	list_insert_tail(&ib_nodev_conns, ic);
	mutex_exit(&ib_nodev_conns_lock);

	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
	    conn, conn->c_transport_data);

	return (0);
}

/*
 * Free a connection. Connection must be shut down and not set for reconnect.
 */
void
rdsv3_ib_conn_free(void *arg)
{
	struct rdsv3_ib_connection *ic = arg;
	kmutex_t *lock_ptr;

	RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);

	/*
	 * Conn is either on a dev's list or on the nodev list.
	 * A race with shutdown() or connect() would cause problems
	 * (since rds_ibdev would change) but that should never happen.
	 */
	lock_ptr = ic->i_on_dev_list ?
	    &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;

	mutex_enter(lock_ptr);
	list_remove_node(&ic->ib_node);
	mutex_exit(lock_ptr);

	kmem_free(ic, sizeof (*ic));
}

/*
 * An error occurred on the connection
 */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
	rdsv3_conn_drop(conn);
}