// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  CLC (connection layer control) handshake over initial TCP socket to
 *  prepare for RDMA traffic
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */
14 #include <linux/inetdevice.h>
15 #include <linux/if_ether.h>
16 #include <linux/sched/signal.h>
18 #include <net/addrconf.h>
/* eye catcher "SMCR" EBCDIC for CLC messages; placed at the start and in the
 * trailer of every CLC message built in this file
 */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
30 /* check if received message has a correct header length and contains valid
31 * heading and trailing eyecatchers
33 static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr
*clcm
)
35 struct smc_clc_msg_proposal_prefix
*pclc_prfx
;
36 struct smc_clc_msg_accept_confirm
*clc
;
37 struct smc_clc_msg_proposal
*pclc
;
38 struct smc_clc_msg_decline
*dclc
;
39 struct smc_clc_msg_trail
*trl
;
41 if (memcmp(clcm
->eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
)))
44 case SMC_CLC_PROPOSAL
:
45 pclc
= (struct smc_clc_msg_proposal
*)clcm
;
46 pclc_prfx
= smc_clc_proposal_get_prefix(pclc
);
47 if (ntohs(pclc
->hdr
.length
) !=
48 sizeof(*pclc
) + ntohs(pclc
->iparea_offset
) +
50 pclc_prfx
->ipv6_prefixes_cnt
*
51 sizeof(struct smc_clc_ipv6_prefix
) +
54 trl
= (struct smc_clc_msg_trail
*)
55 ((u8
*)pclc
+ ntohs(pclc
->hdr
.length
) - sizeof(*trl
));
59 clc
= (struct smc_clc_msg_accept_confirm
*)clcm
;
60 if (ntohs(clc
->hdr
.length
) != sizeof(*clc
))
65 dclc
= (struct smc_clc_msg_decline
*)clcm
;
66 if (ntohs(dclc
->hdr
.length
) != sizeof(*dclc
))
73 if (memcmp(trl
->eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
)))
78 /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
79 static int smc_clc_prfx_set4_rcu(struct dst_entry
*dst
, __be32 ipv4
,
80 struct smc_clc_msg_proposal_prefix
*prop
)
82 struct in_device
*in_dev
= __in_dev_get_rcu(dst
->dev
);
87 if (!inet_ifa_match(ipv4
, ifa
))
89 prop
->prefix_len
= inet_mask_len(ifa
->ifa_mask
);
90 prop
->outgoing_subnet
= ifa
->ifa_address
& ifa
->ifa_mask
;
91 /* prop->ipv6_prefixes_cnt = 0; already done by memset before */
97 /* fill CLC proposal msg with ipv6 prefixes from device */
98 static int smc_clc_prfx_set6_rcu(struct dst_entry
*dst
,
99 struct smc_clc_msg_proposal_prefix
*prop
,
100 struct smc_clc_ipv6_prefix
*ipv6_prfx
)
102 #if IS_ENABLED(CONFIG_IPV6)
103 struct inet6_dev
*in6_dev
= __in6_dev_get(dst
->dev
);
104 struct inet6_ifaddr
*ifa
;
109 /* use a maximum of 8 IPv6 prefixes from device */
110 list_for_each_entry(ifa
, &in6_dev
->addr_list
, if_list
) {
111 if (ipv6_addr_type(&ifa
->addr
) & IPV6_ADDR_LINKLOCAL
)
113 ipv6_addr_prefix(&ipv6_prfx
[cnt
].prefix
,
114 &ifa
->addr
, ifa
->prefix_len
);
115 ipv6_prfx
[cnt
].prefix_len
= ifa
->prefix_len
;
117 if (cnt
== SMC_CLC_MAX_V6_PREFIX
)
120 prop
->ipv6_prefixes_cnt
= cnt
;
127 /* retrieve and set prefixes in CLC proposal msg */
128 static int smc_clc_prfx_set(struct socket
*clcsock
,
129 struct smc_clc_msg_proposal_prefix
*prop
,
130 struct smc_clc_ipv6_prefix
*ipv6_prfx
)
132 struct dst_entry
*dst
= sk_dst_get(clcsock
->sk
);
133 struct sockaddr_storage addrs
;
134 struct sockaddr_in6
*addr6
;
135 struct sockaddr_in
*addr
;
138 memset(prop
, 0, sizeof(*prop
));
147 /* get address to which the internal TCP socket is bound */
148 kernel_getsockname(clcsock
, (struct sockaddr
*)&addrs
);
149 /* analyze IP specific data of net_device belonging to TCP socket */
150 addr6
= (struct sockaddr_in6
*)&addrs
;
152 if (addrs
.ss_family
== PF_INET
) {
154 addr
= (struct sockaddr_in
*)&addrs
;
155 rc
= smc_clc_prfx_set4_rcu(dst
, addr
->sin_addr
.s_addr
, prop
);
156 } else if (ipv6_addr_v4mapped(&addr6
->sin6_addr
)) {
157 /* mapped IPv4 address - peer is IPv4 only */
158 rc
= smc_clc_prfx_set4_rcu(dst
, addr6
->sin6_addr
.s6_addr32
[3],
162 rc
= smc_clc_prfx_set6_rcu(dst
, prop
, ipv6_prfx
);
171 /* match ipv4 addrs of dev against addr in CLC proposal */
172 static int smc_clc_prfx_match4_rcu(struct net_device
*dev
,
173 struct smc_clc_msg_proposal_prefix
*prop
)
175 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
180 if (prop
->prefix_len
== inet_mask_len(ifa
->ifa_mask
) &&
181 inet_ifa_match(prop
->outgoing_subnet
, ifa
))
183 } endfor_ifa(in_dev
);
188 /* match ipv6 addrs of dev against addrs in CLC proposal */
189 static int smc_clc_prfx_match6_rcu(struct net_device
*dev
,
190 struct smc_clc_msg_proposal_prefix
*prop
)
192 #if IS_ENABLED(CONFIG_IPV6)
193 struct inet6_dev
*in6_dev
= __in6_dev_get(dev
);
194 struct smc_clc_ipv6_prefix
*ipv6_prfx
;
195 struct inet6_ifaddr
*ifa
;
200 /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
201 ipv6_prfx
= (struct smc_clc_ipv6_prefix
*)((u8
*)prop
+ sizeof(*prop
));
202 max
= min_t(u8
, prop
->ipv6_prefixes_cnt
, SMC_CLC_MAX_V6_PREFIX
);
203 list_for_each_entry(ifa
, &in6_dev
->addr_list
, if_list
) {
204 if (ipv6_addr_type(&ifa
->addr
) & IPV6_ADDR_LINKLOCAL
)
206 for (i
= 0; i
< max
; i
++) {
207 if (ifa
->prefix_len
== ipv6_prfx
[i
].prefix_len
&&
208 ipv6_prefix_equal(&ifa
->addr
, &ipv6_prfx
[i
].prefix
,
217 /* check if proposed prefixes match one of our device prefixes */
218 int smc_clc_prfx_match(struct socket
*clcsock
,
219 struct smc_clc_msg_proposal_prefix
*prop
)
221 struct dst_entry
*dst
= sk_dst_get(clcsock
->sk
);
233 if (!prop
->ipv6_prefixes_cnt
)
234 rc
= smc_clc_prfx_match4_rcu(dst
->dev
, prop
);
236 rc
= smc_clc_prfx_match6_rcu(dst
->dev
, prop
);
244 /* Wait for data on the tcp-socket, analyze received data
246 * 0 if success and it was not a decline that we received.
247 * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
248 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
250 int smc_clc_wait_msg(struct smc_sock
*smc
, void *buf
, int buflen
,
253 long rcvtimeo
= smc
->clcsock
->sk
->sk_rcvtimeo
;
254 struct sock
*clc_sk
= smc
->clcsock
->sk
;
255 struct smc_clc_msg_hdr
*clcm
= buf
;
256 struct msghdr msg
= {NULL
, 0};
258 struct kvec vec
= {buf
, buflen
};
262 /* peek the first few bytes to determine length of data to receive
263 * so we don't consume any subsequent CLC message or payload data
264 * in the TCP byte stream
267 * Caller must make sure that buflen is no less than
268 * sizeof(struct smc_clc_msg_hdr)
270 krflags
= MSG_PEEK
| MSG_WAITALL
;
271 smc
->clcsock
->sk
->sk_rcvtimeo
= CLC_WAIT_TIME
;
272 iov_iter_kvec(&msg
.msg_iter
, READ
| ITER_KVEC
, &vec
, 1,
273 sizeof(struct smc_clc_msg_hdr
));
274 len
= sock_recvmsg(smc
->clcsock
, &msg
, krflags
);
275 if (signal_pending(current
)) {
276 reason_code
= -EINTR
;
277 clc_sk
->sk_err
= EINTR
;
278 smc
->sk
.sk_err
= EINTR
;
281 if (clc_sk
->sk_err
) {
282 reason_code
= -clc_sk
->sk_err
;
283 smc
->sk
.sk_err
= clc_sk
->sk_err
;
286 if (!len
) { /* peer has performed orderly shutdown */
287 smc
->sk
.sk_err
= ECONNRESET
;
288 reason_code
= -ECONNRESET
;
292 smc
->sk
.sk_err
= -len
;
296 datlen
= ntohs(clcm
->length
);
297 if ((len
< sizeof(struct smc_clc_msg_hdr
)) ||
299 ((clcm
->type
!= SMC_CLC_DECLINE
) &&
300 (clcm
->type
!= expected_type
))) {
301 smc
->sk
.sk_err
= EPROTO
;
302 reason_code
= -EPROTO
;
306 /* receive the complete CLC message */
307 memset(&msg
, 0, sizeof(struct msghdr
));
308 iov_iter_kvec(&msg
.msg_iter
, READ
| ITER_KVEC
, &vec
, 1, datlen
);
309 krflags
= MSG_WAITALL
;
310 len
= sock_recvmsg(smc
->clcsock
, &msg
, krflags
);
311 if (len
< datlen
|| !smc_clc_msg_hdr_valid(clcm
)) {
312 smc
->sk
.sk_err
= EPROTO
;
313 reason_code
= -EPROTO
;
316 if (clcm
->type
== SMC_CLC_DECLINE
) {
317 reason_code
= SMC_CLC_DECL_REPLY
;
318 if (((struct smc_clc_msg_decline
*)buf
)->hdr
.flag
) {
319 smc
->conn
.lgr
->sync_err
= 1;
320 smc_lgr_terminate(smc
->conn
.lgr
);
325 smc
->clcsock
->sk
->sk_rcvtimeo
= rcvtimeo
;
329 /* send CLC DECLINE message across internal TCP socket */
330 int smc_clc_send_decline(struct smc_sock
*smc
, u32 peer_diag_info
)
332 struct smc_clc_msg_decline dclc
;
337 memset(&dclc
, 0, sizeof(dclc
));
338 memcpy(dclc
.hdr
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
339 dclc
.hdr
.type
= SMC_CLC_DECLINE
;
340 dclc
.hdr
.length
= htons(sizeof(struct smc_clc_msg_decline
));
341 dclc
.hdr
.version
= SMC_CLC_V1
;
342 dclc
.hdr
.flag
= (peer_diag_info
== SMC_CLC_DECL_SYNCERR
) ? 1 : 0;
343 memcpy(dclc
.id_for_peer
, local_systemid
, sizeof(local_systemid
));
344 dclc
.peer_diagnosis
= htonl(peer_diag_info
);
345 memcpy(dclc
.trl
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
347 memset(&msg
, 0, sizeof(msg
));
348 vec
.iov_base
= &dclc
;
349 vec
.iov_len
= sizeof(struct smc_clc_msg_decline
);
350 len
= kernel_sendmsg(smc
->clcsock
, &msg
, &vec
, 1,
351 sizeof(struct smc_clc_msg_decline
));
352 if (len
< sizeof(struct smc_clc_msg_decline
))
353 smc
->sk
.sk_err
= EPROTO
;
355 smc
->sk
.sk_err
= -len
;
356 return sock_error(&smc
->sk
);
359 /* send CLC PROPOSAL message across internal TCP socket */
360 int smc_clc_send_proposal(struct smc_sock
*smc
,
361 struct smc_ib_device
*smcibdev
,
364 struct smc_clc_ipv6_prefix ipv6_prfx
[SMC_CLC_MAX_V6_PREFIX
];
365 struct smc_clc_msg_proposal_prefix pclc_prfx
;
366 struct smc_clc_msg_proposal pclc
;
367 struct smc_clc_msg_trail trl
;
368 int len
, i
, plen
, rc
;
373 /* retrieve ip prefixes for CLC proposal msg */
374 rc
= smc_clc_prfx_set(smc
->clcsock
, &pclc_prfx
, ipv6_prfx
);
376 return SMC_CLC_DECL_CNFERR
; /* configuration error */
378 /* send SMC Proposal CLC message */
379 plen
= sizeof(pclc
) + sizeof(pclc_prfx
) +
380 (pclc_prfx
.ipv6_prefixes_cnt
* sizeof(ipv6_prfx
[0])) +
382 memset(&pclc
, 0, sizeof(pclc
));
383 memcpy(pclc
.hdr
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
384 pclc
.hdr
.type
= SMC_CLC_PROPOSAL
;
385 pclc
.hdr
.length
= htons(plen
);
386 pclc
.hdr
.version
= SMC_CLC_V1
; /* SMC version */
387 memcpy(pclc
.lcl
.id_for_peer
, local_systemid
, sizeof(local_systemid
));
388 memcpy(&pclc
.lcl
.gid
, &smcibdev
->gid
[ibport
- 1], SMC_GID_SIZE
);
389 memcpy(&pclc
.lcl
.mac
, &smcibdev
->mac
[ibport
- 1], ETH_ALEN
);
390 pclc
.iparea_offset
= htons(0);
392 memcpy(trl
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
393 memset(&msg
, 0, sizeof(msg
));
395 vec
[i
].iov_base
= &pclc
;
396 vec
[i
++].iov_len
= sizeof(pclc
);
397 vec
[i
].iov_base
= &pclc_prfx
;
398 vec
[i
++].iov_len
= sizeof(pclc_prfx
);
399 if (pclc_prfx
.ipv6_prefixes_cnt
> 0) {
400 vec
[i
].iov_base
= &ipv6_prfx
[0];
401 vec
[i
++].iov_len
= pclc_prfx
.ipv6_prefixes_cnt
*
402 sizeof(ipv6_prfx
[0]);
404 vec
[i
].iov_base
= &trl
;
405 vec
[i
++].iov_len
= sizeof(trl
);
406 /* due to the few bytes needed for clc-handshake this cannot block */
407 len
= kernel_sendmsg(smc
->clcsock
, &msg
, vec
, i
, plen
);
408 if (len
< sizeof(pclc
)) {
410 reason_code
= -ENETUNREACH
;
411 smc
->sk
.sk_err
= -reason_code
;
413 smc
->sk
.sk_err
= smc
->clcsock
->sk
->sk_err
;
414 reason_code
= -smc
->sk
.sk_err
;
421 /* send CLC CONFIRM message across internal TCP socket */
422 int smc_clc_send_confirm(struct smc_sock
*smc
)
424 struct smc_connection
*conn
= &smc
->conn
;
425 struct smc_clc_msg_accept_confirm cclc
;
426 struct smc_link
*link
;
432 link
= &conn
->lgr
->lnk
[SMC_SINGLE_LINK
];
433 /* send SMC Confirm CLC msg */
434 memset(&cclc
, 0, sizeof(cclc
));
435 memcpy(cclc
.hdr
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
436 cclc
.hdr
.type
= SMC_CLC_CONFIRM
;
437 cclc
.hdr
.length
= htons(sizeof(cclc
));
438 cclc
.hdr
.version
= SMC_CLC_V1
; /* SMC version */
439 memcpy(cclc
.lcl
.id_for_peer
, local_systemid
, sizeof(local_systemid
));
440 memcpy(&cclc
.lcl
.gid
, &link
->smcibdev
->gid
[link
->ibport
- 1],
442 memcpy(&cclc
.lcl
.mac
, &link
->smcibdev
->mac
[link
->ibport
- 1], ETH_ALEN
);
443 hton24(cclc
.qpn
, link
->roce_qp
->qp_num
);
445 htonl(conn
->rmb_desc
->mr_rx
[SMC_SINGLE_LINK
]->rkey
);
446 cclc
.rmbe_idx
= 1; /* for now: 1 RMB = 1 RMBE */
447 cclc
.rmbe_alert_token
= htonl(conn
->alert_token_local
);
448 cclc
.qp_mtu
= min(link
->path_mtu
, link
->peer_mtu
);
449 cclc
.rmbe_size
= conn
->rmbe_size_short
;
450 cclc
.rmb_dma_addr
= cpu_to_be64(
451 (u64
)sg_dma_address(conn
->rmb_desc
->sgt
[SMC_SINGLE_LINK
].sgl
));
452 hton24(cclc
.psn
, link
->psn_initial
);
454 memcpy(cclc
.trl
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
456 memset(&msg
, 0, sizeof(msg
));
457 vec
.iov_base
= &cclc
;
458 vec
.iov_len
= sizeof(cclc
);
459 len
= kernel_sendmsg(smc
->clcsock
, &msg
, &vec
, 1, sizeof(cclc
));
460 if (len
< sizeof(cclc
)) {
462 reason_code
= -ENETUNREACH
;
463 smc
->sk
.sk_err
= -reason_code
;
465 smc
->sk
.sk_err
= smc
->clcsock
->sk
->sk_err
;
466 reason_code
= -smc
->sk
.sk_err
;
472 /* send CLC ACCEPT message across internal TCP socket */
473 int smc_clc_send_accept(struct smc_sock
*new_smc
, int srv_first_contact
)
475 struct smc_connection
*conn
= &new_smc
->conn
;
476 struct smc_clc_msg_accept_confirm aclc
;
477 struct smc_link
*link
;
483 link
= &conn
->lgr
->lnk
[SMC_SINGLE_LINK
];
484 memset(&aclc
, 0, sizeof(aclc
));
485 memcpy(aclc
.hdr
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
486 aclc
.hdr
.type
= SMC_CLC_ACCEPT
;
487 aclc
.hdr
.length
= htons(sizeof(aclc
));
488 aclc
.hdr
.version
= SMC_CLC_V1
; /* SMC version */
489 if (srv_first_contact
)
491 memcpy(aclc
.lcl
.id_for_peer
, local_systemid
, sizeof(local_systemid
));
492 memcpy(&aclc
.lcl
.gid
, &link
->smcibdev
->gid
[link
->ibport
- 1],
494 memcpy(&aclc
.lcl
.mac
, link
->smcibdev
->mac
[link
->ibport
- 1], ETH_ALEN
);
495 hton24(aclc
.qpn
, link
->roce_qp
->qp_num
);
497 htonl(conn
->rmb_desc
->mr_rx
[SMC_SINGLE_LINK
]->rkey
);
498 aclc
.rmbe_idx
= 1; /* as long as 1 RMB = 1 RMBE */
499 aclc
.rmbe_alert_token
= htonl(conn
->alert_token_local
);
500 aclc
.qp_mtu
= link
->path_mtu
;
501 aclc
.rmbe_size
= conn
->rmbe_size_short
,
502 aclc
.rmb_dma_addr
= cpu_to_be64(
503 (u64
)sg_dma_address(conn
->rmb_desc
->sgt
[SMC_SINGLE_LINK
].sgl
));
504 hton24(aclc
.psn
, link
->psn_initial
);
505 memcpy(aclc
.trl
.eyecatcher
, SMC_EYECATCHER
, sizeof(SMC_EYECATCHER
));
507 memset(&msg
, 0, sizeof(msg
));
508 vec
.iov_base
= &aclc
;
509 vec
.iov_len
= sizeof(aclc
);
510 len
= kernel_sendmsg(new_smc
->clcsock
, &msg
, &vec
, 1, sizeof(aclc
));
511 if (len
< sizeof(aclc
)) {
513 new_smc
->sk
.sk_err
= EPROTO
;
515 new_smc
->sk
.sk_err
= new_smc
->clcsock
->sk
->sk_err
;
516 rc
= sock_error(&new_smc
->sk
);