/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */
#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

static void smc_tcp_listen_work(struct work_struct *work);
static void smc_connect_work(struct work_struct *work);
static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};
int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);
void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
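
/* Two proto instances are needed because the v4 and v6 variants keep their
 * sockets in separate hash tables (smc_v4_hashinfo/smc_v6_hashinfo); which
 * one is used is decided by the protocol argument at socket creation time
 * (see smc_create() below).
 */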
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);

	/* cleanup for a dangling non-blocking connect */
	flush_work(&smc->connect_work);
	kfree(smc->connect_info);
	smc->connect_info = NULL;

	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
			sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}
static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_WORK(&smc->connect_work, smc_connect_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
/* register a new rmb, optionally send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	if (!rmb_desc->wr_reg) {
		/* register memory region for new rmb */
		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
			rmb_desc->regerr = 1;
			return -EFAULT;
		}
		rmb_desc->wr_reg = 1;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_ERR_RDYLNK;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   link->gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_AL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
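
/* Note: the CONFIRM LINK / ADD LINK exchange above is the client half of the
 * LLC link bring-up; smc_serv_conf_first_link() below performs the mirrored
 * server half (send request first, then wait for the response) over the same
 * RoCE fabric.
 */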
static void smcr_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
static void smcd_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);

	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
	smc->conn.peer_token = clc->token;
	/* msg header takes up space in the buffer */
	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}
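
/* The two tx_off calculations above differ on purpose: for SMC-R the peer
 * RMB index is used 1-based, hence bufsize * (peer_rmbe_idx - 1), while for
 * SMC-D the index is used as-is and the usable buffer additionally shrinks
 * by sizeof(struct smcd_cdc_msg) because the CDC message header lives inside
 * the DMB.
 */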
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	if (smc->conn.lgr->is_smcd)
		smcd_conn_save_peer_info(smc, clc);
	else
		smcr_conn_save_peer_info(smc, clc);
}
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
	smc->use_fallback = true;
	smc->fallback_rsn = reason_code;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}
/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) { /* error, fallback is not possible */
		if (smc->sk.sk_state == SMC_INIT)
			sock_put(&smc->sk); /* passive closing */
		return reason_code;
	}
	if (reason_code != SMC_CLC_DECL_PEERDECL) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0) {
			if (smc->sk.sk_state == SMC_INIT)
				sock_put(&smc->sk); /* passive closing */
			return rc;
		}
	}
	return smc_connect_fallback(smc, reason_code);
}
/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
	return reason_code;
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport, unsigned short vlan_id, u8 gid[])
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
				    gid);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
{
	/* Find ISM device with same PNETID as connecting interface */
	smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
	if (!(*ismdev))
		return SMC_CLC_DECL_CNFERR; /* configuration error */
	return 0;
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
				      struct smcd_dev *ismdev,
				      unsigned short vlan_id)
{
	if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
		return SMC_CLC_DECL_CNFERR;
	return 0;
}
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 * used, the VLAN ID will be registered again during the connection setup.
 */
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
					struct smcd_dev *ismdev,
					unsigned short vlan_id)
{
	if (!is_smcd)
		return 0;
	if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
		return SMC_CLC_DECL_CNFERR;
	return 0;
}
/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport,
			   u8 gid[], struct smcd_dev *ismdev)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
}
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
					ibport, &aclc->lcl, NULL, 0);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		return smc_connect_abort(smc, reason_code, 0);
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc, false))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
						 local_contact);
	} else {
		if (!smc->conn.rmb_desc->reused &&
		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}
/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smcd_dev *ismdev)
{
	int local_contact = SMC_FIRST_CONTACT;
	int rc = 0;

	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
					NULL, ismdev, aclc->gid);
	if (local_contact < 0)
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);

	/* Create send and receive buffers */
	if (smc_buf_create(smc, true))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	smc_conn_save_peer_info(smc, aclc);
	smc_close_init(smc);
	smc_rx_init(smc);
	smc_tx_init(smc);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		return smc_connect_abort(smc, rc, local_contact);
	mutex_unlock(&smc_create_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	bool ism_supported = false, rdma_supported = false;
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	struct smcd_dev *ismdev;
	u8 gid[SMC_GID_SIZE];
	unsigned short vlan;
	int smc_type;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc, smc->fallback_rsn);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check for VLAN ID */
	if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* check if there is an ism device available */
	if (!smc_check_ism(smc, &ismdev) &&
	    !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
		/* ISM is supported for this connection */
		ism_supported = true;
		smc_type = SMC_TYPE_D;
	}

	/* check if there is a rdma device available */
	if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
		/* RDMA is supported for this connection */
		rdma_supported = true;
		if (ism_supported)
			smc_type = SMC_TYPE_B; /* both */
		else
			smc_type = SMC_TYPE_R; /* only RDMA */
	}

	/* if neither ISM nor RDMA are supported, fallback */
	if (!rdma_supported && !ism_supported)
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	/* depending on previous steps, connect using rdma or ism */
	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
		rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
		rc = smc_connect_ism(smc, &aclc, ismdev);
	else
		rc = SMC_CLC_DECL_MODEUNSUPP;
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
	return 0;
}
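
/* Summary of the client-side handshake driven by __smc_connect():
 *	1. check fallback conditions (no SMC capability, IPsec, no device)
 *	2. CLC: send Proposal, wait for Accept (smc_connect_clc)
 *	3. set up the chosen transport: smc_connect_rdma() or smc_connect_ism()
 *	4. CLC: send Confirm; on SMC-R first contact also run the LLC
 *	   CONFIRM LINK exchange (smc_clnt_conf_first_link)
 * Every failure path declines to the peer where possible and falls back to
 * TCP via the clcsock.
 */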
static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	int rc;

	lock_sock(&smc->sk);
	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
			    smc->connect_info->alen, smc->connect_info->flags);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
		goto out;
	}
	if (rc < 0) {
		smc->sk.sk_err = -rc;
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	if (smc->sk.sk_err)
		smc->sk.sk_state_change(&smc->sk);
	else
		smc->sk.sk_write_space(&smc->sk);
	kfree(smc->connect_info);
	smc->connect_info = NULL;
	release_sock(&smc->sk);
}
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	if (flags & O_NONBLOCK) {
		if (smc->connect_info) {
			rc = -EALREADY;
			goto out;
		}
		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
		if (!smc->connect_info) {
			rc = -ENOMEM;
			goto out;
		}
		smc->connect_info->alen = alen;
		smc->connect_info->flags = flags ^ O_NONBLOCK;
		memcpy(&smc->connect_info->addr, addr, alen);
		schedule_work(&smc->connect_work);
		rc = -EINPROGRESS;
	} else {
		rc = kernel_connect(smc->clcsock, addr, alen, flags);
		if (rc)
			goto out;

		rc = __smc_connect(smc);
		if (rc < 0)
			goto out;
		else
			rc = 0; /* success cases including fallback */
	}

out:
	release_sock(sk);
out_err:
	return rc;
}
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink () */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}
/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   link->gid, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_AL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}
/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}
/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	new_smc->fallback_rsn = reason_code;
	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}
/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}
/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
					 &pclc->lcl, NULL, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM;/* insufficient memory*/
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc, false))
		return SMC_CLC_DECL_MEM;

	return 0;
}
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
			       struct smc_clc_msg_proposal *pclc,
			       struct smcd_dev *ismdev,
			       int *local_contact)
{
	struct smc_clc_msg_smcd *pclc_smcd;

	pclc_smcd = smc_get_clc_msg_smcd(pclc);
	*local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
					 ismdev, pclc_smcd->gid);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM;/* insufficient memory*/
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* Check if peer can be reached via ISM device */
	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
			    new_smc->conn.lgr->vlan_id,
			    new_smc->conn.lgr->smcd)) {
		if (*local_contact == SMC_FIRST_CONTACT)
			smc_lgr_forget(new_smc->conn.lgr);
		smc_conn_free(&new_smc->conn);
		return SMC_CLC_DECL_CNFERR;
	}

	/* Create send and receive buffers */
	if (smc_buf_create(new_smc, true)) {
		if (*local_contact == SMC_FIRST_CONTACT)
			smc_lgr_forget(new_smc->conn.lgr);
		smc_conn_free(&new_smc->conn);
		return SMC_CLC_DECL_MEM;
	}

	return 0;
}
/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
				return SMC_CLC_DECL_ERR_REGRMB;
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}
/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
				  struct smc_clc_msg_accept_confirm *cclc,
				  int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_ERR_RTOK;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return 0;

decline:
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_decline(new_smc, reason_code, local_contact);
	return reason_code;
}
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	bool ism_supported = false;
	struct smcd_dev *ismdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	unsigned short vlan;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_create_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if ISM is available */
	if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
	    !smc_check_ism(new_smc, &ismdev) &&
	    !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
		ism_supported = true;
	}

	/* check if RDMA is available */
	if (!ism_supported &&
	    ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
	     smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
	     smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
	     smc_listen_rdma_check(new_smc, pclc) ||
	     smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				  &local_contact) ||
	     smc_listen_rdma_reg(new_smc, local_contact))) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
				   local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code) {
		mutex_unlock(&smc_create_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	if (!ism_supported) {
		if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
			return;
	}
	smc_conn_save_peer_info(new_smc, &cclc);
	mutex_unlock(&smc_create_lgr_pending);
	smc_listen_out_connected(new_smc);
}
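
/* Summary of the server-side handshake driven by smc_listen_work():
 *	1. wait for the CLC Proposal on the freshly accepted clcsock
 *	2. probe ISM first, then RDMA, matching what the proposal offers
 *	3. send CLC Accept, wait for CLC Confirm
 *	4. for SMC-R finish link setup (smc_listen_rdma_finish), then hand
 *	   the socket to the accept queue via smc_listen_out_connected()
 */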
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		new_smc->fallback_rsn = lsmc->fallback_rsn;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
					 MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}
static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}
static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}
static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}
static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err)
			mask |= EPOLLERR;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sock, wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
			if (smc->conn.urg_state == SMC_URG_VALID)
				mask |= EPOLLPRI;
		}
	}

	return mask;
}
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range:
	 * SHUT_RD(0)+1 == RCV_SHUTDOWN(1), SHUT_WR(1)+1 == SEND_SHUTDOWN(2),
	 * SHUT_RDWR(2)+1 == SHUTDOWN_MASK(3)
	 */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc->use_fallback = true;
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}
static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}
static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	union smc_host_cursor cons, urg;
	struct smc_connection *conn;
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	conn = &smc->conn;
	lock_sock(&smc->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock) {
			release_sock(&smc->sk);
			return -EBADF;
		}
		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
		release_sock(&smc->sk);
		return answ;
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not send + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc->conn.sndbuf_desc->len -
				atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not send only) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc_tx_prepared_sends(&smc->conn);
		break;
	case SIOCATMARK:
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED) {
			answ = 0;
		} else {
			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
			smc_curs_copy(&urg, &conn->urg_curs, conn);
			answ = smc_curs_diff(conn->rmb_desc->len,
					     &cons, &urg) == 1;
		}
		break;
	default:
		release_sock(&smc->sk);
		return -ENOIOCTLCMD;
	}
	release_sock(&smc->sk);

	return put_user(answ, (int __user *)arg);
}
static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}
/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);

	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}
/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	smc->fallback_rsn = 0;
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}
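
/* Illustrative user-space usage (sketch, not part of this kernel file): an
 * SMC socket is created like a TCP socket, with the protocol constant
 * choosing the IPv4 or IPv6 variant that smc_create() maps to the matching
 * internal clcsock:
 *
 *	int fd  = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);  // IPv4
 *	int fd6 = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6); // IPv6
 *
 * bind()/listen()/connect()/accept() then behave as for TCP; if the peer or
 * the configuration cannot do SMC, the socket transparently falls back to
 * the internal TCP clcsock.
 */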
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}
static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);