1 /* SPDX-License-Identifier: GPL-2.0 */
3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
5 * Definitions for SMC Connections, Link Groups and Links
7 * Copyright IBM Corp. 2016
9 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
15 #include <linux/atomic.h>
16 #include <linux/smc.h>
17 #include <linux/pci.h>
18 #include <rdma/ib_verbs.h>
19 #include <net/genetlink.h>
24 #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
26 struct smc_lgr_list
{ /* list of link group definition */
27 struct list_head list
;
28 spinlock_t lock
; /* protects list of link groups */
29 u32 num
; /* unique link group number */
32 enum smc_lgr_role
{ /* possible roles of a link group */
33 SMC_CLNT
, /* client */
37 enum smc_link_state
{ /* possible states of a link */
38 SMC_LNK_UNUSED
, /* link is unused */
39 SMC_LNK_INACTIVE
, /* link is inactive */
40 SMC_LNK_ACTIVATING
, /* link is being activated */
41 SMC_LNK_ACTIVE
, /* link is active */
44 #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
47 u8 raw
[SMC_WR_BUF_SIZE
];
50 #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
/* progress of a memory region registration (ib_wr_reg_mr) request */
52 enum smc_wr_reg_state
{
53 POSTED
, /* ib_wr_reg_mr request posted */
54 CONFIRMED
, /* ib_wr_reg_mr response: successful */
55 FAILED
/* ib_wr_reg_mr response: failure */
58 struct smc_rdma_sge
{ /* sges for RDMA writes */
59 struct ib_sge wr_tx_rdma_sge
[SMC_IB_MAX_SEND_SGE
];
62 #define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per
66 struct smc_rdma_sges
{ /* sges per message send */
67 struct smc_rdma_sge tx_rdma_sge
[SMC_MAX_RDMA_WRITES
];
70 struct smc_rdma_wr
{ /* work requests per message
73 struct ib_rdma_wr wr_tx_rdma
[SMC_MAX_RDMA_WRITES
];
76 #define SMC_LGR_ID_SIZE 4
79 struct smc_ib_device
*smcibdev
; /* ib-device */
80 u8 ibport
; /* port - values 1 | 2 */
81 struct ib_pd
*roce_pd
; /* IB protection domain,
82 * unique for every RoCE QP
84 struct ib_qp
*roce_qp
; /* IB queue pair */
85 struct ib_qp_attr qp_attr
; /* IB queue pair attributes */
87 struct smc_wr_buf
*wr_tx_bufs
; /* WR send payload buffers */
88 struct ib_send_wr
*wr_tx_ibs
; /* WR send meta data */
89 struct ib_sge
*wr_tx_sges
; /* WR send gather meta data */
90 struct smc_rdma_sges
*wr_tx_rdma_sges
;/*RDMA WRITE gather meta data*/
91 struct smc_rdma_wr
*wr_tx_rdmas
; /* WR RDMA WRITE */
92 struct smc_wr_tx_pend
*wr_tx_pends
; /* WR send waiting for CQE */
93 struct completion
*wr_tx_compl
; /* WR send CQE completion */
94 /* above four vectors have wr_tx_cnt elements and use the same index */
95 dma_addr_t wr_tx_dma_addr
; /* DMA address of wr_tx_bufs */
96 atomic_long_t wr_tx_id
; /* seq # of last sent WR */
97 unsigned long *wr_tx_mask
; /* bit mask of used indexes */
98 u32 wr_tx_cnt
; /* number of WR send buffers */
99 wait_queue_head_t wr_tx_wait
; /* wait for free WR send buf */
101 struct smc_wr_buf
*wr_rx_bufs
; /* WR recv payload buffers */
102 struct ib_recv_wr
*wr_rx_ibs
; /* WR recv meta data */
103 struct ib_sge
*wr_rx_sges
; /* WR recv scatter meta data */
104 /* above three vectors have wr_rx_cnt elements and use the same index */
105 dma_addr_t wr_rx_dma_addr
; /* DMA address of wr_rx_bufs */
106 u64 wr_rx_id
; /* seq # of last recv WR */
107 u32 wr_rx_cnt
; /* number of WR recv buffers */
108 unsigned long wr_rx_tstamp
; /* jiffies when last buf rx */
110 struct ib_reg_wr wr_reg
; /* WR register memory region */
111 wait_queue_head_t wr_reg_wait
; /* wait for wr_reg result */
112 enum smc_wr_reg_state wr_reg_state
; /* state of wr_reg request */
114 u8 gid
[SMC_GID_SIZE
];/* gid matching used vlan id*/
115 u8 sgid_index
; /* gid index for vlan id */
116 u32 peer_qpn
; /* QP number of peer */
117 enum ib_mtu path_mtu
; /* used mtu */
118 enum ib_mtu peer_mtu
; /* mtu size of peer */
119 u32 psn_initial
; /* QP tx initial packet seqno */
120 u32 peer_psn
; /* QP rx initial packet seqno */
121 u8 peer_mac
[ETH_ALEN
]; /* = gid[8:10||13:15] */
122 u8 peer_gid
[SMC_GID_SIZE
]; /* gid of peer*/
123 u8 link_id
; /* unique # within link group */
124 u8 link_uid
[SMC_LGR_ID_SIZE
]; /* unique lnk id */
125 u8 peer_link_uid
[SMC_LGR_ID_SIZE
]; /* peer uid */
126 u8 link_idx
; /* index in lgr link array */
127 u8 link_is_asym
; /* is link asymmetric? */
128 struct smc_link_group
*lgr
; /* parent link group */
129 struct work_struct link_down_wrk
; /* wrk to bring link down */
130 char ibname
[IB_DEVICE_NAME_MAX
]; /* ib device name */
131 int ndev_ifidx
; /* network device ifindex */
133 enum smc_link_state state
; /* state of link */
134 struct delayed_work llc_testlink_wrk
; /* testlink worker */
135 struct completion llc_testlink_resp
; /* wait for rx of testlink */
136 int llc_testlink_time
; /* testlink interval */
137 atomic_t conn_cnt
; /* connections on this link */
140 /* For now we just allow one parallel link per link group. The SMC protocol
141 * allows more (up to 8).
143 #define SMC_LINKS_PER_LGR_MAX 3
144 #define SMC_SINGLE_LINK 0
146 /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
147 struct smc_buf_desc
{
148 struct list_head list
;
149 void *cpu_addr
; /* virtual address of buffer */
151 int len
; /* length of buffer */
152 u32 used
; /* currently used / unused */
155 struct sg_table sgt
[SMC_LINKS_PER_LGR_MAX
];
157 struct ib_mr
*mr_rx
[SMC_LINKS_PER_LGR_MAX
];
158 /* for rmb only: memory region
159 * incl. rkey provided to peer
161 u32 order
; /* allocation order */
164 /* confirm_rkey done */
165 u8 is_reg_mr
[SMC_LINKS_PER_LGR_MAX
];
166 /* mem region registered */
167 u8 is_map_ib
[SMC_LINKS_PER_LGR_MAX
];
168 /* mem region mapped to lnk */
170 /* buffer registration err */
173 unsigned short sba_idx
;
174 /* SBA index number */
176 /* DMB token number */
183 struct smc_rtoken
{ /* address/key of remote RMB */
188 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
189 #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
190 /* theoretically, the RFC states that largest size would be 512K,
191 * i.e. compressed 5 and thus 6 sizes (0..5), despite
192 * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
197 enum smc_lgr_type
{ /* redundancy state of lgr */
198 SMC_LGR_NONE
, /* no active links, lgr to be deleted */
199 SMC_LGR_SINGLE
, /* 1 active RNIC on each peer */
200 SMC_LGR_SYMMETRIC
, /* 2 active RNICs on each peer */
201 SMC_LGR_ASYMMETRIC_PEER
, /* local has 2, peer 1 active RNICs */
202 SMC_LGR_ASYMMETRIC_LOCAL
, /* local has 1, peer 2 active RNICs */
/* kind of LLC flow, as stored in struct smc_llc_flow.type */
205 enum smc_llc_flowtype
{
206 SMC_LLC_FLOW_NONE
= 0,
207 SMC_LLC_FLOW_ADD_LINK
= 2,
208 SMC_LLC_FLOW_DEL_LINK
= 4,
209 SMC_LLC_FLOW_RKEY
= 6,
212 struct smc_llc_qentry
;
214 struct smc_llc_flow
{
215 enum smc_llc_flowtype type
;
216 struct smc_llc_qentry
*qentry
;
219 struct smc_link_group
{
220 struct list_head list
;
221 struct rb_root conns_all
; /* connection tree */
222 rwlock_t conns_lock
; /* protects conns_all */
223 unsigned int conns_num
; /* current # of connections */
224 unsigned short vlan_id
; /* vlan id of link group */
226 struct list_head sndbufs
[SMC_RMBE_SIZES
];/* tx buffers */
227 struct mutex sndbufs_lock
; /* protects tx buffers */
228 struct list_head rmbs
[SMC_RMBE_SIZES
]; /* rx buffers */
229 struct mutex rmbs_lock
; /* protects rx buffers */
231 u8 id
[SMC_LGR_ID_SIZE
]; /* unique lgr id */
232 struct delayed_work free_work
; /* delayed freeing of an lgr */
233 struct work_struct terminate_work
; /* abnormal lgr termination */
234 struct workqueue_struct
*tx_wq
; /* wq for conn. tx workers */
235 u8 sync_err
: 1; /* lgr no longer fits to peer */
236 u8 terminating
: 1;/* lgr is terminating */
237 u8 freeing
: 1; /* lgr is being freed */
239 bool is_smcd
; /* SMC-R or SMC-D */
241 u8 negotiated_eid
[SMC_MAX_EID_LEN
];
242 u8 peer_os
; /* peer operating system */
244 u8 peer_hostname
[SMC_MAX_HOSTNAME_LEN
];
247 enum smc_lgr_role role
;
248 /* client or server */
249 struct smc_link lnk
[SMC_LINKS_PER_LGR_MAX
];
251 char peer_systemid
[SMC_SYSTEMID_LEN
];
252 /* unique system_id of peer */
253 struct smc_rtoken rtokens
[SMC_RMBS_PER_LGR_MAX
]
254 [SMC_LINKS_PER_LGR_MAX
];
255 /* remote addr/key pairs */
256 DECLARE_BITMAP(rtokens_used_mask
, SMC_RMBS_PER_LGR_MAX
);
257 /* used rtoken elements */
259 enum smc_lgr_type type
;
260 /* redundancy state */
261 u8 pnet_id
[SMC_MAX_PNETID_LEN
+ 1];
262 /* pnet id of this lgr */
263 struct list_head llc_event_q
;
264 /* queue for llc events */
265 spinlock_t llc_event_q_lock
;
266 /* protects llc_event_q */
267 struct mutex llc_conf_mutex
;
268 /* protects lgr reconfig. */
269 struct work_struct llc_add_link_work
;
270 struct work_struct llc_del_link_work
;
271 struct work_struct llc_event_work
;
272 /* llc event worker */
273 wait_queue_head_t llc_flow_waiter
;
274 /* w4 next llc event */
275 wait_queue_head_t llc_msg_waiter
;
276 /* w4 next llc msg */
277 struct smc_llc_flow llc_flow_lcl
;
278 /* llc local control field */
279 struct smc_llc_flow llc_flow_rmt
;
280 /* llc remote control field */
281 struct smc_llc_qentry
*delayed_event
;
282 /* arrived when flow active */
283 spinlock_t llc_flow_lock
;
284 /* protects llc flow */
285 int llc_testlink_time
;
286 /* link keep alive time */
287 u32 llc_termination_rsn
;
288 /* rsn code for termination */
292 /* Peer GID (remote) */
293 struct smcd_dev
*smcd
;
294 /* ISM device for VLAN reg. */
295 u8 peer_shutdown
: 1;
296 /* peer triggered shutdown */
301 struct smc_clc_msg_local
;
303 struct smc_init_info
{
307 u8 first_contact_peer
;
308 u8 first_contact_local
;
309 unsigned short vlan_id
;
312 struct smc_clc_msg_local
*ib_lcl
;
313 struct smc_ib_device
*ib_dev
;
314 u8 ib_gid
[SMC_GID_SIZE
];
318 u64 ism_peer_gid
[SMC_MAX_ISM_DEVS
+ 1];
319 struct smcd_dev
*ism_dev
[SMC_MAX_ISM_DEVS
+ 1];
320 u16 ism_chid
[SMC_MAX_ISM_DEVS
+ 1];
321 u8 ism_offered_cnt
; /* # of ISM devices offered */
322 u8 ism_selected
; /* index of selected ISM dev*/
326 /* Find the connection associated with the given alert token in the link group.
327 * To use rbtrees we have to implement our own search core.
328 * Requires @conns_lock
329 * @token alert token to search for
330 * @lgr link group to search in
331 * Returns connection associated with token if found, NULL otherwise.
333 static inline struct smc_connection
*smc_lgr_find_conn(
334 u32 token
, struct smc_link_group
*lgr
)
336 struct smc_connection
*res
= NULL
;
337 struct rb_node
*node
;
339 node
= lgr
->conns_all
.rb_node
;
341 struct smc_connection
*cur
= rb_entry(node
,
342 struct smc_connection
, alert_node
);
344 if (cur
->alert_token_local
> token
) {
345 node
= node
->rb_left
;
347 if (cur
->alert_token_local
< token
) {
348 node
= node
->rb_right
;
359 /* returns true if the specified link is usable */
360 static inline bool smc_link_usable(struct smc_link
*lnk
)
362 if (lnk
->state
== SMC_LNK_UNUSED
|| lnk
->state
== SMC_LNK_INACTIVE
)
/* returns true if the specified link is in state SMC_LNK_ACTIVE */
367 static inline bool smc_link_active(struct smc_link
*lnk
)
369 return lnk
->state
== SMC_LNK_ACTIVE
;
/* Format a raw GID as eight colon-separated groups of four hex digits.
 * @buf      output string; sprintf() is unbounded here, so the caller must
 *           provide at least 40 bytes (39 characters plus the NUL terminator)
 * @gid_raw  raw GID; read as eight big-endian 16-bit words (16 bytes)
 */
372 static inline void smc_gid_be16_convert(__u8
*buf
, u8
*gid_raw
)
374 sprintf(buf
, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
375 be16_to_cpu(((__be16
*)gid_raw
)[0]),
376 be16_to_cpu(((__be16
*)gid_raw
)[1]),
377 be16_to_cpu(((__be16
*)gid_raw
)[2]),
378 be16_to_cpu(((__be16
*)gid_raw
)[3]),
379 be16_to_cpu(((__be16
*)gid_raw
)[4]),
380 be16_to_cpu(((__be16
*)gid_raw
)[5]),
381 be16_to_cpu(((__be16
*)gid_raw
)[6]),
382 be16_to_cpu(((__be16
*)gid_raw
)[7]));
390 __u8 pci_id
[SMC_PCI_ID_STR_LEN
];
393 static inline void smc_set_pci_values(struct pci_dev
*pci_dev
,
394 struct smc_pci_dev
*smc_dev
)
396 smc_dev
->pci_vendor
= pci_dev
->vendor
;
397 smc_dev
->pci_device
= pci_dev
->device
;
398 snprintf(smc_dev
->pci_id
, sizeof(smc_dev
->pci_id
), "%s",
400 #if IS_ENABLED(CONFIG_S390)
401 { /* Set s390 specific PCI information */
402 struct zpci_dev
*zdev
;
404 zdev
= to_zpci(pci_dev
);
405 smc_dev
->pci_fid
= zdev
->fid
;
406 smc_dev
->pci_pchid
= zdev
->pchid
;
412 struct smc_clc_msg_accept_confirm
;
413 struct smc_clc_msg_local
;
415 void smc_lgr_cleanup_early(struct smc_connection
*conn
);
416 void smc_lgr_terminate_sched(struct smc_link_group
*lgr
);
417 void smcr_port_add(struct smc_ib_device
*smcibdev
, u8 ibport
);
418 void smcr_port_err(struct smc_ib_device
*smcibdev
, u8 ibport
);
419 void smc_smcd_terminate(struct smcd_dev
*dev
, u64 peer_gid
,
420 unsigned short vlan
);
421 void smc_smcd_terminate_all(struct smcd_dev
*dev
);
422 void smc_smcr_terminate_all(struct smc_ib_device
*smcibdev
);
423 int smc_buf_create(struct smc_sock
*smc
, bool is_smcd
);
424 int smc_uncompress_bufsize(u8 compressed
);
425 int smc_rmb_rtoken_handling(struct smc_connection
*conn
, struct smc_link
*link
,
426 struct smc_clc_msg_accept_confirm
*clc
);
427 int smc_rtoken_add(struct smc_link
*lnk
, __be64 nw_vaddr
, __be32 nw_rkey
);
428 int smc_rtoken_delete(struct smc_link
*lnk
, __be32 nw_rkey
);
429 void smc_rtoken_set(struct smc_link_group
*lgr
, int link_idx
, int link_idx_new
,
430 __be32 nw_rkey_known
, __be64 nw_vaddr
, __be32 nw_rkey
);
431 void smc_rtoken_set2(struct smc_link_group
*lgr
, int rtok_idx
, int link_id
,
432 __be64 nw_vaddr
, __be32 nw_rkey
);
433 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection
*conn
);
434 void smc_sndbuf_sync_sg_for_device(struct smc_connection
*conn
);
435 void smc_rmb_sync_sg_for_cpu(struct smc_connection
*conn
);
436 void smc_rmb_sync_sg_for_device(struct smc_connection
*conn
);
437 int smc_vlan_by_tcpsk(struct socket
*clcsock
, struct smc_init_info
*ini
);
439 void smc_conn_free(struct smc_connection
*conn
);
440 int smc_conn_create(struct smc_sock
*smc
, struct smc_init_info
*ini
);
441 void smc_lgr_schedule_free_work_fast(struct smc_link_group
*lgr
);
442 int smc_core_init(void);
443 void smc_core_exit(void);
445 int smcr_link_init(struct smc_link_group
*lgr
, struct smc_link
*lnk
,
446 u8 link_idx
, struct smc_init_info
*ini
);
447 void smcr_link_clear(struct smc_link
*lnk
, bool log
);
448 int smcr_buf_map_lgr(struct smc_link
*lnk
);
449 int smcr_buf_reg_lgr(struct smc_link
*lnk
);
450 void smcr_lgr_set_type(struct smc_link_group
*lgr
, enum smc_lgr_type new_type
);
451 void smcr_lgr_set_type_asym(struct smc_link_group
*lgr
,
452 enum smc_lgr_type new_type
, int asym_lnk_idx
);
453 int smcr_link_reg_rmb(struct smc_link
*link
, struct smc_buf_desc
*rmb_desc
);
454 struct smc_link
*smc_switch_conns(struct smc_link_group
*lgr
,
455 struct smc_link
*from_lnk
, bool is_dev_err
);
456 void smcr_link_down_cond(struct smc_link
*lnk
);
457 void smcr_link_down_cond_sched(struct smc_link
*lnk
);
458 int smc_nl_get_sys_info(struct sk_buff
*skb
, struct netlink_callback
*cb
);
459 int smcr_nl_get_lgr(struct sk_buff
*skb
, struct netlink_callback
*cb
);
460 int smcr_nl_get_link(struct sk_buff
*skb
, struct netlink_callback
*cb
);
461 int smcd_nl_get_lgr(struct sk_buff
*skb
, struct netlink_callback
*cb
);
463 static inline struct smc_link_group
*smc_get_lgr(struct smc_link
*link
)