// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"
MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");
/* transmit from user buffer, if possible */
const bool zcopy_tx = true;
/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;
/* Attach siw also with loopback devices */
const bool loopback_enabled = true;
/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;
/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;
/* Selects MPA P2P mode (additional handshake during connection
 * setup), if true.
 */
const bool peer_to_peer;
struct task_struct *siw_tx_thread[NR_CPUS];
struct crypto_shash *siw_crypto_shash;
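
/*
 * Register the siw device with the RDMA core under the requested
 * name. A static counter provides a module-lifetime unique
 * vendor_part_id per registered device.
 */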
static int siw_device_register(struct siw_device *sdev, const char *name)
{
	struct ib_device *base_dev = &sdev->base_dev;
	static int dev_id = 1;
	int rv;

	sdev->vendor_part_id = dev_id++;

	rv = ib_register_device(base_dev, name, NULL);
	if (rv) {
		pr_warn("siw: device registration error %d\n", rv);
		return rv;
	}

	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);

	return 0;
}
static void siw_device_cleanup(struct ib_device *base_dev)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	xa_destroy(&sdev->qp_xa);
	xa_destroy(&sdev->mem_xa);
}
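
/*
 * Start one TX kthread per online CPU, skipping hyper-threaded
 * siblings where possible. Returns the number of threads started.
 */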
static int siw_create_tx_threads(void)
{
	int cpu, assigned = 0;

	for_each_online_cpu(cpu) {
		/* Skip HT cores - if possible */
		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
			continue;

		siw_tx_thread[cpu] =
			kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
				       "siw_tx/%d", cpu);
		if (IS_ERR(siw_tx_thread[cpu])) {
			siw_tx_thread[cpu] = NULL;
			continue;
		}
		kthread_bind(siw_tx_thread[cpu], cpu);

		wake_up_process(siw_tx_thread[cpu]);
		assigned++;
	}
	return assigned;
}
static int siw_dev_qualified(struct net_device *netdev)
{
	/*
	 * Additional hardware support can be added here
	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
	 * <linux/if_arp.h> for type identifiers.
	 */
	if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
	    (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
		return 1;

	return 0;
}
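
/*
 * siw_use_cnt counts the QPs whose TX work is currently assigned to
 * each CPU. siw_get_tx_cpu()/siw_put_tx_cpu() use it to spread new
 * QPs across the least loaded CPUs of the device's NUMA node.
 */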
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

static struct {
	struct cpumask **tx_valid_cpus;
	int num_nodes;
} siw_cpu_info;
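
/*
 * Build one cpumask per NUMA node, listing the CPUs eligible to run
 * TX work for devices attached to that node.
 */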
static int siw_init_cpulist(void)
{
	int i, num_nodes = num_possible_nodes();

	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));

	siw_cpu_info.num_nodes = num_nodes;

	siw_cpu_info.tx_valid_cpus =
		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
	if (!siw_cpu_info.tx_valid_cpus) {
		siw_cpu_info.num_nodes = 0;
		return -ENOMEM;
	}
	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
		siw_cpu_info.tx_valid_cpus[i] =
			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
		if (!siw_cpu_info.tx_valid_cpus[i])
			goto out_err;

		cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
	}
	for_each_possible_cpu(i)
		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);

	return 0;

out_err:
	siw_cpu_info.num_nodes = 0;
	while (--i >= 0)
		kfree(siw_cpu_info.tx_valid_cpus[i]);
	kfree(siw_cpu_info.tx_valid_cpus);
	siw_cpu_info.tx_valid_cpus = NULL;

	return -ENOMEM;
}
static void siw_destroy_cpulist(void)
{
	int i = 0;

	while (i < siw_cpu_info.num_nodes)
		kfree(siw_cpu_info.tx_valid_cpus[i++]);

	kfree(siw_cpu_info.tx_valid_cpus);
}
/*
 * Choose CPU with least number of active QP's from NUMA node of
 * TX interface.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
	const struct cpumask *tx_cpumask;
	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;

	if (node < 0)
		tx_cpumask = cpu_online_mask;
	else
		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];

	num_cpus = cpumask_weight(tx_cpumask);
	if (!num_cpus) {
		/* no CPU on this NUMA node */
		tx_cpumask = cpu_online_mask;
		num_cpus = cpumask_weight(tx_cpumask);
	}
	if (!num_cpus)
		goto out;

	cpu = cpumask_first(tx_cpumask);

	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
		int usage;

		/* Skip any cores which have no TX thread */
		if (!siw_tx_thread[cpu])
			continue;

		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
		if (usage <= min_use) {
			tx_cpu = cpu;
			min_use = usage;
		}
	}
	siw_dbg(&sdev->base_dev,
		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);

out:
	if (tx_cpu >= 0)
		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
	else
		pr_warn("siw: no tx cpu found\n");

	return tx_cpu;
}
void siw_put_tx_cpu(int cpu)
{
	atomic_dec(&per_cpu(siw_use_cnt, cpu));
}
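
/*
 * Map a QP id delivered by the iWARP CM back to the ib_qp it refers
 * to. The reference taken by siw_qp_id2obj() is dropped again, since
 * the caller only needs the pointer.
 */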
static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
{
	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);

	if (qp) {
		/*
		 * siw_qp_id2obj() increments object reference count
		 */
		siw_qp_put(qp);
		return &qp->base_qp;
	}
	return NULL;
}
static const struct ib_device_ops siw_device_ops = {
	.owner = THIS_MODULE,
	.uverbs_abi_ver = SIW_ABI_VERSION,
	.driver_id = RDMA_DRIVER_SIW,

	.alloc_mr = siw_alloc_mr,
	.alloc_pd = siw_alloc_pd,
	.alloc_ucontext = siw_alloc_ucontext,
	.create_cq = siw_create_cq,
	.create_qp = siw_create_qp,
	.create_srq = siw_create_srq,
	.dealloc_driver = siw_device_cleanup,
	.dealloc_pd = siw_dealloc_pd,
	.dealloc_ucontext = siw_dealloc_ucontext,
	.dereg_mr = siw_dereg_mr,
	.destroy_cq = siw_destroy_cq,
	.destroy_qp = siw_destroy_qp,
	.destroy_srq = siw_destroy_srq,
	.get_dma_mr = siw_get_dma_mr,
	.get_port_immutable = siw_get_port_immutable,
	.iw_accept = siw_accept,
	.iw_add_ref = siw_qp_get_ref,
	.iw_connect = siw_connect,
	.iw_create_listen = siw_create_listen,
	.iw_destroy_listen = siw_destroy_listen,
	.iw_get_qp = siw_get_base_qp,
	.iw_reject = siw_reject,
	.iw_rem_ref = siw_qp_put_ref,
	.map_mr_sg = siw_map_mr_sg,
	.mmap = siw_mmap,
	.mmap_free = siw_mmap_free,
	.modify_qp = siw_verbs_modify_qp,
	.modify_srq = siw_modify_srq,
	.poll_cq = siw_poll_cq,
	.post_recv = siw_post_receive,
	.post_send = siw_post_send,
	.post_srq_recv = siw_post_srq_recv,
	.query_device = siw_query_device,
	.query_gid = siw_query_gid,
	.query_port = siw_query_port,
	.query_qp = siw_query_qp,
	.query_srq = siw_query_srq,
	.req_notify_cq = siw_req_notify_cq,
	.reg_user_mr = siw_reg_user_mr,

	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};
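
/*
 * Allocate and initialize an ib_device/siw_device pair for the given
 * net_device. The device is not yet registered with the RDMA core;
 * that happens later in siw_device_register().
 */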
static struct siw_device *siw_device_create(struct net_device *netdev)
{
	struct siw_device *sdev = NULL;
	struct ib_device *base_dev;
	int rv;

	sdev = ib_alloc_device(siw_device, base_dev);
	if (!sdev)
		return NULL;

	base_dev = &sdev->base_dev;

	sdev->netdev = netdev;

	if (netdev->type != ARPHRD_LOOPBACK) {
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    netdev->dev_addr);
	} else {
		/*
		 * The loopback device does not have a HW address,
		 * but connection management lib expects gid != 0
		 */
		size_t len = min_t(size_t, strlen(base_dev->name), 6);
		char addr[6] = { };

		memcpy(addr, base_dev->name, len);
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    addr);
	}

	base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);

	base_dev->node_type = RDMA_NODE_RNIC;
	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
	       sizeof(SIW_NODE_DESC_COMMON));

	/*
	 * Current model (one-to-one device association):
	 * One Softiwarp device per net_device or, equivalently,
	 * per physical port.
	 */
	base_dev->phys_port_cnt = 1;
	base_dev->num_comp_vectors = num_possible_cpus();

	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

	ib_set_device_ops(base_dev, &siw_device_ops);
	rv = ib_device_set_netdev(base_dev, netdev, 1);
	if (rv)
		goto error;

	memcpy(base_dev->iw_ifname, netdev->name,
	       sizeof(base_dev->iw_ifname));

	/* Disable TCP port mapping */
	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

	sdev->attrs.max_qp = SIW_MAX_QP;
	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
	sdev->attrs.max_sge = SIW_MAX_SGE;
	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
	sdev->attrs.max_cq = SIW_MAX_CQ;
	sdev->attrs.max_cqe = SIW_MAX_CQE;
	sdev->attrs.max_mr = SIW_MAX_MR;
	sdev->attrs.max_pd = SIW_MAX_PD;
	sdev->attrs.max_mw = SIW_MAX_MW;
	sdev->attrs.max_srq = SIW_MAX_SRQ;
	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
	sdev->attrs.max_srq_sge = SIW_MAX_SGE;

	INIT_LIST_HEAD(&sdev->cep_list);
	INIT_LIST_HEAD(&sdev->qp_list);

	atomic_set(&sdev->num_ctx, 0);
	atomic_set(&sdev->num_srq, 0);
	atomic_set(&sdev->num_qp, 0);
	atomic_set(&sdev->num_cq, 0);
	atomic_set(&sdev->num_mr, 0);
	atomic_set(&sdev->num_pd, 0);

	sdev->numa_node = dev_to_node(&netdev->dev);
	spin_lock_init(&sdev->lock);

	return sdev;
error:
	ib_dealloc_device(base_dev);

	return NULL;
}
/*
 * Network link becomes unavailable. Mark all
 * affected QP's accordingly.
 */
static void siw_netdev_down(struct work_struct *work)
{
	struct siw_device *sdev =
		container_of(work, struct siw_device, netdev_down);

	struct siw_qp_attrs qp_attrs;
	struct list_head *pos, *tmp;

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = SIW_QP_STATE_ERROR;

	list_for_each_safe(pos, tmp, &sdev->qp_list) {
		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);

		down_write(&qp->state_lock);
		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
		up_write(&qp->state_lock);
	}
	ib_device_put(&sdev->base_dev);
}
static void siw_device_goes_down(struct siw_device *sdev)
{
	if (ib_device_try_get(&sdev->base_dev)) {
		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
		schedule_work(&sdev->netdev_down);
	}
}
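
/*
 * Translate netdev notifier events into IB port events for the
 * attached siw device, if any.
 */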
static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *arg)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
	struct ib_device *base_dev;
	struct siw_device *sdev;

	dev_dbg(&netdev->dev, "siw: event %lu\n", event);

	if (dev_net(netdev) != &init_net)
		return NOTIFY_OK;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (!base_dev)
		return NOTIFY_OK;

	sdev = to_siw_dev(base_dev);

	switch (event) {
	case NETDEV_UP:
		sdev->state = IB_PORT_ACTIVE;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
		break;

	case NETDEV_GOING_DOWN:
		siw_device_goes_down(sdev);
		break;

	case NETDEV_DOWN:
		sdev->state = IB_PORT_DOWN;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
		break;

	case NETDEV_REGISTER:
		/*
		 * Device registration now handled only by
		 * rdma netlink commands. So it shall be impossible
		 * to end up here with a valid siw device.
		 */
		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
		break;

	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&sdev->base_dev);
		break;

	case NETDEV_CHANGEADDR:
		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
		break;
	/*
	 * Todo: Below netdev events are currently not handled.
	 */
	case NETDEV_CHANGEMTU:
		break;

	default:
		break;
	}
	ib_device_put(&sdev->base_dev);

	return NOTIFY_OK;
}
static struct notifier_block siw_netdev_nb = {
	.notifier_call = siw_netdev_event,
};
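
/*
 * siw_newlink() backs the RDMA netlink "newlink" operation, e.g. with
 * iproute2:
 *
 *   rdma link add siw0 type siw netdev eth0
 *
 * creates a siw device on top of the given net_device ("siw0" and
 * "eth0" are example names).
 */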
static int siw_newlink(const char *basedev_name, struct net_device *netdev)
{
	struct ib_device *base_dev;
	struct siw_device *sdev = NULL;
	int rv = -ENOMEM;

	if (!siw_dev_qualified(netdev))
		return -EINVAL;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (base_dev) {
		ib_device_put(base_dev);
		return -EEXIST;
	}
	sdev = siw_device_create(netdev);
	if (sdev) {
		dev_dbg(&netdev->dev, "siw: new device\n");

		if (netif_running(netdev) && netif_carrier_ok(netdev))
			sdev->state = IB_PORT_ACTIVE;
		else
			sdev->state = IB_PORT_DOWN;

		rv = siw_device_register(sdev, basedev_name);
		if (rv)
			ib_dealloc_device(&sdev->base_dev);
	}
	return rv;
}
static struct rdma_link_ops siw_link_ops = {
	.type = "siw",
	.newlink = siw_newlink,
};
/*
 * siw_init_module - Initialize Softiwarp module and register with netdev
 *                   subsystem.
 */
static __init int siw_init_module(void)
{
	int rv;
	int nr_cpu;

	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
		pr_info("siw: sendpage threshold too small: %u\n",
			(int)SENDPAGE_THRESH);
		rv = -EINVAL;
		goto out_error;
	}
	rv = siw_init_cpulist();
	if (rv)
		goto out_error;

	if (!siw_create_tx_threads()) {
		pr_info("siw: Could not start any TX thread\n");
		rv = -ENOMEM;
		goto out_error;
	}
	/*
	 * Locate CRC32 algorithm. If unsuccessful, fail
	 * loading siw only, if CRC is required.
	 */
	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(siw_crypto_shash)) {
		pr_info("siw: Loading CRC32c failed: %ld\n",
			PTR_ERR(siw_crypto_shash));
		siw_crypto_shash = NULL;
		if (mpa_crc_required) {
			rv = -EOPNOTSUPP;
			goto out_error;
		}
	}
	rv = register_netdevice_notifier(&siw_netdev_nb);
	if (rv)
		goto out_error;

	rdma_link_register(&siw_link_ops);

	pr_info("SoftiWARP attached\n");
	return 0;

out_error:
	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
		if (siw_tx_thread[nr_cpu]) {
			siw_stop_tx_thread(nr_cpu);
			siw_tx_thread[nr_cpu] = NULL;
		}
	}
	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftIWARP attach failed. Error: %d\n", rv);

	siw_destroy_cpulist();

	return rv;
}
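
/*
 * siw_exit_module - Stop all TX threads, detach from the netdev and
 *                   rdma_link infrastructure and unregister all siw
 *                   devices before releasing remaining resources.
 */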
static void __exit siw_exit_module(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (siw_tx_thread[cpu]) {
			siw_stop_tx_thread(cpu);
			siw_tx_thread[cpu] = NULL;
		}
	}
	unregister_netdevice_notifier(&siw_netdev_nb);
	rdma_link_unregister(&siw_link_ops);
	ib_unregister_driver(RDMA_DRIVER_SIW);

	siw_destroy_cpulist();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP detached\n");
}
module_init(siw_init_module);
module_exit(siw_exit_module);

MODULE_ALIAS_RDMA_LINK("siw");