// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"

MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");
/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;
/* Attach siw also with loopback devices */
const bool loopback_enabled = true;

/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;

/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;

/* Selects MPA P2P mode (additional handshake during connection
 * setup, if true).
 */
const bool peer_to_peer;
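
/*
 * One TX kthread slot per CPU; threads are started per physical core
 * by siw_create_tx_threads(). siw_crypto_shash caches the CRC32c
 * transform used for MPA CRC generation and verification.
 */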
struct task_struct *siw_tx_thread[NR_CPUS];
struct crypto_shash *siw_crypto_shash;

static int siw_device_register(struct siw_device *sdev, const char *name)
{
	struct ib_device *base_dev = &sdev->base_dev;
	static int dev_id = 1;
	int rv;

	rv = ib_register_device(base_dev, name);
	if (rv) {
		pr_warn("siw: device registration error %d\n", rv);
		return rv;
	}
	sdev->vendor_part_id = dev_id++;

	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);

	return 0;
}

static void siw_device_cleanup(struct ib_device *base_dev)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	xa_destroy(&sdev->qp_xa);
	xa_destroy(&sdev->mem_xa);
}
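
/*
 * Start one TX kthread per physical core: the modulo test below skips
 * all but the first CPU of each hyperthread sibling group. Returns the
 * number of threads started; failed slots stay NULL and are later
 * skipped by siw_get_tx_cpu().
 */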
static int siw_create_tx_threads(void)
{
	int cpu, assigned = 0;

	for_each_online_cpu(cpu) {
		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
			continue;

		siw_tx_thread[cpu] =
			kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
				       "siw_tx/%d", cpu);
		if (IS_ERR(siw_tx_thread[cpu])) {
			siw_tx_thread[cpu] = NULL;
			continue;
		}
		kthread_bind(siw_tx_thread[cpu], cpu);

		wake_up_process(siw_tx_thread[cpu]);
		assigned++;
	}
	return assigned;
}

static int siw_dev_qualified(struct net_device *netdev)
{
	/*
	 * Additional hardware support can be added here
	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
	 * <linux/if_arp.h> for type identifiers.
	 */
	if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
	    (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
		return 1;

	return 0;
}
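
/*
 * siw_use_cnt counts QPs currently assigned to each CPU's TX thread;
 * tx_valid_cpus holds one cpumask per NUMA node, populated by
 * siw_init_cpulist() and consulted by siw_get_tx_cpu().
 */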
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

static struct {
	struct cpumask **tx_valid_cpus;
	int num_nodes;
} siw_cpu_info;

static int siw_init_cpulist(void)
{
	int i, num_nodes = num_possible_nodes();

	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));

	siw_cpu_info.num_nodes = num_nodes;

	siw_cpu_info.tx_valid_cpus =
		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
	if (!siw_cpu_info.tx_valid_cpus) {
		siw_cpu_info.num_nodes = 0;
		return -ENOMEM;
	}
	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
		siw_cpu_info.tx_valid_cpus[i] =
			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
		if (!siw_cpu_info.tx_valid_cpus[i])
			goto out_err;

		cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
	}
	for_each_possible_cpu(i)
		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);

	return 0;

out_err:
	siw_cpu_info.num_nodes = 0;
	while (--i >= 0)
		kfree(siw_cpu_info.tx_valid_cpus[i]);
	kfree(siw_cpu_info.tx_valid_cpus);
	siw_cpu_info.tx_valid_cpus = NULL;

	return -ENOMEM;
}

static void siw_destroy_cpulist(void)
{
	int i = 0;

	while (i < siw_cpu_info.num_nodes)
		kfree(siw_cpu_info.tx_valid_cpus[i++]);

	kfree(siw_cpu_info.tx_valid_cpus);
}

/*
 * Choose CPU with least number of active QP's from NUMA node of
 * TX interface.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
	const struct cpumask *tx_cpumask;
	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;

	if (node < 0)
		tx_cpumask = cpu_online_mask;
	else
		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];

	num_cpus = cpumask_weight(tx_cpumask);
	if (!num_cpus) {
		/* no CPU on this NUMA node */
		tx_cpumask = cpu_online_mask;
		num_cpus = cpumask_weight(tx_cpumask);
	}
	if (!num_cpus)
		goto out;

	cpu = cpumask_first(tx_cpumask);

	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
		int usage;

		/* Skip any cores which have no TX thread */
		if (!siw_tx_thread[cpu])
			continue;

		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
		if (usage <= min_use) {
			tx_cpu = cpu;
			min_use = usage;
		}
	}
	siw_dbg(&sdev->base_dev,
		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);

out:
	if (tx_cpu >= 0)
		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
	else
		pr_warn("siw: no tx cpu found\n");

	return tx_cpu;
}
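
/* Drop a QP's reference on its TX CPU; pairs with siw_get_tx_cpu() */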
void siw_put_tx_cpu(int cpu)
{
	atomic_dec(&per_cpu(siw_use_cnt, cpu));
}

static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
{
	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);

	if (qp) {
		/*
		 * siw_qp_id2obj() increments object reference count
		 */
		siw_qp_put(qp);
		return qp->ib_qp;
	}
	return NULL;
}
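
/*
 * RDMA core callbacks. The verbs entry points (siw_query_device() etc.)
 * live in siw_verbs.c; the iWARP connection manager entry points
 * (siw_connect() etc.) live in siw_cm.c.
 */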
static const struct ib_device_ops siw_device_ops = {
	.owner = THIS_MODULE,
	.uverbs_abi_ver = SIW_ABI_VERSION,
	.driver_id = RDMA_DRIVER_SIW,

	.alloc_mr = siw_alloc_mr,
	.alloc_pd = siw_alloc_pd,
	.alloc_ucontext = siw_alloc_ucontext,
	.create_cq = siw_create_cq,
	.create_qp = siw_create_qp,
	.create_srq = siw_create_srq,
	.dealloc_driver = siw_device_cleanup,
	.dealloc_pd = siw_dealloc_pd,
	.dealloc_ucontext = siw_dealloc_ucontext,
	.dereg_mr = siw_dereg_mr,
	.destroy_cq = siw_destroy_cq,
	.destroy_qp = siw_destroy_qp,
	.destroy_srq = siw_destroy_srq,
	.get_dma_mr = siw_get_dma_mr,
	.get_port_immutable = siw_get_port_immutable,
	.iw_accept = siw_accept,
	.iw_add_ref = siw_qp_get_ref,
	.iw_connect = siw_connect,
	.iw_create_listen = siw_create_listen,
	.iw_destroy_listen = siw_destroy_listen,
	.iw_get_qp = siw_get_base_qp,
	.iw_reject = siw_reject,
	.iw_rem_ref = siw_qp_put_ref,
	.map_mr_sg = siw_map_mr_sg,
	.mmap = siw_mmap,
	.mmap_free = siw_mmap_free,
	.modify_qp = siw_verbs_modify_qp,
	.modify_srq = siw_modify_srq,
	.poll_cq = siw_poll_cq,
	.post_recv = siw_post_receive,
	.post_send = siw_post_send,
	.post_srq_recv = siw_post_srq_recv,
	.query_device = siw_query_device,
	.query_gid = siw_query_gid,
	.query_pkey = siw_query_pkey,
	.query_port = siw_query_port,
	.query_qp = siw_query_qp,
	.query_srq = siw_query_srq,
	.req_notify_cq = siw_req_notify_cq,
	.reg_user_mr = siw_reg_user_mr,

	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};
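
/*
 * Allocate and initialize a siw device for the given net_device.
 * Registration with the RDMA core happens separately, via
 * siw_device_register(). Returns NULL on failure.
 */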
static struct siw_device *siw_device_create(struct net_device *netdev)
{
	struct siw_device *sdev = NULL;
	struct ib_device *base_dev;
	struct device *parent = netdev->dev.parent;
	int rv;

	if (!parent) {
		/*
		 * The loopback device has no parent device,
		 * so it appears as a top-level device. To support
		 * loopback device connectivity, take this device
		 * as the parent device. Skip all other devices
		 * w/o parent device.
		 */
		if (netdev->type != ARPHRD_LOOPBACK) {
			pr_warn("siw: device %s error: no parent device\n",
				netdev->name);
			return NULL;
		}
		parent = &netdev->dev;
	}
	sdev = ib_alloc_device(siw_device, base_dev);
	if (!sdev)
		return NULL;

	base_dev = &sdev->base_dev;

	sdev->netdev = netdev;

	if (netdev->type != ARPHRD_LOOPBACK) {
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    netdev->dev_addr);
	} else {
		/*
		 * The loopback device does not have a HW address,
		 * but connection management lib expects gid != 0
		 */
		size_t len = min_t(size_t, strlen(base_dev->name), 6);
		char addr[6] = { };

		memcpy(addr, base_dev->name, len);
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    addr);
	}
	base_dev->uverbs_cmd_mask =
		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
		(1ull << IB_USER_VERBS_CMD_REG_MR) |
		(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
		(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
		(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
		(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
		(1ull << IB_USER_VERBS_CMD_POST_SEND) |
		(1ull << IB_USER_VERBS_CMD_POST_RECV) |
		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) |
		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);

	base_dev->node_type = RDMA_NODE_RNIC;
	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
	       sizeof(SIW_NODE_DESC_COMMON));

	/*
	 * Current model (one-to-one device association):
	 * One Softiwarp device per net_device or, equivalently,
	 * per physical port.
	 */
	base_dev->phys_port_cnt = 1;
	base_dev->dev.parent = parent;
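	/*
	 * siw performs no real DMA: dma_virt_ops maps DMA addresses to
	 * kernel virtual addresses, and dma_parms advertises a 2G
	 * maximum DMA segment size.
	 */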
	base_dev->dev.dma_ops = &dma_virt_ops;
	base_dev->dev.dma_parms = &sdev->dma_parms;
	sdev->dma_parms = (struct device_dma_parameters)
		{ .max_segment_size = SZ_2G };
	base_dev->num_comp_vectors = num_possible_cpus();

	ib_set_device_ops(base_dev, &siw_device_ops);
	rv = ib_device_set_netdev(base_dev, netdev, 1);
	if (rv)
		goto error;

	memcpy(base_dev->iw_ifname, netdev->name,
	       sizeof(base_dev->iw_ifname));

	/* Disable TCP port mapping */
	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

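	/* Static device limits (SIW_MAX_* constants defined in siw.h) */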
	sdev->attrs.max_qp = SIW_MAX_QP;
	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
	sdev->attrs.max_sge = SIW_MAX_SGE;
	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
	sdev->attrs.max_cq = SIW_MAX_CQ;
	sdev->attrs.max_cqe = SIW_MAX_CQE;
	sdev->attrs.max_mr = SIW_MAX_MR;
	sdev->attrs.max_pd = SIW_MAX_PD;
	sdev->attrs.max_mw = SIW_MAX_MW;
	sdev->attrs.max_fmr = SIW_MAX_FMR;
	sdev->attrs.max_srq = SIW_MAX_SRQ;
	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
	sdev->attrs.max_srq_sge = SIW_MAX_SGE;

	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

	INIT_LIST_HEAD(&sdev->cep_list);
	INIT_LIST_HEAD(&sdev->qp_list);

	atomic_set(&sdev->num_ctx, 0);
	atomic_set(&sdev->num_srq, 0);
	atomic_set(&sdev->num_qp, 0);
	atomic_set(&sdev->num_cq, 0);
	atomic_set(&sdev->num_mr, 0);
	atomic_set(&sdev->num_pd, 0);

	sdev->numa_node = dev_to_node(parent);
	spin_lock_init(&sdev->lock);

	return sdev;
error:
	ib_dealloc_device(base_dev);

	return NULL;
}

/*
 * Network link becomes unavailable. Mark all
 * affected QP's accordingly.
 */
static void siw_netdev_down(struct work_struct *work)
{
	struct siw_device *sdev =
		container_of(work, struct siw_device, netdev_down);

	struct siw_qp_attrs qp_attrs;
	struct list_head *pos, *tmp;

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = SIW_QP_STATE_ERROR;

	list_for_each_safe(pos, tmp, &sdev->qp_list) {
		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);

		down_write(&qp->state_lock);
		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
		up_write(&qp->state_lock);
	}
	ib_device_put(&sdev->base_dev);
}

static void siw_device_goes_down(struct siw_device *sdev)
{
	if (ib_device_try_get(&sdev->base_dev)) {
		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
		schedule_work(&sdev->netdev_down);
	}
}
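
/*
 * Translate netdev notifier events into RDMA port events on the
 * attached siw device. Only netdevs in the initial network namespace
 * are handled.
 */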
static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *arg)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
	struct ib_device *base_dev;
	struct siw_device *sdev;

	dev_dbg(&netdev->dev, "siw: event %lu\n", event);

	if (dev_net(netdev) != &init_net)
		return NOTIFY_OK;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (!base_dev)
		return NOTIFY_OK;

	sdev = to_siw_dev(base_dev);

	switch (event) {
	case NETDEV_UP:
		sdev->state = IB_PORT_ACTIVE;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
		break;

	case NETDEV_GOING_DOWN:
		siw_device_goes_down(sdev);
		break;

	case NETDEV_DOWN:
		sdev->state = IB_PORT_DOWN;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
		break;

	case NETDEV_REGISTER:
		/*
		 * Device registration now handled only by
		 * rdma netlink commands. So it shall be impossible
		 * to end up here with a valid siw device.
		 */
		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
		break;

	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&sdev->base_dev);
		break;

	case NETDEV_CHANGEADDR:
		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
		break;
	/*
	 * Todo: Below netdev events are currently not handled.
	 */
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
		break;

	default:
		break;
	}
	ib_device_put(&sdev->base_dev);

	return NOTIFY_OK;
}

static struct notifier_block siw_netdev_nb = {
	.notifier_call = siw_netdev_event,
};
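
/*
 * Attach a new siw device to a qualified net_device. This is invoked
 * through the rdma netlink link interface; from user space typically
 * (assuming the iproute2 rdma tool) something like:
 *
 *   rdma link add <name> type siw netdev <interface>
 */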
static int siw_newlink(const char *basedev_name, struct net_device *netdev)
{
	struct ib_device *base_dev;
	struct siw_device *sdev = NULL;
	int rv = -ENOMEM;

	if (!siw_dev_qualified(netdev))
		return -EINVAL;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (base_dev) {
		ib_device_put(base_dev);
		return -EEXIST;
	}
	sdev = siw_device_create(netdev);
	if (sdev) {
		dev_dbg(&netdev->dev, "siw: new device\n");

		if (netif_running(netdev) && netif_carrier_ok(netdev))
			sdev->state = IB_PORT_ACTIVE;
		else
			sdev->state = IB_PORT_DOWN;

		rv = siw_device_register(sdev, basedev_name);
		if (rv)
			ib_dealloc_device(&sdev->base_dev);
	}
	return rv;
}

static struct rdma_link_ops siw_link_ops = {
	.type = "siw",
	.newlink = siw_newlink,
};

/*
 * siw_init_module - Initialize Softiwarp module and register with netdev
 *                   subsystem.
 */
static __init int siw_init_module(void)
{
	int rv;
	int nr_cpu;

	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
		pr_info("siw: sendpage threshold too small: %u\n",
			(int)SENDPAGE_THRESH);
		rv = -EINVAL;
		goto out_error;
	}
	rv = siw_init_cpulist();
	if (rv)
		goto out_error;

	rv = siw_cm_init();
	if (rv)
		goto out_error;

	if (!siw_create_tx_threads()) {
		pr_info("siw: Could not start any TX thread\n");
		rv = -ENOMEM;
		goto out_error;
	}
	/*
	 * Locate CRC32 algorithm. If unsuccessful, fail
	 * loading siw only, if CRC is required.
	 */
	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(siw_crypto_shash)) {
		pr_info("siw: Loading CRC32c failed: %ld\n",
			PTR_ERR(siw_crypto_shash));
		siw_crypto_shash = NULL;
		if (mpa_crc_required) {
			rv = -EOPNOTSUPP;
			goto out_error;
		}
	}
	rv = register_netdevice_notifier(&siw_netdev_nb);
	if (rv)
		goto out_error;

	rdma_link_register(&siw_link_ops);

	pr_info("SoftiWARP attached\n");
	return 0;

out_error:
	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
		if (siw_tx_thread[nr_cpu]) {
			siw_stop_tx_thread(nr_cpu);
			siw_tx_thread[nr_cpu] = NULL;
		}
	}
	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP attach failed. Error: %d\n", rv);

	siw_cm_exit();
	siw_destroy_cpulist();

	return rv;
}

static void __exit siw_exit_module(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (siw_tx_thread[cpu]) {
			siw_stop_tx_thread(cpu);
			siw_tx_thread[cpu] = NULL;
		}
	}
	unregister_netdevice_notifier(&siw_netdev_nb);
	rdma_link_unregister(&siw_link_ops);
	ib_unregister_driver(RDMA_DRIVER_SIW);

	siw_cm_exit();

	siw_destroy_cpulist();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP detached\n");
}

module_init(siw_init_module);
module_exit(siw_exit_module);

MODULE_ALIAS_RDMA_LINK("siw");
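
/*
 * Example module usage from user space (a sketch; device and interface
 * names are placeholders, and the iproute2 rdma tool is assumed):
 *
 *   modprobe siw
 *   rdma link add siw0 type siw netdev eth0
 *   rdma link show
 */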