/*
 * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
 * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
 * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
 * Copyright (c) 2005 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/module.h>
#include <net/neighbour.h>
#include <net/route.h>
#include <net/netevent.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <rdma/ib_addr.h>
#include <rdma/rdma_netlink.h>
#include <net/netlink.h>

#include "core_priv.h"

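/*
 * Each addr_req tracks one in-flight resolution request: it sits on
 * req_list ordered by timeout, is retried from the workqueue for as
 * long as its status is -ENODATA, and is completed through ->callback
 * once it resolves, times out, or is canceled.
 */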
struct addr_req {
        struct list_head list;
        struct sockaddr_storage src_addr;
        struct sockaddr_storage dst_addr;
        struct rdma_dev_addr *addr;
        struct rdma_addr_client *client;
        void *context;
        void (*callback)(int status, struct sockaddr *src_addr,
                         struct rdma_dev_addr *addr, void *context);
        unsigned long timeout;
        struct delayed_work work;
        int status;
        u32 seq;
};

static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);

static void process_req(struct work_struct *work);

static DEFINE_MUTEX(lock);
static LIST_HEAD(req_list);
static DECLARE_DELAYED_WORK(work, process_req);
static struct workqueue_struct *addr_wq;

static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
        [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
                .len = sizeof(struct rdma_nla_ls_gid)},
};

static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
{
        struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
        int ret;

        if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
                return false;

        ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
                        nlmsg_len(nlh), ib_nl_addr_policy, NULL);
        if (ret)
                return false;

        return true;
}

static void ib_nl_process_good_ip_rsp(const struct nlmsghdr *nlh)
{
        const struct nlattr *head, *curr;
        union ib_gid gid;
        struct addr_req *req;
        int len, rem;
        int found = 0;

        head = (const struct nlattr *)nlmsg_data(nlh);
        len = nlmsg_len(nlh);

        nla_for_each_attr(curr, head, len, rem) {
                if (curr->nla_type == LS_NLA_TYPE_DGID)
                        memcpy(&gid, nla_data(curr), nla_len(curr));
        }

        mutex_lock(&lock);
        list_for_each_entry(req, &req_list, list) {
                if (nlh->nlmsg_seq != req->seq)
                        continue;
                /* We set the DGID part, the rest was set earlier */
                rdma_addr_set_dgid(req->addr, &gid);
                req->status = 0;
                found = 1;
                break;
        }
        mutex_unlock(&lock);

        if (!found)
                pr_info("Couldn't find request waiting for DGID: %pI6\n",
                        &gid);
}

int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
                             struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
            !(NETLINK_CB(skb).sk))
                return -EPERM;

        if (ib_nl_is_good_ip_resp(nlh))
                ib_nl_process_good_ip_rsp(nlh);

        return skb->len;
}

static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
                             const void *daddr,
                             u32 seq, u16 family)
{
        struct sk_buff *skb = NULL;
        struct nlmsghdr *nlh;
        struct rdma_ls_ip_resolve_header *header;
        void *data;
        size_t size;
        int attrtype;
        int len;

        if (family == AF_INET) {
                size = sizeof(struct in_addr);
                attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
        } else {
                size = sizeof(struct in6_addr);
                attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
        }

        len = nla_total_size(sizeof(size));
        len += NLMSG_ALIGN(sizeof(*header));

        skb = nlmsg_new(len, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
                            RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
        if (!data) {
                nlmsg_free(skb);
                return -ENODATA;
        }

        /* Construct the family header first */
        header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
        header->ifindex = dev_addr->bound_dev_if;
        nla_put(skb, attrtype, size, daddr);

        /* Repair the nlmsg header length */
        nlmsg_end(skb, nlh);
        rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL);

        /* Make the request retry, so when we get the response from userspace
         * we will have something.
         */
        return -ENODATA;
}

int rdma_addr_size(struct sockaddr *addr)
{
        switch (addr->sa_family) {
        case AF_INET:
                return sizeof(struct sockaddr_in);
        case AF_INET6:
                return sizeof(struct sockaddr_in6);
        case AF_IB:
                return sizeof(struct sockaddr_ib);
        default:
                return 0;
        }
}
EXPORT_SYMBOL(rdma_addr_size);

static struct rdma_addr_client self;

void rdma_addr_register_client(struct rdma_addr_client *client)
{
        atomic_set(&client->refcount, 1);
        init_completion(&client->comp);
}
EXPORT_SYMBOL(rdma_addr_register_client);

static inline void put_client(struct rdma_addr_client *client)
{
        if (atomic_dec_and_test(&client->refcount))
                complete(&client->comp);
}

void rdma_addr_unregister_client(struct rdma_addr_client *client)
{
        put_client(client);
        wait_for_completion(&client->comp);
}
EXPORT_SYMBOL(rdma_addr_unregister_client);

void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
                    const struct net_device *dev,
                    const unsigned char *dst_dev_addr)
{
        dev_addr->dev_type = dev->type;
        memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
        memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
        if (dst_dev_addr)
                memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
        dev_addr->bound_dev_if = dev->ifindex;
}
EXPORT_SYMBOL(rdma_copy_addr);

int rdma_translate_ip(const struct sockaddr *addr,
                      struct rdma_dev_addr *dev_addr)
{
        struct net_device *dev;

        if (dev_addr->bound_dev_if) {
                dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
                if (!dev)
                        return -ENODEV;
                rdma_copy_addr(dev_addr, dev, NULL);
                dev_put(dev);
                return 0;
        }

        switch (addr->sa_family) {
        case AF_INET:
                dev = ip_dev_find(dev_addr->net,
                        ((const struct sockaddr_in *)addr)->sin_addr.s_addr);

                if (!dev)
                        return -EADDRNOTAVAIL;

                rdma_copy_addr(dev_addr, dev, NULL);
                dev_put(dev);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                rcu_read_lock();
                for_each_netdev_rcu(dev_addr->net, dev) {
                        if (ipv6_chk_addr(dev_addr->net,
                                          &((const struct sockaddr_in6 *)addr)->sin6_addr,
                                          dev, 1)) {
                                rdma_copy_addr(dev_addr, dev, NULL);
                                break;
                        }
                }
                rcu_read_unlock();
                break;
#endif
        }
        return 0;
}
EXPORT_SYMBOL(rdma_translate_ip);

static void set_timeout(struct delayed_work *delayed_work, unsigned long time)
{
        unsigned long delay;

        delay = time - jiffies;
        if ((long)delay < 0)
                delay = 0;

        mod_delayed_work(addr_wq, delayed_work, delay);
}

static void queue_req(struct addr_req *req)
{
        struct addr_req *temp_req;

        mutex_lock(&lock);
        list_for_each_entry_reverse(temp_req, &req_list, list) {
                if (time_after_eq(req->timeout, temp_req->timeout))
                        break;
        }

        list_add(&req->list, &temp_req->list);
        set_timeout(&req->work, req->timeout);
        mutex_unlock(&lock);
}

static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
                          const void *daddr, u32 seq, u16 family)
{
        if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
                return -EADDRNOTAVAIL;

        /* We fill in what we can, the response will fill the rest */
        rdma_copy_addr(dev_addr, dst->dev, NULL);
        return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
}

static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
                        const void *daddr)
{
        struct neighbour *n;
        int ret = 0;

        n = dst_neigh_lookup(dst, daddr);

        rcu_read_lock();
        if (!n || !(n->nud_state & NUD_VALID)) {
                if (n)
                        neigh_event_send(n, NULL);
                ret = -ENODATA;
        } else {
                rdma_copy_addr(dev_addr, dst->dev, n->ha);
        }
        rcu_read_unlock();

        if (n)
                neigh_release(n);

        return ret;
}

static bool has_gateway(struct dst_entry *dst, sa_family_t family)
{
        struct rtable *rt;
        struct rt6_info *rt6;

        if (family == AF_INET) {
                rt = container_of(dst, struct rtable, dst);
                return rt->rt_uses_gateway;
        }

        rt6 = container_of(dst, struct rt6_info, dst);
        return rt6->rt6i_flags & RTF_GATEWAY;
}

static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
                    const struct sockaddr *dst_in, u32 seq)
{
        const struct sockaddr_in *dst_in4 =
                (const struct sockaddr_in *)dst_in;
        const struct sockaddr_in6 *dst_in6 =
                (const struct sockaddr_in6 *)dst_in;
        const void *daddr = (dst_in->sa_family == AF_INET) ?
                (const void *)&dst_in4->sin_addr.s_addr :
                (const void *)&dst_in6->sin6_addr;
        sa_family_t family = dst_in->sa_family;

        /* Gateway + ARPHRD_INFINIBAND -> IB router */
        if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
                return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
        else
                return dst_fetch_ha(dst, dev_addr, daddr);
}

static int addr4_resolve(struct sockaddr_in *src_in,
                         const struct sockaddr_in *dst_in,
                         struct rdma_dev_addr *addr,
                         struct rtable **prt)
{
        __be32 src_ip = src_in->sin_addr.s_addr;
        __be32 dst_ip = dst_in->sin_addr.s_addr;
        struct rtable *rt;
        struct flowi4 fl4;
        int ret;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst_ip;
        fl4.saddr = src_ip;
        fl4.flowi4_oif = addr->bound_dev_if;
        rt = ip_route_output_key(addr->net, &fl4);
        ret = PTR_ERR_OR_ZERO(rt);
        if (ret)
                return ret;

        src_in->sin_family = AF_INET;
        src_in->sin_addr.s_addr = fl4.saddr;

        /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
         * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
         * type accordingly.
         */
        if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
                addr->network = RDMA_NETWORK_IPV4;

        addr->hoplimit = ip4_dst_hoplimit(&rt->dst);

        *prt = rt;
        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)
static int addr6_resolve(struct sockaddr_in6 *src_in,
                         const struct sockaddr_in6 *dst_in,
                         struct rdma_dev_addr *addr,
                         struct dst_entry **pdst)
{
        struct flowi6 fl6;
        struct dst_entry *dst;
        struct rt6_info *rt;
        int ret;

        memset(&fl6, 0, sizeof fl6);
        fl6.daddr = dst_in->sin6_addr;
        fl6.saddr = src_in->sin6_addr;
        fl6.flowi6_oif = addr->bound_dev_if;

        ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6);
        if (ret < 0)
                return ret;

        rt = (struct rt6_info *)dst;
        if (ipv6_addr_any(&src_in->sin6_addr)) {
                src_in->sin6_family = AF_INET6;
                src_in->sin6_addr = fl6.saddr;
        }

        /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
         * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
         * type accordingly.
         */
        if (rt->rt6i_flags & RTF_GATEWAY &&
            ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
                addr->network = RDMA_NETWORK_IPV6;

        addr->hoplimit = ip6_dst_hoplimit(dst);

        *pdst = dst;
        return 0;
}
#else
static int addr6_resolve(struct sockaddr_in6 *src_in,
                         const struct sockaddr_in6 *dst_in,
                         struct rdma_dev_addr *addr,
                         struct dst_entry **pdst)
{
        return -EADDRNOTAVAIL;
}
#endif

static int addr_resolve_neigh(struct dst_entry *dst,
                              const struct sockaddr *dst_in,
                              struct rdma_dev_addr *addr,
                              u32 seq)
{
        if (dst->dev->flags & IFF_LOOPBACK) {
                int ret;

                ret = rdma_translate_ip(dst_in, addr);
                if (!ret)
                        memcpy(addr->dst_dev_addr, addr->src_dev_addr,
                               MAX_ADDR_LEN);

                return ret;
        }

        /* If the device doesn't do ARP internally */
        if (!(dst->dev->flags & IFF_NOARP))
                return fetch_ha(dst, addr, dst_in, seq);

        rdma_copy_addr(addr, dst->dev, NULL);

        return 0;
}

static int addr_resolve(struct sockaddr *src_in,
                        const struct sockaddr *dst_in,
                        struct rdma_dev_addr *addr,
                        bool resolve_neigh,
                        u32 seq)
{
        struct net_device *ndev;
        struct dst_entry *dst;
        int ret;

        if (!addr->net) {
                pr_warn_ratelimited("%s: missing namespace\n", __func__);
                return -EINVAL;
        }

        if (src_in->sa_family == AF_INET) {
                struct rtable *rt = NULL;
                const struct sockaddr_in *dst_in4 =
                        (const struct sockaddr_in *)dst_in;

                ret = addr4_resolve((struct sockaddr_in *)src_in,
                                    dst_in4, addr, &rt);
                if (ret)
                        return ret;

                if (resolve_neigh)
                        ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);

                if (addr->bound_dev_if) {
                        ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
                } else {
                        ndev = rt->dst.dev;
                        dev_hold(ndev);
                }

                ip_rt_put(rt);
        } else {
                const struct sockaddr_in6 *dst_in6 =
                        (const struct sockaddr_in6 *)dst_in;

                ret = addr6_resolve((struct sockaddr_in6 *)src_in,
                                    dst_in6, addr, &dst);
                if (ret)
                        return ret;

                if (resolve_neigh)
                        ret = addr_resolve_neigh(dst, dst_in, addr, seq);

                if (addr->bound_dev_if) {
                        ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
                } else {
                        ndev = dst->dev;
                        dev_hold(ndev);
                }

                dst_release(dst);
        }

        if (!ndev)
                return -ENODEV;

        if (ndev->flags & IFF_LOOPBACK) {
                ret = rdma_translate_ip(dst_in, addr);
                /*
                 * Put the loopback device and get the translated
                 * device instead.
                 */
                dev_put(ndev);
                ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
        } else {
                addr->bound_dev_if = ndev->ifindex;
        }
        dev_put(ndev);

        return ret;
}

static void process_one_req(struct work_struct *_work)
{
        struct addr_req *req;
        struct sockaddr *src_in, *dst_in;

        mutex_lock(&lock);
        req = container_of(_work, struct addr_req, work.work);

        if (req->status == -ENODATA) {
                src_in = (struct sockaddr *)&req->src_addr;
                dst_in = (struct sockaddr *)&req->dst_addr;
                req->status = addr_resolve(src_in, dst_in, req->addr,
                                           true, req->seq);
                if (req->status && time_after_eq(jiffies, req->timeout)) {
                        req->status = -ETIMEDOUT;
                } else if (req->status == -ENODATA) {
                        /* requeue the work for retrying again */
                        set_timeout(&req->work, req->timeout);
                        mutex_unlock(&lock);
                        return;
                }
        }
        list_del(&req->list);
        mutex_unlock(&lock);

        req->callback(req->status, (struct sockaddr *)&req->src_addr,
                      req->addr, req->context);
        put_client(req->client);
        kfree(req);
}

static void process_req(struct work_struct *work)
{
        struct addr_req *req, *temp_req;
        struct sockaddr *src_in, *dst_in;
        struct list_head done_list;

        INIT_LIST_HEAD(&done_list);

        mutex_lock(&lock);
        list_for_each_entry_safe(req, temp_req, &req_list, list) {
                if (req->status == -ENODATA) {
                        src_in = (struct sockaddr *) &req->src_addr;
                        dst_in = (struct sockaddr *) &req->dst_addr;
                        req->status = addr_resolve(src_in, dst_in, req->addr,
                                                   true, req->seq);
                        if (req->status && time_after_eq(jiffies, req->timeout))
                                req->status = -ETIMEDOUT;
                        else if (req->status == -ENODATA) {
                                set_timeout(&req->work, req->timeout);
                                continue;
                        }
                }
                list_move_tail(&req->list, &done_list);
        }
        mutex_unlock(&lock);

        list_for_each_entry_safe(req, temp_req, &done_list, list) {
                list_del(&req->list);
                /* It is safe to cancel other work items from this work item
                 * because at a time there can be only one work item running
                 * with this single threaded work queue.
                 */
                cancel_delayed_work(&req->work);
                req->callback(req->status, (struct sockaddr *) &req->src_addr,
                        req->addr, req->context);
                put_client(req->client);
                kfree(req);
        }
}

int rdma_resolve_ip(struct rdma_addr_client *client,
                    struct sockaddr *src_addr, struct sockaddr *dst_addr,
                    struct rdma_dev_addr *addr, int timeout_ms,
                    void (*callback)(int status, struct sockaddr *src_addr,
                                     struct rdma_dev_addr *addr, void *context),
                    void *context)
{
        struct sockaddr *src_in, *dst_in;
        struct addr_req *req;
        int ret = 0;

        req = kzalloc(sizeof *req, GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        src_in = (struct sockaddr *) &req->src_addr;
        dst_in = (struct sockaddr *) &req->dst_addr;

        if (src_addr) {
                if (src_addr->sa_family != dst_addr->sa_family) {
                        ret = -EINVAL;
                        goto err;
                }

                memcpy(src_in, src_addr, rdma_addr_size(src_addr));
        } else {
                src_in->sa_family = dst_addr->sa_family;
        }

        memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
        req->addr = addr;
        req->callback = callback;
        req->context = context;
        req->client = client;
        atomic_inc(&client->refcount);
        INIT_DELAYED_WORK(&req->work, process_one_req);
        req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);

        req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
        switch (req->status) {
        case 0:
                req->timeout = jiffies;
                queue_req(req);
                break;
        case -ENODATA:
                req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
                queue_req(req);
                break;
        default:
                ret = req->status;
                atomic_dec(&client->refcount);
                goto err;
        }
        return ret;
err:
        kfree(req);
        return ret;
}
EXPORT_SYMBOL(rdma_resolve_ip);

int rdma_resolve_ip_route(struct sockaddr *src_addr,
                          const struct sockaddr *dst_addr,
                          struct rdma_dev_addr *addr)
{
        struct sockaddr_storage ssrc_addr = {};
        struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;

        if (src_addr) {
                if (src_addr->sa_family != dst_addr->sa_family)
                        return -EINVAL;

                memcpy(src_in, src_addr, rdma_addr_size(src_addr));
        } else {
                src_in->sa_family = dst_addr->sa_family;
        }

        return addr_resolve(src_in, dst_addr, addr, false, 0);
}
EXPORT_SYMBOL(rdma_resolve_ip_route);

void rdma_addr_cancel(struct rdma_dev_addr *addr)
{
        struct addr_req *req, *temp_req;

        mutex_lock(&lock);
        list_for_each_entry_safe(req, temp_req, &req_list, list) {
                if (req->addr == addr) {
                        req->status = -ECANCELED;
                        req->timeout = jiffies;
                        list_move(&req->list, &req_list);
                        set_timeout(&req->work, req->timeout);
                        break;
                }
        }
        mutex_unlock(&lock);
}
EXPORT_SYMBOL(rdma_addr_cancel);

struct resolve_cb_context {
        struct completion comp;
        int status;
};

static void resolve_cb(int status, struct sockaddr *src_addr,
                       struct rdma_dev_addr *addr, void *context)
{
        ((struct resolve_cb_context *)context)->status = status;
        complete(&((struct resolve_cb_context *)context)->comp);
}

int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
                                 const union ib_gid *dgid,
                                 u8 *dmac, const struct net_device *ndev,
                                 int *hoplimit)
{
        struct rdma_dev_addr dev_addr;
        struct resolve_cb_context ctx;
        union {
                struct sockaddr     _sockaddr;
                struct sockaddr_in  _sockaddr_in;
                struct sockaddr_in6 _sockaddr_in6;
        } sgid_addr, dgid_addr;
        int ret;

        rdma_gid2ip(&sgid_addr._sockaddr, sgid);
        rdma_gid2ip(&dgid_addr._sockaddr, dgid);

        memset(&dev_addr, 0, sizeof(dev_addr));
        dev_addr.bound_dev_if = ndev->ifindex;
        dev_addr.net = &init_net;

        init_completion(&ctx.comp);
        ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
                              &dev_addr, 1000, resolve_cb, &ctx);
        if (ret)
                return ret;

        wait_for_completion(&ctx.comp);

        ret = ctx.status;
        if (ret)
                return ret;

        memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
        *hoplimit = dev_addr.hoplimit;
        return 0;
}

static int netevent_callback(struct notifier_block *self, unsigned long event,
                             void *ctx)
{
        if (event == NETEVENT_NEIGH_UPDATE) {
                struct neighbour *neigh = ctx;

                if (neigh->nud_state & NUD_VALID)
                        set_timeout(&work, jiffies);
        }
        return 0;
}

= {
802 .notifier_call
= netevent_callback
807 addr_wq
= alloc_ordered_workqueue("ib_addr", 0);
811 register_netevent_notifier(&nb
);
812 rdma_addr_register_client(&self
);
void addr_cleanup(void)
{
        rdma_addr_unregister_client(&self);
        unregister_netevent_notifier(&nb);
        destroy_workqueue(addr_wq);
}