// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/pci.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/netpoll.h>
#include <linux/bpf.h>

#include <net/route.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>

#include "hyperv_net.h"

#define RING_SIZE_MIN	64
#define RETRY_US_LO	5000
#define RETRY_US_HI	10000
#define RETRY_MAX	2000	/* >10 sec */

#define LINKCHANGE_INT (2 * HZ)
#define VF_TAKEOVER_INT (HZ / 10)
static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, 0444);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
unsigned int netvsc_ring_bytes __ro_after_init;
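/* Usage sketch (values are only examples): the per-channel VMBus ring can be
 * enlarged at load time, e.g. "modprobe hv_netvsc ring_size=512"; the
 * parameter is read-only (0444) once the module is loaded.
 */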
static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
				NETIF_MSG_LINK | NETIF_MSG_IFUP |
				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
				NETIF_MSG_TX_ERR;

static int debug = -1;
module_param(debug, int, 0444);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
static LIST_HEAD(netvsc_dev_list);
static void netvsc_change_rx_flags(struct net_device *net, int change)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	int inc;

	if (!vf_netdev)
		return;

	if (change & IFF_PROMISC) {
		inc = (net->flags & IFF_PROMISC) ? 1 : -1;
		dev_set_promiscuity(vf_netdev, inc);
	}

	if (change & IFF_ALLMULTI) {
		inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
		dev_set_allmulti(vf_netdev, inc);
	}
}
static void netvsc_set_rx_mode(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev;
	struct netvsc_device *nvdev;

	rcu_read_lock();
	vf_netdev = rcu_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev) {
		dev_uc_sync(vf_netdev, net);
		dev_mc_sync(vf_netdev, net);
	}

	nvdev = rcu_dereference(ndev_ctx->nvdev);
	if (nvdev)
		rndis_filter_update(nvdev);
	rcu_read_unlock();
}
static void netvsc_tx_enable(struct netvsc_device *nvscdev,
			     struct net_device *ndev)
{
	nvscdev->tx_disable = false;
	virt_wmb(); /* ensure queue wake up mechanism is on */

	netif_tx_wake_all_queues(ndev);
}
static int netvsc_open(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
	struct rndis_device *rdev;
	int ret = 0;

	netif_carrier_off(net);

	/* Open up the device */
	ret = rndis_filter_open(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to open device (ret %d).\n", ret);
		return ret;
	}

	rdev = nvdev->extension;
	if (!rdev->link_state) {
		netif_carrier_on(net);
		netvsc_tx_enable(nvdev, net);
	}

	if (vf_netdev) {
		/* Setting synthetic device up transparently sets
		 * slave as up. If open fails, then slave will be
		 * still be offline (and not used).
		 */
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(net,
				    "unable to open slave: %s: %d\n",
				    vf_netdev->name, ret);
	}
	return 0;
}
static int netvsc_wait_until_empty(struct netvsc_device *nvdev)
{
	unsigned int retry = 0;
	int i;

	/* Ensure pending bytes in ring are read */
	for (;;) {
		u32 aread = 0;

		for (i = 0; i < nvdev->num_chn; i++) {
			struct vmbus_channel *chn
				= nvdev->chan_table[i].channel;

			if (!chn)
				continue;

			/* make sure receive not running now */
			napi_synchronize(&nvdev->chan_table[i].napi);

			aread = hv_get_bytes_to_read(&chn->inbound);
			if (aread)
				break;

			aread = hv_get_bytes_to_read(&chn->outbound);
			if (aread)
				break;
		}

		if (aread == 0)
			return 0;

		if (++retry > RETRY_MAX)
			return -ETIMEDOUT;

		usleep_range(RETRY_US_LO, RETRY_US_HI);
	}
}
static void netvsc_tx_disable(struct netvsc_device *nvscdev,
			      struct net_device *ndev)
{
	if (nvscdev) {
		nvscdev->tx_disable = true;
		virt_wmb(); /* ensure txq will not wake up after stop */
	}

	netif_tx_disable(ndev);
}
static int netvsc_close(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct net_device *vf_netdev
		= rtnl_dereference(net_device_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	int ret;

	netvsc_tx_disable(nvdev, net);

	/* No need to close rndis filter if it is removed already */
	if (!nvdev)
		return 0;

	ret = rndis_filter_close(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to close device (ret %d).\n", ret);
		return ret;
	}

	ret = netvsc_wait_until_empty(nvdev);
	if (ret)
		netdev_err(net, "Ring buffer not empty after closing rndis\n");

	if (vf_netdev)
		dev_close(vf_netdev);

	return ret;
}
static inline void *init_ppi_data(struct rndis_message *msg,
				  u32 ppi_size, u32 pkt_type)
{
	struct rndis_packet *rndis_pkt = &msg->msg.pkt;
	struct rndis_per_packet_info *ppi;

	rndis_pkt->data_offset += ppi_size;
	ppi = (void *)rndis_pkt + rndis_pkt->per_pkt_info_offset
		+ rndis_pkt->per_pkt_info_len;

	ppi->size = ppi_size;
	ppi->type = pkt_type;
	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);

	rndis_pkt->per_pkt_info_len += ppi_size;

	return ppi + 1;
}
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 */
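/* For example (eth0 is only a placeholder name), UDPv4 4-tuple hashing can
 * be toggled with "ethtool -N eth0 rx-flow-hash udp4 sdfn" (enable) or
 * "... udp4 sd" (addresses only); see netvsc_set_rss_hash_opts() below.
 */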
static inline u32 netvsc_get_hash(struct sk_buff *skb,
				  const struct net_device_context *ndc)
{
	struct flow_keys flow;
	u32 hash, pkt_proto = 0;
	static u32 hashrnd __read_mostly;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	switch (flow.basic.ip_proto) {
	case IPPROTO_TCP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_TCP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_TCP6_L4HASH;

		break;

	case IPPROTO_UDP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_UDP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_UDP6_L4HASH;

		break;
	}

	if (pkt_proto & ndc->l4_hash) {
		return skb_get_hash(skb);
	} else {
		if (flow.basic.n_proto == htons(ETH_P_IP))
			hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
		else
			return 0;

		__skb_set_sw_hash(skb, hash, false);
	}

	return hash;
}
static inline int netvsc_get_tx_queue(struct net_device *ndev,
				      struct sk_buff *skb, int old_idx)
{
	const struct net_device_context *ndc = netdev_priv(ndev);
	struct sock *sk = skb->sk;
	int q_idx;

	q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
			      (VRSS_SEND_TAB_SIZE - 1)];

	/* If queue index changed record the new value */
	if (q_idx != old_idx &&
	    sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
		sk_tx_queue_set(sk, q_idx);

	return q_idx;
}
/*
 * Select queue for transmit.
 *
 * If a valid queue has already been assigned, then use that.
 * Otherwise compute tx queue based on hash and the send table.
 *
 * This is basically similar to default (netdev_pick_tx) with the added step
 * of using the host send_table when no other queue has been assigned.
 *
 * TODO support XPS - but get_xps_queue not exported
 */
static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
	int q_idx = sk_tx_queue_get(skb->sk);

	if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
		/* If forwarding a packet, we use the recorded queue when
		 * available for better cache locality.
		 */
		if (skb_rx_queue_recorded(skb))
			q_idx = skb_get_rx_queue(skb);
		else
			q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
	}

	return q_idx;
}
static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
			       struct net_device *sb_dev)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev;
	u16 txq;

	rcu_read_lock();
	vf_netdev = rcu_dereference(ndc->vf_netdev);
	if (vf_netdev) {
		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;

		if (vf_ops->ndo_select_queue)
			txq = vf_ops->ndo_select_queue(vf_netdev, skb, sb_dev);
		else
			txq = netdev_pick_tx(vf_netdev, skb, NULL);

		/* Record the queue selected by VF so that it can be
		 * used for common case where VF has more queues than
		 * the synthetic device.
		 */
		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
	} else {
		txq = netvsc_pick_tx(ndev, skb);
	}
	rcu_read_unlock();

	while (unlikely(txq >= ndev->real_num_tx_queues))
		txq -= ndev->real_num_tx_queues;

	return txq;
}
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
		       struct hv_page_buffer *pb)
{
	int j = 0;

	/* Deal with compound pages by ignoring unused part
	 * of the page.
	 */
	page += (offset >> PAGE_SHIFT);
	offset &= ~PAGE_MASK;

	while (len > 0) {
		unsigned long bytes;

		bytes = PAGE_SIZE - offset;
		if (bytes > len)
			bytes = len;
		pb[j].pfn = page_to_pfn(page);
		pb[j].offset = offset;
		pb[j].len = bytes;

		offset += bytes;
		len -= bytes;

		if (offset == PAGE_SIZE && len) {
			page++;
			offset = 0;
			j++;
		}
	}

	return j + 1;
}
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
			   struct hv_netvsc_packet *packet,
			   struct hv_page_buffer *pb)
{
	u32 slots_used = 0;
	char *data = skb->data;
	int frags = skb_shinfo(skb)->nr_frags;
	int i;

	/* The packet is laid out thus:
	 * 1. hdr: RNDIS header and PPI
	 * 2. skb linear data
	 * 3. skb fragment data
	 */
	slots_used += fill_pg_buf(virt_to_page(hdr),
				  offset_in_page(hdr),
				  len, &pb[slots_used]);

	packet->rmsg_size = len;
	packet->rmsg_pgcnt = slots_used;

	slots_used += fill_pg_buf(virt_to_page(data),
				  offset_in_page(data),
				  skb_headlen(skb), &pb[slots_used]);

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;

		slots_used += fill_pg_buf(skb_frag_page(frag),
					  skb_frag_off(frag),
					  skb_frag_size(frag), &pb[slots_used]);
	}
	return slots_used;
}
static int count_skb_frag_slots(struct sk_buff *skb)
{
	int i, frags = skb_shinfo(skb)->nr_frags;
	int pages = 0;

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
		unsigned long size = skb_frag_size(frag);
		unsigned long offset = skb_frag_off(frag);

		/* Skip unused frames from start of page */
		offset &= ~PAGE_MASK;
		pages += PFN_UP(offset + size);
	}
	return pages;
}
static int netvsc_get_slots(struct sk_buff *skb)
{
	char *data = skb->data;
	unsigned int offset = offset_in_page(data);
	unsigned int len = skb_headlen(skb);
	int slots;
	int frag_slots;

	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	frag_slots = count_skb_frag_slots(skb);
	return slots + frag_slots;
}
static u32 net_checksum_info(struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *ip = ip_hdr(skb);

		if (ip->protocol == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV4_TCP;
		else if (ip->protocol == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV4_UDP;
	} else {
		struct ipv6hdr *ip6 = ipv6_hdr(skb);

		if (ip6->nexthdr == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV6_TCP;
		else if (ip6->nexthdr == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV6_UDP;
	}

	return TRANSPORT_INFO_NOT_IP;
}
/* Send skb on the slave VF device. */
static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
			  struct sk_buff *skb)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	unsigned int len = skb->len;
	int rc;

	skb->dev = vf_netdev;
	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;

	rc = dev_queue_xmit(skb);
	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
		struct netvsc_vf_pcpu_stats *pcpu_stats
			= this_cpu_ptr(ndev_ctx->vf_stats);

		u64_stats_update_begin(&pcpu_stats->syncp);
		pcpu_stats->tx_packets++;
		pcpu_stats->tx_bytes += len;
		u64_stats_update_end(&pcpu_stats->syncp);
	} else {
		this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
	}

	return rc;
}
static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_netvsc_packet *packet = NULL;
	int ret;
	unsigned int num_data_pgs;
	struct rndis_message *rndis_msg;
	struct net_device *vf_netdev;
	u32 rndis_msg_size;
	u32 hash;
	struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];

	/* if VF is present and up then redirect packets
	 * already called with rcu_read_lock_bh
	 */
	vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
	if (vf_netdev && netif_running(vf_netdev) &&
	    !netpoll_tx_running(net))
		return netvsc_vf_xmit(net, vf_netdev, skb);

	/* We will atmost need two pages to describe the rndis
	 * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
	 * of pages in a single packet. If skb is scattered around
	 * more pages we try linearizing it.
	 */
	num_data_pgs = netvsc_get_slots(skb) + 2;

	if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
		++net_device_ctx->eth_stats.tx_scattered;

		if (skb_linearize(skb))
			goto no_memory;

		num_data_pgs = netvsc_get_slots(skb) + 2;
		if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
			++net_device_ctx->eth_stats.tx_too_big;
			goto drop;
		}
	}

	/*
	 * Place the rndis header in the skb head room and
	 * the skb->cb will be used for hv_netvsc_packet
	 */
	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
	if (ret)
		goto no_memory;

	/* Use the skb control buffer for building up the packet */
	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
			sizeof_field(struct sk_buff, cb));
	packet = (struct hv_netvsc_packet *)skb->cb;

	packet->q_idx = skb_get_queue_mapping(skb);

	packet->total_data_buflen = skb->len;
	packet->total_bytes = skb->len;
	packet->total_packets = 1;

	rndis_msg = (struct rndis_message *)skb->head;

	/* Add the rndis header */
	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
	rndis_msg->msg_len = packet->total_data_buflen;

	rndis_msg->msg.pkt = (struct rndis_packet) {
		.data_offset = sizeof(struct rndis_packet),
		.data_len = packet->total_data_buflen,
		.per_pkt_info_offset = sizeof(struct rndis_packet),
	};

	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);

	hash = skb_get_hash_raw(skb);
	if (hash != 0 && net->real_num_tx_queues > 1) {
		u32 *hash_info;

		rndis_msg_size += NDIS_HASH_PPI_SIZE;
		hash_info = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
					  NBL_HASH_VALUE);
		*hash_info = hash;
	}

	if (skb_vlan_tag_present(skb)) {
		struct ndis_pkt_8021q_info *vlan;

		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
		vlan = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
				     IEEE_8021Q_INFO);

		vlan->vlanid = skb_vlan_tag_get_id(skb);
		vlan->cfi = skb_vlan_tag_get_cfi(skb);
		vlan->pri = skb_vlan_tag_get_prio(skb);
	}

	if (skb_is_gso(skb)) {
		struct ndis_tcp_lso_info *lso_info;

		rndis_msg_size += NDIS_LSO_PPI_SIZE;
		lso_info = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
					 TCP_LARGESEND_PKTINFO);

		lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
		if (skb->protocol == htons(ETH_P_IP)) {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
			ip_hdr(skb)->tot_len = 0;
			ip_hdr(skb)->check = 0;
			tcp_hdr(skb)->check =
				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
						   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
		} else {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
			tcp_v6_gso_csum_prep(skb);
		}
		lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
		lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
			struct ndis_tcp_ip_checksum_info *csum_info;

			rndis_msg_size += NDIS_CSUM_PPI_SIZE;
			csum_info = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
						  TCPIP_CHKSUM_PKTINFO);

			csum_info->value = 0;
			csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);

			if (skb->protocol == htons(ETH_P_IP)) {
				csum_info->transmit.is_ipv4 = 1;

				if (ip_hdr(skb)->protocol == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			} else {
				csum_info->transmit.is_ipv6 = 1;

				if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			}
		} else {
			/* Can't do offload of this type of checksum */
			if (skb_checksum_help(skb))
				goto drop;
		}
	}

	/* Start filling in the page buffers with the rndis hdr */
	rndis_msg->msg_len += rndis_msg_size;
	packet->total_data_buflen = rndis_msg->msg_len;
	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
					       skb, packet, pb);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx);
	if (likely(ret == 0))
		return NETDEV_TX_OK;

	if (ret == -EAGAIN) {
		++net_device_ctx->eth_stats.tx_busy;
		return NETDEV_TX_BUSY;
	}

	if (ret == -ENOSPC)
		++net_device_ctx->eth_stats.tx_no_space;

drop:
	dev_kfree_skb_any(skb);
	net->stats.tx_dropped++;

	return NETDEV_TX_OK;

no_memory:
	++net_device_ctx->eth_stats.tx_no_memory;
	goto drop;
}
static netdev_tx_t netvsc_start_xmit(struct sk_buff *skb,
				     struct net_device *ndev)
{
	return netvsc_xmit(skb, ndev, false);
}
/*
 * netvsc_linkstatus_callback - Link up/down notification
 */
void netvsc_linkstatus_callback(struct net_device *net,
				struct rndis_message *resp)
{
	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_reconfig *event;
	unsigned long flags;

	/* Update the physical link speed when changing to another vSwitch */
	if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
		u32 speed;

		speed = *(u32 *)((void *)indicate
				 + indicate->status_buf_offset) / 10000;
		ndev_ctx->speed = speed;
		return;
	}

	/* Handle these link change statuses below */
	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
		return;

	if (net->reg_state != NETREG_REGISTERED)
		return;

	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event)
		return;
	event->event = indicate->status;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	schedule_delayed_work(&ndev_ctx->dwork, 0);
}
static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	int rc;

	skb->queue_mapping = skb_get_rx_queue(skb);
	__skb_push(skb, ETH_HLEN);

	rc = netvsc_xmit(skb, ndev, true);

	if (dev_xmit_complete(rc))
		return;

	dev_kfree_skb_any(skb);
	ndev->stats.tx_dropped++;
}
static void netvsc_comp_ipcsum(struct sk_buff *skb)
{
	struct iphdr *iph = (struct iphdr *)skb->data;

	iph->check = 0;
	iph->check = ip_fast_csum(iph, iph->ihl);
}
static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
					     struct netvsc_channel *nvchan,
					     struct xdp_buff *xdp)
{
	struct napi_struct *napi = &nvchan->napi;
	const struct ndis_pkt_8021q_info *vlan = nvchan->rsc.vlan;
	const struct ndis_tcp_ip_checksum_info *csum_info =
		nvchan->rsc.csum_info;
	const u32 *hash_info = nvchan->rsc.hash_info;
	struct sk_buff *skb;
	void *xbuf = xdp->data_hard_start;
	int i;

	if (xbuf) {
		unsigned int hdroom = xdp->data - xdp->data_hard_start;
		unsigned int xlen = xdp->data_end - xdp->data;
		unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen);

		skb = build_skb(xbuf, frag_size);

		if (!skb) {
			__free_page(virt_to_page(xbuf));
			return NULL;
		}

		skb_reserve(skb, hdroom);
		skb_put(skb, xlen);
		skb->dev = napi->dev;
	} else {
		skb = napi_alloc_skb(napi, nvchan->rsc.pktlen);

		if (!skb)
			return NULL;

		/* Copy to skb. This copy is needed here since the memory
		 * pointed by hv_netvsc_packet cannot be deallocated.
		 */
		for (i = 0; i < nvchan->rsc.cnt; i++)
			skb_put_data(skb, nvchan->rsc.data[i],
				     nvchan->rsc.len[i]);
	}

	skb->protocol = eth_type_trans(skb, net);

	/* skb is already created with CHECKSUM_NONE */
	skb_checksum_none_assert(skb);

	/* Incoming packets may have IP header checksum verified by the host.
	 * They may not have IP header checksum computed after coalescing.
	 * We compute it here if the flags are set, because on Linux, the IP
	 * checksum is always checked.
	 */
	if (csum_info && csum_info->receive.ip_checksum_value_invalid &&
	    csum_info->receive.ip_checksum_succeeded &&
	    skb->protocol == htons(ETH_P_IP))
		netvsc_comp_ipcsum(skb);

	/* Do L4 checksum offload if enabled and present. */
	if (csum_info && (net->features & NETIF_F_RXCSUM)) {
		if (csum_info->receive.tcp_checksum_succeeded ||
		    csum_info->receive.udp_checksum_succeeded)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	if (hash_info && (net->features & NETIF_F_RXHASH))
		skb_set_hash(skb, *hash_info, PKT_HASH_TYPE_L4);

	if (vlan) {
		u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT) |
			(vlan->cfi ? VLAN_CFI_MASK : 0);

		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       vlan_tci);
	}

	return skb;
}
/*
 * netvsc_recv_callback -  Callback when we receive a packet from the
 * "wire" on the specified device.
 */
int netvsc_recv_callback(struct net_device *net,
			 struct netvsc_device *net_device,
			 struct netvsc_channel *nvchan)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct vmbus_channel *channel = nvchan->channel;
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	struct sk_buff *skb;
	struct netvsc_stats *rx_stats = &nvchan->rx_stats;
	struct xdp_buff xdp;
	u32 act;

	if (net->reg_state != NETREG_REGISTERED)
		return NVSP_STAT_FAIL;

	act = netvsc_run_xdp(net, nvchan, &xdp);

	if (act != XDP_PASS && act != XDP_TX) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->xdp_drop++;
		u64_stats_update_end(&rx_stats->syncp);

		return NVSP_STAT_SUCCESS; /* consumed by XDP */
	}

	/* Allocate a skb - TODO direct I/O to pages? */
	skb = netvsc_alloc_recv_skb(net, nvchan, &xdp);

	if (unlikely(!skb)) {
		++net_device_ctx->eth_stats.rx_no_memory;
		return NVSP_STAT_FAIL;
	}

	skb_record_rx_queue(skb, q_idx);

	/*
	 * Even if injecting the packet, record the statistics
	 * on the synthetic device because modifying the VF device
	 * statistics will not work correctly.
	 */
	u64_stats_update_begin(&rx_stats->syncp);
	rx_stats->packets++;
	rx_stats->bytes += nvchan->rsc.pktlen;

	if (skb->pkt_type == PACKET_BROADCAST)
		++rx_stats->broadcast;
	else if (skb->pkt_type == PACKET_MULTICAST)
		++rx_stats->multicast;
	u64_stats_update_end(&rx_stats->syncp);

	if (act == XDP_TX) {
		netvsc_xdp_xmit(skb, net);
		return NVSP_STAT_SUCCESS;
	}

	napi_gro_receive(&nvchan->napi, skb);
	return NVSP_STAT_SUCCESS;
}
static void netvsc_get_drvinfo(struct net_device *net,
			       struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}
static void netvsc_get_channels(struct net_device *net,
				struct ethtool_channels *channel)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);

	if (nvdev) {
		channel->max_combined = nvdev->max_chn;
		channel->combined_count = nvdev->num_chn;
	}
}
/* Alloc struct netvsc_device_info, and initialize it from either existing
 * struct netvsc_device, or from default values.
 */
static
struct netvsc_device_info *netvsc_devinfo_get(struct netvsc_device *nvdev)
{
	struct netvsc_device_info *dev_info;
	struct bpf_prog *prog;

	dev_info = kzalloc(sizeof(*dev_info), GFP_ATOMIC);

	if (!dev_info)
		return NULL;

	if (nvdev) {
		dev_info->num_chn = nvdev->num_chn;
		dev_info->send_sections = nvdev->send_section_cnt;
		dev_info->send_section_size = nvdev->send_section_size;
		dev_info->recv_sections = nvdev->recv_section_cnt;
		dev_info->recv_section_size = nvdev->recv_section_size;

		memcpy(dev_info->rss_key, nvdev->extension->rss_key,
		       NETVSC_HASH_KEYLEN);

		prog = netvsc_xdp_get(nvdev);
		if (prog) {
			bpf_prog_inc(prog);
			dev_info->bprog = prog;
		}
	} else {
		dev_info->num_chn = VRSS_CHANNEL_DEFAULT;
		dev_info->send_sections = NETVSC_DEFAULT_TX;
		dev_info->send_section_size = NETVSC_SEND_SECTION_SIZE;
		dev_info->recv_sections = NETVSC_DEFAULT_RX;
		dev_info->recv_section_size = NETVSC_RECV_SECTION_SIZE;
	}

	return dev_info;
}
/* Free struct netvsc_device_info */
static void netvsc_devinfo_put(struct netvsc_device_info *dev_info)
{
	if (dev_info->bprog) {
		bpf_prog_put(dev_info->bprog);
	}

	kfree(dev_info);
}
*ndev
,
996 struct netvsc_device
*nvdev
)
998 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
999 struct hv_device
*hdev
= ndev_ctx
->device_ctx
;
1002 /* Don't try continuing to try and setup sub channels */
1003 if (cancel_work_sync(&nvdev
->subchan_work
))
1006 netvsc_xdp_set(ndev
, NULL
, NULL
, nvdev
);
1008 /* If device was up (receiving) then shutdown */
1009 if (netif_running(ndev
)) {
1010 netvsc_tx_disable(nvdev
, ndev
);
1012 ret
= rndis_filter_close(nvdev
);
1015 "unable to close device (ret %d).\n", ret
);
1019 ret
= netvsc_wait_until_empty(nvdev
);
1022 "Ring buffer not empty after closing rndis\n");
1027 netif_device_detach(ndev
);
1029 rndis_filter_device_remove(hdev
, nvdev
);
static int netvsc_attach(struct net_device *ndev,
			 struct netvsc_device_info *dev_info)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct hv_device *hdev = ndev_ctx->device_ctx;
	struct netvsc_device *nvdev;
	struct rndis_device *rdev;
	struct bpf_prog *prog;
	int ret = 0;

	nvdev = rndis_filter_device_add(hdev, dev_info);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	if (nvdev->num_chn > 1) {
		ret = rndis_set_subchannel(ndev, nvdev, dev_info);

		/* if unavailable, just proceed with one queue */
		if (ret) {
			nvdev->max_chn = 1;
			nvdev->num_chn = 1;
		}
	}

	prog = dev_info->bprog;
	if (prog) {
		bpf_prog_inc(prog);
		ret = netvsc_xdp_set(ndev, prog, NULL, nvdev);
		if (ret) {
			bpf_prog_put(prog);
			goto err1;
		}
	}

	/* In any case device is now ready */
	nvdev->tx_disable = false;
	netif_device_attach(ndev);

	/* Note: enable and attach happen when sub-channels setup */
	netif_carrier_off(ndev);

	if (netif_running(ndev)) {
		ret = rndis_filter_open(nvdev);
		if (ret)
			goto err2;

		rdev = nvdev->extension;
		if (!rdev->link_state)
			netif_carrier_on(ndev);
	}

	return 0;

err2:
	netif_device_detach(ndev);

err1:
	rndis_filter_device_remove(hdev, nvdev);

	return ret;
}
static int netvsc_set_channels(struct net_device *net,
			       struct ethtool_channels *channels)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	unsigned int orig, count = channels->combined_count;
	struct netvsc_device_info *device_info;
	int ret;

	/* We do not support separate count for rx, tx, or other */
	if (count == 0 ||
	    channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
		return -EINVAL;

	if (count > nvdev->max_chn)
		return -EINVAL;

	orig = nvdev->num_chn;

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	device_info->num_chn = count;

	ret = netvsc_detach(net, nvdev);
	if (ret)
		goto out;

	ret = netvsc_attach(net, device_info);
	if (ret) {
		device_info->num_chn = orig;
		if (netvsc_attach(net, device_info))
			netdev_err(net, "restoring channel setting failed\n");
	}

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
*dev
)
1146 struct net_device_context
*ndc
= netdev_priv(dev
);
1148 ndc
->l4_hash
= HV_DEFAULT_L4HASH
;
1150 ndc
->speed
= SPEED_UNKNOWN
;
1151 ndc
->duplex
= DUPLEX_FULL
;
1153 dev
->features
= NETIF_F_LRO
;
static int netvsc_get_link_ksettings(struct net_device *dev,
				     struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct net_device *vf_netdev;

	vf_netdev = rtnl_dereference(ndc->vf_netdev);

	if (vf_netdev)
		return __ethtool_get_link_ksettings(vf_netdev, cmd);

	cmd->base.speed = ndc->speed;
	cmd->base.duplex = ndc->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}
static int netvsc_set_link_ksettings(struct net_device *dev,
				     const struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);

	if (vf_netdev) {
		if (!vf_netdev->ethtool_ops->set_link_ksettings)
			return -EOPNOTSUPP;

		return vf_netdev->ethtool_ops->set_link_ksettings(vf_netdev,
								   cmd);
	}

	return ethtool_virtdev_set_link_ksettings(dev, cmd,
						  &ndc->speed, &ndc->duplex);
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	int orig_mtu = ndev->mtu;
	struct netvsc_device_info *device_info;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	/* Change MTU of underlying VF netdev first. */
	if (vf_netdev) {
		ret = dev_set_mtu(vf_netdev, mtu);
		if (ret)
			goto out;
	}

	ret = netvsc_detach(ndev, nvdev);
	if (ret)
		goto rollback_vf;

	ndev->mtu = mtu;

	ret = netvsc_attach(ndev, device_info);
	if (!ret)
		goto out;

	/* Attempt rollback to original MTU */
	ndev->mtu = orig_mtu;

	if (netvsc_attach(ndev, device_info))
		netdev_err(ndev, "restoring mtu failed\n");
rollback_vf:
	if (vf_netdev)
		dev_set_mtu(vf_netdev, orig_mtu);

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
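/* An MTU change (e.g. "ip link set dev eth0 mtu 1400", name is only an
 * example) is thus applied to the VF first and then re-negotiated with the
 * host by detaching and re-attaching the synthetic device.
 */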
static void netvsc_get_vf_stats(struct net_device *net,
				struct netvsc_vf_pcpu_stats *tot)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	int i;

	memset(tot, 0, sizeof(*tot));

	for_each_possible_cpu(i) {
		const struct netvsc_vf_pcpu_stats *stats
			= per_cpu_ptr(ndev_ctx->vf_stats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			rx_packets = stats->rx_packets;
			tx_packets = stats->tx_packets;
			rx_bytes = stats->rx_bytes;
			tx_bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
		tot->tx_dropped += stats->tx_dropped;
	}
}
static void netvsc_get_pcpu_stats(struct net_device *net,
				  struct netvsc_ethtool_pcpu_stats *pcpu_tot)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
	int i;

	/* fetch percpu stats of vf */
	for_each_possible_cpu(i) {
		const struct netvsc_vf_pcpu_stats *stats =
			per_cpu_ptr(ndev_ctx->vf_stats, i);
		struct netvsc_ethtool_pcpu_stats *this_tot = &pcpu_tot[i];
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			this_tot->vf_rx_packets = stats->rx_packets;
			this_tot->vf_tx_packets = stats->tx_packets;
			this_tot->vf_rx_bytes = stats->rx_bytes;
			this_tot->vf_tx_bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		this_tot->rx_packets = this_tot->vf_rx_packets;
		this_tot->tx_packets = this_tot->vf_tx_packets;
		this_tot->rx_bytes   = this_tot->vf_rx_bytes;
		this_tot->tx_bytes   = this_tot->vf_tx_bytes;
	}

	/* fetch percpu stats of netvsc */
	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		struct netvsc_ethtool_pcpu_stats *this_tot =
			&pcpu_tot[nvchan->channel->target_cpu];
		u64 packets, bytes;
		unsigned int start;

		stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		this_tot->tx_bytes	+= bytes;
		this_tot->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		this_tot->rx_bytes	+= bytes;
		this_tot->rx_packets	+= packets;
	}
}
static void netvsc_get_stats64(struct net_device *net,
			       struct rtnl_link_stats64 *t)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_device *nvdev;
	struct netvsc_vf_pcpu_stats vf_tot;
	int i;

	rcu_read_lock();

	nvdev = rcu_dereference(ndev_ctx->nvdev);
	if (!nvdev)
		goto out;

	netdev_stats_to_stats64(t, &net->stats);

	netvsc_get_vf_stats(net, &vf_tot);
	t->rx_packets += vf_tot.rx_packets;
	t->tx_packets += vf_tot.tx_packets;
	t->rx_bytes   += vf_tot.rx_bytes;
	t->tx_bytes   += vf_tot.tx_bytes;
	t->tx_dropped += vf_tot.tx_dropped;

	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		u64 packets, bytes, multicast;
		unsigned int start;

		stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->tx_bytes	+= bytes;
		t->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
			multicast = stats->multicast + stats->broadcast;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->rx_bytes	+= bytes;
		t->rx_packets	+= packets;
		t->multicast	+= multicast;
	}
out:
	rcu_read_unlock();
}
static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	struct sockaddr *addr = p;
	int err;

	err = eth_prepare_mac_addr_change(ndev, p);
	if (err)
		return err;

	if (!nvdev)
		return -ENODEV;

	if (vf_netdev) {
		err = dev_set_mac_address(vf_netdev, addr, NULL);
		if (err)
			return err;
	}

	err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
	if (!err) {
		eth_commit_mac_addr_change(ndev, p);
	} else if (vf_netdev) {
		/* rollback change on VF */
		memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
		dev_set_mac_address(vf_netdev, addr, NULL);
	}

	return err;
}
static const struct {
	char name[ETH_GSTRING_LEN];
	u16 offset;
} netvsc_stats[] = {
	{ "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
	{ "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
	{ "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
	{ "tx_too_big",	  offsetof(struct netvsc_ethtool_stats, tx_too_big) },
	{ "tx_busy",	  offsetof(struct netvsc_ethtool_stats, tx_busy) },
	{ "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
	{ "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
	{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
	{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
	{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
}, pcpu_stats[] = {
	{ "cpu%u_rx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, rx_packets) },
	{ "cpu%u_rx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, rx_bytes) },
	{ "cpu%u_tx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, tx_packets) },
	{ "cpu%u_tx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, tx_bytes) },
	{ "cpu%u_vf_rx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_packets) },
	{ "cpu%u_vf_rx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_rx_bytes) },
	{ "cpu%u_vf_tx_packets",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_packets) },
	{ "cpu%u_vf_tx_bytes",
		offsetof(struct netvsc_ethtool_pcpu_stats, vf_tx_bytes) },
}, vf_stats[] = {
	{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
	{ "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
	{ "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
	{ "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
	{ "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
};
#define NETVSC_GLOBAL_STATS_LEN	ARRAY_SIZE(netvsc_stats)
#define NETVSC_VF_STATS_LEN	ARRAY_SIZE(vf_stats)

/* statistics per queue (rx/tx packets/bytes) */
#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))

/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (string_set) {
	case ETH_SS_STATS:
		return NETVSC_GLOBAL_STATS_LEN
			+ NETVSC_VF_STATS_LEN
			+ NETVSC_QUEUE_STATS_LEN(nvdev)
			+ NETVSC_PCPU_STATS_LEN;
	default:
		return -EINVAL;
	}
}
static void netvsc_get_ethtool_stats(struct net_device *dev,
				     struct ethtool_stats *stats, u64 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	const void *nds = &ndc->eth_stats;
	const struct netvsc_stats *qstats;
	struct netvsc_vf_pcpu_stats sum;
	struct netvsc_ethtool_pcpu_stats *pcpu_sum;
	unsigned int start;
	u64 packets, bytes;
	u64 xdp_drop;
	int i, j, cpu;

	if (!nvdev)
		return;

	for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
		data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);

	netvsc_get_vf_stats(dev, &sum);
	for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
		data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);

	for (j = 0; j < nvdev->num_chn; j++) {
		qstats = &nvdev->chan_table[j].tx_stats;

		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;

		qstats = &nvdev->chan_table[j].rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
			xdp_drop = qstats->xdp_drop;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;
		data[i++] = xdp_drop;
	}

	pcpu_sum = kvmalloc_array(num_possible_cpus(),
				  sizeof(struct netvsc_ethtool_pcpu_stats),
				  GFP_KERNEL);
	netvsc_get_pcpu_stats(dev, pcpu_sum);
	for_each_present_cpu(cpu) {
		struct netvsc_ethtool_pcpu_stats *this_sum = &pcpu_sum[cpu];

		for (j = 0; j < ARRAY_SIZE(pcpu_stats); j++)
			data[i++] = *(u64 *)((void *)this_sum
					     + pcpu_stats[j].offset);
	}
	kvfree(pcpu_sum);
}
static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	u8 *p = data;
	int i, cpu;

	if (!nvdev)
		return;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
			memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
			memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < nvdev->num_chn; i++) {
			sprintf(p, "tx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "tx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_xdp_drop", i);
			p += ETH_GSTRING_LEN;
		}

		for_each_present_cpu(cpu) {
			for (i = 0; i < ARRAY_SIZE(pcpu_stats); i++) {
				sprintf(p, pcpu_stats[i].name, cpu);
				p += ETH_GSTRING_LEN;
			}
		}

		break;
	}
}
static int
netvsc_get_rss_hash_opts(struct net_device_context *ndc,
			 struct ethtool_rxnfc *info)
{
	const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;

	info->data = RXH_IP_SRC | RXH_IP_DST;

	switch (info->flow_type) {
	case TCP_V4_FLOW:
		if (ndc->l4_hash & HV_TCP4_L4HASH)
			info->data |= l4_flag;

		break;

	case TCP_V6_FLOW:
		if (ndc->l4_hash & HV_TCP6_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V4_FLOW:
		if (ndc->l4_hash & HV_UDP4_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V6_FLOW:
		if (ndc->l4_hash & HV_UDP6_L4HASH)
			info->data |= l4_flag;

		break;

	default:
		info->data = 0;
		break;
	}

	return 0;
}
static int
netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
		 u32 *rules)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (info->cmd) {
	case ETHTOOL_GRXRINGS:
		info->data = nvdev->num_chn;
		return 0;

	case ETHTOOL_GRXFH:
		return netvsc_get_rss_hash_opts(ndc, info);
	}
	return -EOPNOTSUPP;
}
static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
				    struct ethtool_rxnfc *info)
{
	if (info->data == (RXH_IP_SRC | RXH_IP_DST |
			   RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash |= HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash |= HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash |= HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash |= HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash &= ~HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash &= ~HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash &= ~HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash &= ~HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	return -EOPNOTSUPP;
}
static int
netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
{
	struct net_device_context *ndc = netdev_priv(ndev);

	if (info->cmd == ETHTOOL_SRXFH)
		return netvsc_set_rss_hash_opts(ndc, info);

	return -EOPNOTSUPP;
}
static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
{
	return NETVSC_HASH_KEYLEN;
}

static u32 netvsc_rss_indir_size(struct net_device *dev)
{
	return ITAB_NUM;
}
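/* These sizes back "ethtool -x eth0" / "ethtool -X" (interface name is only
 * an example): the Toeplitz key is NETVSC_HASH_KEYLEN bytes and the
 * indirection table has ITAB_NUM entries.
 */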
static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
			   u8 *hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			indir[i] = ndc->rx_table[i];
	}

	if (key)
		memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);

	return 0;
}
static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
			   const u8 *key, const u8 hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			if (indir[i] >= ndev->num_chn)
				return -EINVAL;

		for (i = 0; i < ITAB_NUM; i++)
			ndc->rx_table[i] = indir[i];
	}

	if (!key) {
		if (!indir)
			return 0;

		key = rndis_dev->rss_key;
	}

	return rndis_filter_set_rss_param(rndis_dev, key);
}
/* Hyper-V RNDIS protocol does not have ring in the HW sense.
 * It does have pre-allocated receive area which is divided into sections.
 */
static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
				   struct ethtool_ringparam *ring)
{
	u32 max_buf_size;

	ring->rx_pending = nvdev->recv_section_cnt;
	ring->tx_pending = nvdev->send_section_cnt;

	if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
	else
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;

	ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
	ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
		/ nvdev->send_section_size;
}
static void netvsc_get_ringparam(struct net_device *ndev,
				 struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);

	if (!nvdev)
		return;

	__netvsc_get_ringparam(nvdev, ring);
}
static int netvsc_set_ringparam(struct net_device *ndev,
				struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct netvsc_device_info *device_info;
	struct ethtool_ringparam orig;
	u32 new_tx, new_rx;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	memset(&orig, 0, sizeof(orig));
	__netvsc_get_ringparam(nvdev, &orig);

	new_tx = clamp_t(u32, ring->tx_pending,
			 NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
	new_rx = clamp_t(u32, ring->rx_pending,
			 NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);

	if (new_tx == orig.tx_pending &&
	    new_rx == orig.rx_pending)
		return 0;	 /* no change */

	device_info = netvsc_devinfo_get(nvdev);

	if (!device_info)
		return -ENOMEM;

	device_info->send_sections = new_tx;
	device_info->recv_sections = new_rx;

	ret = netvsc_detach(ndev, nvdev);
	if (ret)
		goto out;

	ret = netvsc_attach(ndev, device_info);
	if (ret) {
		device_info->send_sections = orig.tx_pending;
		device_info->recv_sections = orig.rx_pending;

		if (netvsc_attach(ndev, device_info))
			netdev_err(ndev, "restoring ringparam failed");
	}

out:
	netvsc_devinfo_put(device_info);
	return ret;
}
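/* Ring sizing maps onto "ethtool -G eth0 rx <sections> tx <sections>"
 * (example name); the values are section counts of the pre-allocated
 * receive/send buffers rather than hardware descriptor rings.
 */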
static netdev_features_t netvsc_fix_features(struct net_device *ndev,
					     netdev_features_t features)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);

	if (!nvdev || nvdev->destroy)
		return features;

	if ((features & NETIF_F_LRO) && netvsc_xdp_get(nvdev)) {
		features ^= NETIF_F_LRO;
		netdev_info(ndev, "Skip LRO - unsupported with XDP\n");
	}

	return features;
}
static int netvsc_set_features(struct net_device *ndev,
			       netdev_features_t features)
{
	netdev_features_t change = features ^ ndev->features;
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
	struct ndis_offload_params offloads;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	if (!(change & NETIF_F_LRO))
		goto syncvf;

	memset(&offloads, 0, sizeof(struct ndis_offload_params));

	if (features & NETIF_F_LRO) {
		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_ENABLED;
	} else {
		offloads.rsc_ip_v4 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
	}

	ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);

	if (ret) {
		features ^= NETIF_F_LRO;
		ndev->features = features;
	}

syncvf:
	if (!vf_netdev)
		return ret;

	vf_netdev->wanted_features = features;
	netdev_update_features(vf_netdev);

	return ret;
}
netvsc_get_msglevel(struct net_device
*ndev
)
1939 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
1941 return ndev_ctx
->msg_enable
;
1944 static void netvsc_set_msglevel(struct net_device
*ndev
, u32 val
)
1946 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
1948 ndev_ctx
->msg_enable
= val
;
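/* msg_enable can be inspected or changed at runtime with "ethtool eth0"
 * (shows the current message level) and "ethtool -s eth0 msglvl 0x...",
 * where eth0 is only an example interface name.
 */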
static const struct ethtool_ops ethtool_ops = {
	.get_drvinfo	= netvsc_get_drvinfo,
	.get_msglevel	= netvsc_get_msglevel,
	.set_msglevel	= netvsc_set_msglevel,
	.get_link	= ethtool_op_get_link,
	.get_ethtool_stats = netvsc_get_ethtool_stats,
	.get_sset_count = netvsc_get_sset_count,
	.get_strings	= netvsc_get_strings,
	.get_channels   = netvsc_get_channels,
	.set_channels   = netvsc_set_channels,
	.get_ts_info	= ethtool_op_get_ts_info,
	.get_rxnfc	= netvsc_get_rxnfc,
	.set_rxnfc	= netvsc_set_rxnfc,
	.get_rxfh_key_size = netvsc_get_rxfh_key_size,
	.get_rxfh_indir_size = netvsc_rss_indir_size,
	.get_rxfh	= netvsc_get_rxfh,
	.set_rxfh	= netvsc_set_rxfh,
	.get_link_ksettings = netvsc_get_link_ksettings,
	.set_link_ksettings = netvsc_set_link_ksettings,
	.get_ringparam	= netvsc_get_ringparam,
	.set_ringparam	= netvsc_set_ringparam,
};
static const struct net_device_ops device_ops = {
	.ndo_open =			netvsc_open,
	.ndo_stop =			netvsc_close,
	.ndo_start_xmit =		netvsc_start_xmit,
	.ndo_change_rx_flags =		netvsc_change_rx_flags,
	.ndo_set_rx_mode =		netvsc_set_rx_mode,
	.ndo_fix_features =		netvsc_fix_features,
	.ndo_set_features =		netvsc_set_features,
	.ndo_change_mtu =		netvsc_change_mtu,
	.ndo_validate_addr =		eth_validate_addr,
	.ndo_set_mac_address =		netvsc_set_mac_addr,
	.ndo_select_queue =		netvsc_select_queue,
	.ndo_get_stats64 =		netvsc_get_stats64,
	.ndo_bpf =			netvsc_bpf,
};
/*
 * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate link
 * down/up sequence. In case of RNDIS_STATUS_MEDIA_CONNECT when carrier is
 * present send GARP packet to network peers with netif_notify_peers().
 */
static void netvsc_link_change(struct work_struct *w)
{
	struct net_device_context *ndev_ctx =
		container_of(w, struct net_device_context, dwork.work);
	struct hv_device *device_obj = ndev_ctx->device_ctx;
	struct net_device *net = hv_get_drvdata(device_obj);
	struct netvsc_device *net_device;
	struct rndis_device *rdev;
	struct netvsc_reconfig *event = NULL;
	bool notify = false, reschedule = false;
	unsigned long flags, next_reconfig, delay;

	/* if changes are happening, comeback later */
	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
		return;
	}

	net_device = rtnl_dereference(ndev_ctx->nvdev);
	if (!net_device)
		goto out_unlock;

	rdev = net_device->extension;

	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
	if (time_is_after_jiffies(next_reconfig)) {
		/* link_watch only sends one notification with current state
		 * per second, avoid doing reconfig more frequently. Handle
		 * wrap around.
		 */
		delay = next_reconfig - jiffies;
		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
		schedule_delayed_work(&ndev_ctx->dwork, delay);
		goto out_unlock;
	}
	ndev_ctx->last_reconfig = jiffies;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	if (!list_empty(&ndev_ctx->reconfig_events)) {
		event = list_first_entry(&ndev_ctx->reconfig_events,
					 struct netvsc_reconfig, list);
		list_del(&event->list);
		reschedule = !list_empty(&ndev_ctx->reconfig_events);
	}
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	if (!event)
		goto out_unlock;

	switch (event->event) {
		/* Only the following events are possible due to the check in
		 * netvsc_linkstatus_callback()
		 */
	case RNDIS_STATUS_MEDIA_CONNECT:
		if (rdev->link_state) {
			rdev->link_state = false;
			netif_carrier_on(net);
			netvsc_tx_enable(net_device, net);
		} else {
			notify = true;
		}
		kfree(event);
		break;
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netvsc_tx_disable(net_device, net);
		}
		kfree(event);
		break;
	case RNDIS_STATUS_NETWORK_CHANGE:
		/* Only makes sense if carrier is present */
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netvsc_tx_disable(net_device, net);
			event->event = RNDIS_STATUS_MEDIA_CONNECT;
			spin_lock_irqsave(&ndev_ctx->lock, flags);
			list_add(&event->list, &ndev_ctx->reconfig_events);
			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
			reschedule = true;
		}
		break;
	}

	rtnl_unlock();

	if (notify)
		netdev_notify_peers(net);

	/* link_watch only sends one notification with current state per
	 * second, handle next reconfig event in 2 seconds.
	 */
	if (reschedule)
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);

	return;

out_unlock:
	rtnl_unlock();
}
static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct net_device *dev;

	dev = netdev_master_upper_dev_get(vf_netdev);
	if (!dev || dev->netdev_ops != &device_ops)
		return NULL;	/* not a netvsc device */

	net_device_ctx = netdev_priv(dev);
	if (!rtnl_dereference(net_device_ctx->nvdev))
		return NULL;	/* device is removed */

	return dev;
}
/* Called when VF is injecting data into network stack.
 * Change the associated network device from VF to netvsc.
 * note: already called with rcu_read_lock
 */
static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_vf_pcpu_stats *pcpu_stats
		 = this_cpu_ptr(ndev_ctx->vf_stats);

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	*pskb = skb;

	skb->dev = ndev;

	u64_stats_update_begin(&pcpu_stats->syncp);
	pcpu_stats->rx_packets++;
	pcpu_stats->rx_bytes += skb->len;
	u64_stats_update_end(&pcpu_stats->syncp);

	return RX_HANDLER_ANOTHER;
}
static int netvsc_vf_join(struct net_device *vf_netdev,
			  struct net_device *ndev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	int ret;

	ret = netdev_rx_handler_register(vf_netdev,
					 netvsc_vf_handle_frame, ndev);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not register netvsc VF receive handler (err = %d)\n",
			   ret);
		goto rx_handler_failed;
	}

	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
					   NULL, NULL, NULL);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not set master device %s (err = %d)\n",
			   ndev->name, ret);
		goto upper_link_failed;
	}

	/* set slave flag before open to prevent IPv6 addrconf */
	vf_netdev->flags |= IFF_SLAVE;

	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);

	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);

	netdev_info(vf_netdev, "joined to %s\n", ndev->name);

	return 0;

upper_link_failed:
	netdev_rx_handler_unregister(vf_netdev);
rx_handler_failed:
	return ret;
}
static void __netvsc_vf_setup(struct net_device *ndev,
			      struct net_device *vf_netdev)
{
	int ret;

	/* Align MTU of VF with master */
	ret = dev_set_mtu(vf_netdev, ndev->mtu);
	if (ret)
		netdev_warn(vf_netdev,
			    "unable to change mtu to %u\n", ndev->mtu);

	/* set multicast etc flags on VF */
	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL);

	/* sync address list from ndev to VF */
	netif_addr_lock_bh(ndev);
	dev_uc_sync(vf_netdev, ndev);
	dev_mc_sync(vf_netdev, ndev);
	netif_addr_unlock_bh(ndev);

	if (netif_running(ndev)) {
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(vf_netdev,
				    "unable to open: %d\n", ret);
	}
}
/* Setup VF as slave of the synthetic device.
 * Runs in workqueue to avoid recursion in netlink callbacks.
 */
static void netvsc_vf_setup(struct work_struct *w)
{
	struct net_device_context *ndev_ctx
		= container_of(w, struct net_device_context, vf_takeover.work);
	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
	struct net_device *vf_netdev;

	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
		return;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		__netvsc_vf_setup(ndev, vf_netdev);

	rtnl_unlock();
}
/* Find netvsc by VF serial number.
 * The PCI hyperv controller records the serial number as the slot kobj name.
 */
static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
{
	struct device *parent = vf_netdev->dev.parent;
	struct net_device_context *ndev_ctx;
	struct pci_dev *pdev;
	u32 serial;

	if (!parent || !dev_is_pci(parent))
		return NULL; /* not a PCI device */

	pdev = to_pci_dev(parent);
	if (!pdev->slot) {
		netdev_notice(vf_netdev, "no PCI slot information\n");
		return NULL;
	}

	if (kstrtou32(pci_slot_name(pdev->slot), 10, &serial)) {
		netdev_notice(vf_netdev, "Invalid vf serial:%s\n",
			      pci_slot_name(pdev->slot));
		return NULL;
	}

	list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
		if (!ndev_ctx->vf_alloc)
			continue;

		if (ndev_ctx->vf_serial == serial)
			return hv_get_drvdata(ndev_ctx->device_ctx);
	}

	netdev_notice(vf_netdev,
		      "no netdev found for vf serial:%u\n", serial);
	return NULL;
}
static int netvsc_register_vf(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;
	struct bpf_prog *prog;
	struct net_device *ndev;
	int ret;

	if (vf_netdev->addr_len != ETH_ALEN)
		return NOTIFY_DONE;

	ndev = get_netvsc_byslot(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
		return NOTIFY_DONE;

	/* if synthetic interface is a different namespace,
	 * then move the VF to that namespace; join will be
	 * done again in that context.
	 */
	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
		ret = dev_change_net_namespace(vf_netdev,
					       dev_net(ndev), "eth%d");
		if (ret)
			netdev_err(vf_netdev,
				   "could not move to same namespace as %s: %d\n",
				   ndev->name, ret);
		else
			netdev_info(vf_netdev,
				    "VF moved to namespace with: %s\n",
				    ndev->name);
		return NOTIFY_DONE;
	}

	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);

	if (netvsc_vf_join(vf_netdev, ndev) != 0)
		return NOTIFY_DONE;

	dev_hold(vf_netdev);
	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);

	vf_netdev->wanted_features = ndev->features;
	netdev_update_features(vf_netdev);

	prog = netvsc_xdp_get(netvsc_dev);
	netvsc_vf_setxdp(vf_netdev, prog);

	return NOTIFY_OK;
}
/* VF up/down change detected, schedule to change data path */
static int netvsc_vf_changed(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;
	struct net_device *ndev;
	bool vf_is_up = netif_running(vf_netdev);

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev)
		return NOTIFY_DONE;

	netvsc_switch_datapath(ndev, vf_is_up);
	netdev_info(ndev, "Data path switched %s VF: %s\n",
		    vf_is_up ? "to" : "from", vf_netdev->name);

	return NOTIFY_OK;
}
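
/* Undo netvsc_register_vf(): cancel any pending takeover work, remove the
 * VF's XDP program and rx_handler, unlink it from the synthetic device and
 * drop the reference taken at registration.
 */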
static int netvsc_unregister_vf(struct net_device *vf_netdev)
{
	struct net_device *ndev;
	struct net_device_context *net_device_ctx;

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);

	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);

	netvsc_vf_setxdp(vf_netdev, NULL);

	netdev_rx_handler_unregister(vf_netdev);
	netdev_upper_dev_unlink(vf_netdev, ndev);
	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
	dev_put(vf_netdev);

	return NOTIFY_OK;
}
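
/* VMBus probe entry point: allocate the net_device, bring up the
 * RNDIS/netvsc device and register the synthetic interface with the
 * networking core.
 */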
static int netvsc_probe(struct hv_device *dev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct net_device *net = NULL;
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info *device_info = NULL;
	struct netvsc_device *nvdev;
	int ret = -ENOMEM;

	net = alloc_etherdev_mq(sizeof(struct net_device_context),
				VRSS_CHANNEL_MAX);
	if (!net)
		goto no_net;

	netif_carrier_off(net);

	netvsc_init_settings(net);

	net_device_ctx = netdev_priv(net);
	net_device_ctx->device_ctx = dev;
	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
	if (netif_msg_probe(net_device_ctx))
		netdev_dbg(net, "netvsc msg_enable: %d\n",
			   net_device_ctx->msg_enable);

	hv_set_drvdata(dev, net);

	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);

	spin_lock_init(&net_device_ctx->lock);
	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
	INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);

	net_device_ctx->vf_stats
		= netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
	if (!net_device_ctx->vf_stats)
		goto no_stats;

	net->netdev_ops = &device_ops;
	net->ethtool_ops = &ethtool_ops;
	SET_NETDEV_DEV(net, &dev->device);

	/* We always need headroom for rndis header */
	net->needed_headroom = RNDIS_AND_PPI_SIZE;

	/* Initialize the number of queues to be 1, we may change it if more
	 * channels are offered later.
	 */
	netif_set_real_num_tx_queues(net, 1);
	netif_set_real_num_rx_queues(net, 1);

	/* Notify the netvsc driver of the new device */
	device_info = netvsc_devinfo_get(NULL);
	if (!device_info) {
		ret = -ENOMEM;
		goto devinfo_failed;
	}

	nvdev = rndis_filter_device_add(dev, device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);
		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
		goto rndis_failed;
	}

	memcpy(net->dev_addr, device_info->mac_adr, ETH_ALEN);

	/* We must get rtnl lock before scheduling nvdev->subchan_work,
	 * otherwise netvsc_subchan_work() can get rtnl lock first and wait
	 * all subchannels to show up, but that may not happen because
	 * netvsc_probe() can't get rtnl lock and as a result vmbus_onoffer()
	 * -> ... -> device_add() -> ... -> __device_attach() can't get
	 * the device lock, so all the subchannels can't be processed --
	 * finally netvsc_subchan_work() hangs forever.
	 */
	rtnl_lock();

	if (nvdev->num_chn > 1)
		schedule_work(&nvdev->subchan_work);

	/* hw_features computed in rndis_netdev_set_hwcaps() */
	net->features = net->hw_features |
		NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
		NETIF_F_HW_VLAN_CTAG_RX;
	net->vlan_features = net->features;

	/* MTU range: 68 - 1500 or 65521 */
	net->min_mtu = NETVSC_MTU_MIN;
	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
		net->max_mtu = NETVSC_MTU - ETH_HLEN;
	else
		net->max_mtu = ETH_DATA_LEN;

	nvdev->tx_disable = false;

	ret = register_netdevice(net);
	if (ret != 0) {
		pr_err("Unable to register netdev.\n");
		goto register_failed;
	}

	list_add(&net_device_ctx->list, &netvsc_dev_list);
	rtnl_unlock();

	netvsc_devinfo_put(device_info);
	return 0;

register_failed:
	rtnl_unlock();
	rndis_filter_device_remove(dev, nvdev);
rndis_failed:
	netvsc_devinfo_put(device_info);
devinfo_failed:
	free_percpu(net_device_ctx->vf_stats);
no_stats:
	hv_set_drvdata(dev, NULL);
	free_netdev(net);
no_net:
	return ret;
}
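
/* VMBus remove entry point: tear down in roughly the reverse order of
 * netvsc_probe(), removing the RNDIS device before unregistering the
 * net_device so MTU and channel changes can no longer race with teardown.
 */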
static int netvsc_remove(struct hv_device *dev)
{
	struct net_device_context *ndev_ctx;
	struct net_device *vf_netdev, *net;
	struct netvsc_device *nvdev;

	net = hv_get_drvdata(dev);
	if (net == NULL) {
		dev_err(&dev->device, "No net device to remove\n");
		return 0;
	}

	ndev_ctx = netdev_priv(net);

	cancel_delayed_work_sync(&ndev_ctx->dwork);

	rtnl_lock();
	nvdev = rtnl_dereference(ndev_ctx->nvdev);
	if (nvdev) {
		cancel_work_sync(&nvdev->subchan_work);
		netvsc_xdp_set(net, NULL, NULL, nvdev);
	}

	/*
	 * Call to the vsc driver to let it know that the device is being
	 * removed. Also blocks mtu and channel changes.
	 */
	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		netvsc_unregister_vf(vf_netdev);

	if (nvdev)
		rndis_filter_device_remove(dev, nvdev);

	unregister_netdevice(net);
	list_del(&ndev_ctx->list);

	rtnl_unlock();

	hv_set_drvdata(dev, NULL);

	free_percpu(ndev_ctx->vf_stats);
	free_netdev(net);
	return 0;
}
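
/* Hibernation: save the current device configuration and detach the
 * netvsc device so netvsc_resume() can recreate it later.
 */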
static int netvsc_suspend(struct hv_device *dev)
{
	struct net_device_context *ndev_ctx;
	struct net_device *vf_netdev, *net;
	struct netvsc_device *nvdev;
	int ret;

	net = hv_get_drvdata(dev);

	ndev_ctx = netdev_priv(net);
	cancel_delayed_work_sync(&ndev_ctx->dwork);

	rtnl_lock();

	nvdev = rtnl_dereference(ndev_ctx->nvdev);
	if (nvdev == NULL) {
		ret = -ENODEV;
		goto out;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		netvsc_unregister_vf(vf_netdev);

	/* Save the current config info */
	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);

	ret = netvsc_detach(net, nvdev);
out:
	rtnl_unlock();

	return ret;
}
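
/* Hibernation: re-attach the netvsc device using the configuration saved
 * by netvsc_suspend().
 */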
static int netvsc_resume(struct hv_device *dev)
{
	struct net_device *net = hv_get_drvdata(dev);
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info *device_info;
	int ret;

	rtnl_lock();

	net_device_ctx = netdev_priv(net);
	device_info = net_device_ctx->saved_netvsc_dev_info;

	ret = netvsc_attach(net, device_info);

	netvsc_devinfo_put(device_info);
	net_device_ctx->saved_netvsc_dev_info = NULL;

	rtnl_unlock();

	return ret;
}
static const struct hv_vmbus_device_id id_table[] = {
	/* Network guid */
	{ HV_NIC_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);
/* The one and only one */
static struct hv_driver netvsc_drv = {
	.name = KBUILD_MODNAME,
	.id_table = id_table,
	.probe = netvsc_probe,
	.remove = netvsc_remove,
	.suspend = netvsc_suspend,
	.resume = netvsc_resume,
	.driver = {
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	},
};
/*
 * On Hyper-V, every VF interface is matched with a corresponding
 * synthetic interface. The synthetic interface is presented first
 * to the guest. When the corresponding VF instance is registered,
 * we will take care of switching the data path.
 */
static int netvsc_netdev_event(struct notifier_block *this,
			       unsigned long event, void *ptr)
{
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);

	/* Skip our own events */
	if (event_dev->netdev_ops == &device_ops)
		return NOTIFY_DONE;

	/* Avoid non-Ethernet type devices */
	if (event_dev->type != ARPHRD_ETHER)
		return NOTIFY_DONE;

	/* Avoid Vlan dev with same MAC registering as VF */
	if (is_vlan_dev(event_dev))
		return NOTIFY_DONE;

	/* Avoid Bonding master dev with same MAC registering as VF */
	if ((event_dev->priv_flags & IFF_BONDING) &&
	    (event_dev->flags & IFF_MASTER))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_REGISTER:
		return netvsc_register_vf(event_dev);
	case NETDEV_UNREGISTER:
		return netvsc_unregister_vf(event_dev);
	case NETDEV_UP:
	case NETDEV_DOWN:
		return netvsc_vf_changed(event_dev);
	default:
		return NOTIFY_DONE;
	}
}
static struct notifier_block netvsc_netdev_notifier = {
	.notifier_call = netvsc_netdev_event,
};
static void __exit netvsc_drv_exit(void)
{
	unregister_netdevice_notifier(&netvsc_netdev_notifier);
	vmbus_driver_unregister(&netvsc_drv);
}
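
/* Module init: clamp ring_size to the supported minimum, register the
 * VMBus driver and install the netdev notifier used for VF pairing.
 */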
static int __init netvsc_drv_init(void)
{
	int ret;

	if (ring_size < RING_SIZE_MIN) {
		ring_size = RING_SIZE_MIN;
		pr_info("Increased ring_size to %u (min allowed)\n",
			ring_size);
	}
	netvsc_ring_bytes = ring_size * PAGE_SIZE;

	ret = vmbus_driver_register(&netvsc_drv);
	if (ret)
		return ret;

	register_netdevice_notifier(&netvsc_netdev_notifier);
	return 0;
}
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V network driver");

module_init(netvsc_drv_init);
module_exit(netvsc_drv_exit);