/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/netpoll.h>
#include <linux/reciprocal_div.h>

#include <net/arp.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>

#include "hyperv_net.h"
#define RING_SIZE_MIN	64

#define LINKCHANGE_INT	(2 * HZ)
#define VF_TAKEOVER_INT	(HZ / 10)

static unsigned int ring_size __ro_after_init = 128;
module_param(ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
unsigned int netvsc_ring_bytes __ro_after_init;
struct reciprocal_value netvsc_ring_reciprocal __ro_after_init;
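
/* Example (illustrative): with the default ring_size of 128 pages and a
 * 4 KiB PAGE_SIZE, netvsc_ring_bytes comes to 128 * 4096 = 512 KiB per ring;
 * loading with e.g. "modprobe hv_netvsc ring_size=256" (assuming the usual
 * hv_netvsc module name) doubles that. Values below RING_SIZE_MIN are raised
 * to the minimum in netvsc_drv_init().
 */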
static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
				NETIF_MSG_LINK | NETIF_MSG_IFUP |
				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
				NETIF_MSG_TX_ERR;

static int debug = -1;
module_param(debug, int, S_IRUGO);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
static void netvsc_change_rx_flags(struct net_device *net, int change)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	int inc;

	if (!vf_netdev)
		return;

	if (change & IFF_PROMISC) {
		inc = (net->flags & IFF_PROMISC) ? 1 : -1;
		dev_set_promiscuity(vf_netdev, inc);
	}

	if (change & IFF_ALLMULTI) {
		inc = (net->flags & IFF_ALLMULTI) ? 1 : -1;
		dev_set_allmulti(vf_netdev, inc);
	}
}

static void netvsc_set_rx_mode(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);

	if (vf_netdev) {
		dev_uc_sync(vf_netdev, net);
		dev_mc_sync(vf_netdev, net);
	}

	rndis_filter_update(nvdev);
}
static int netvsc_open(struct net_device *net)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndev_ctx->nvdev);
	struct rndis_device *rdev;
	int ret = 0;

	netif_carrier_off(net);

	/* Open up the device */
	ret = rndis_filter_open(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to open device (ret %d).\n", ret);
		return ret;
	}

	rdev = nvdev->extension;
	if (!rdev->link_state) {
		netif_carrier_on(net);
		netif_tx_wake_all_queues(net);
	}

	if (vf_netdev) {
		/* Setting synthetic device up transparently sets
		 * slave as up. If open fails, then slave will still
		 * be offline (and not used).
		 */
		ret = dev_open(vf_netdev);
		if (ret)
			netdev_warn(net,
				    "unable to open slave: %s: %d\n",
				    vf_netdev->name, ret);
	}
	return 0;
}
static int netvsc_close(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct net_device *vf_netdev
		= rtnl_dereference(net_device_ctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	int ret = 0;
	u32 aread, i, msec = 10, retry = 0, retry_max = 20;
	struct vmbus_channel *chn;

	netif_tx_disable(net);

	/* No need to close rndis filter if it is removed already */
	if (!nvdev)
		goto out;

	ret = rndis_filter_close(nvdev);
	if (ret != 0) {
		netdev_err(net, "unable to close device (ret %d).\n", ret);
		return ret;
	}

	/* Ensure pending bytes in ring are read */
	while (true) {
		aread = 0;
		for (i = 0; i < nvdev->num_chn; i++) {
			chn = nvdev->chan_table[i].channel;
			if (!chn)
				continue;

			aread = hv_get_bytes_to_read(&chn->inbound);
			if (aread)
				break;

			aread = hv_get_bytes_to_read(&chn->outbound);
			if (aread)
				break;
		}

		retry++;
		if (retry > retry_max || aread == 0)
			break;

		msleep(msec);

		if (msec < 1000)
			msec *= 2;
	}

	if (aread) {
		netdev_err(net, "Ring buffer not empty after closing rndis\n");
		ret = -ETIMEDOUT;
	}

out:
	if (vf_netdev)
		dev_close(vf_netdev);

	return ret;
}
static inline void *init_ppi_data(struct rndis_message *msg,
				  u32 ppi_size, u32 pkt_type)
{
	struct rndis_packet *rndis_pkt = &msg->msg.pkt;
	struct rndis_per_packet_info *ppi;

	rndis_pkt->data_offset += ppi_size;
	ppi = (void *)rndis_pkt + rndis_pkt->per_pkt_info_offset
		+ rndis_pkt->per_pkt_info_len;

	ppi->size = ppi_size;
	ppi->type = pkt_type;
	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);

	rndis_pkt->per_pkt_info_len += ppi_size;

	return ppi + 1;
}
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 */
static inline u32 netvsc_get_hash(
	struct sk_buff *skb,
	const struct net_device_context *ndc)
{
	struct flow_keys flow;
	u32 hash, pkt_proto = 0;
	static u32 hashrnd __read_mostly;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	switch (flow.basic.ip_proto) {
	case IPPROTO_TCP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_TCP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_TCP6_L4HASH;

		break;

	case IPPROTO_UDP:
		if (flow.basic.n_proto == htons(ETH_P_IP))
			pkt_proto = HV_UDP4_L4HASH;
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			pkt_proto = HV_UDP6_L4HASH;

		break;
	}

	if (pkt_proto & ndc->l4_hash) {
		return skb_get_hash(skb);
	} else {
		if (flow.basic.n_proto == htons(ETH_P_IP))
			hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
		else if (flow.basic.n_proto == htons(ETH_P_IPV6))
			hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
		else
			hash = 0;

		skb_set_hash(skb, hash, PKT_HASH_TYPE_L3);
	}

	return hash;
}
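
/* Example (illustrative): for a UDP/IPv4 skb, the L4 (4-tuple) hash from
 * skb_get_hash() is only used when HV_UDP4_L4HASH is set in ndc->l4_hash;
 * otherwise the jhash2() fallback above hashes the IP addresses only, so
 * fragments of the same flow still land on the same receive queue.
 */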
static inline int netvsc_get_tx_queue(struct net_device *ndev,
				      struct sk_buff *skb, int old_idx)
{
	const struct net_device_context *ndc = netdev_priv(ndev);
	struct sock *sk = skb->sk;
	int q_idx;

	q_idx = ndc->tx_table[netvsc_get_hash(skb, ndc) &
			      (VRSS_SEND_TAB_SIZE - 1)];

	/* If queue index changed record the new value */
	if (q_idx != old_idx &&
	    sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
		sk_tx_queue_set(sk, q_idx);

	return q_idx;
}

/*
 * Select queue for transmit.
 *
 * If a valid queue has already been assigned, then use that.
 * Otherwise compute tx queue based on hash and the send table.
 *
 * This is basically similar to default (__netdev_pick_tx) with the added step
 * of using the host send_table when no other queue has been assigned.
 *
 * TODO support XPS - but get_xps_queue not exported
 */
static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
	int q_idx = sk_tx_queue_get(skb->sk);

	if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
		/* If forwarding a packet, we use the recorded queue when
		 * available for better cache locality.
		 */
		if (skb_rx_queue_recorded(skb))
			q_idx = skb_get_rx_queue(skb);
		else
			q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
	}

	return q_idx;
}

static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
			       void *accel_priv,
			       select_queue_fallback_t fallback)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev;
	u16 txq;

	rcu_read_lock();
	vf_netdev = rcu_dereference(ndc->vf_netdev);
	if (vf_netdev) {
		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;

		if (vf_ops->ndo_select_queue)
			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
						       accel_priv, fallback);
		else
			txq = fallback(vf_netdev, skb);

		/* Record the queue selected by VF so that it can be
		 * used for common case where VF has more queues than
		 * the synthetic device.
		 */
		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
	} else {
		txq = netvsc_pick_tx(ndev, skb);
	}
	rcu_read_unlock();

	while (unlikely(txq >= ndev->real_num_tx_queues))
		txq -= ndev->real_num_tx_queues;

	return txq;
}
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
		       struct hv_page_buffer *pb)
{
	int j = 0;

	/* Deal with compound pages by ignoring unused part
	 * of the page.
	 */
	page += (offset >> PAGE_SHIFT);
	offset &= ~PAGE_MASK;

	while (len > 0) {
		unsigned long bytes;

		bytes = PAGE_SIZE - offset;
		if (bytes > len)
			bytes = len;
		pb[j].pfn = page_to_pfn(page);
		pb[j].offset = offset;
		pb[j].len = bytes;

		offset += bytes;
		len -= bytes;

		if (offset == PAGE_SIZE && len) {
			page++;
			offset = 0;
			j++;
		}
	}

	return j + 1;
}

static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
			   struct hv_netvsc_packet *packet,
			   struct hv_page_buffer *pb)
{
	u32 slots_used = 0;
	char *data = skb->data;
	int frags = skb_shinfo(skb)->nr_frags;
	int i;

	/* The packet is laid out thus:
	 * 1. hdr: RNDIS header and PPI
	 * 2. skb linear data
	 * 3. skb fragment data
	 */
	slots_used += fill_pg_buf(virt_to_page(hdr),
				  offset_in_page(hdr),
				  len, &pb[slots_used]);

	packet->rmsg_size = len;
	packet->rmsg_pgcnt = slots_used;

	slots_used += fill_pg_buf(virt_to_page(data),
				  offset_in_page(data),
				  skb_headlen(skb), &pb[slots_used]);

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;

		slots_used += fill_pg_buf(skb_frag_page(frag),
					  frag->page_offset,
					  skb_frag_size(frag), &pb[slots_used]);
	}
	return slots_used;
}
static int count_skb_frag_slots(struct sk_buff *skb)
{
	int i, frags = skb_shinfo(skb)->nr_frags;
	int pages = 0;

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
		unsigned long size = skb_frag_size(frag);
		unsigned long offset = frag->page_offset;

		/* Skip unused frames from start of page */
		offset &= ~PAGE_MASK;
		pages += PFN_UP(offset + size);
	}
	return pages;
}

static int netvsc_get_slots(struct sk_buff *skb)
{
	char *data = skb->data;
	unsigned int offset = offset_in_page(data);
	unsigned int len = skb_headlen(skb);
	int slots;
	int frag_slots;

	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	frag_slots = count_skb_frag_slots(skb);
	return slots + frag_slots;
}
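
/* Worked example (illustrative, assuming 4 KiB pages): linear data of
 * 6000 bytes starting at in-page offset 3000 needs
 * DIV_ROUND_UP(3000 + 6000, 4096) = 3 page-buffer slots, and each fragment
 * adds PFN_UP(offset + size) more, which is what netvsc_get_slots() returns.
 */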
static u32 net_checksum_info(struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *ip = ip_hdr(skb);

		if (ip->protocol == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV4_TCP;
		else if (ip->protocol == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV4_UDP;
	} else {
		struct ipv6hdr *ip6 = ipv6_hdr(skb);

		if (ip6->nexthdr == IPPROTO_TCP)
			return TRANSPORT_INFO_IPV6_TCP;
		else if (ip6->nexthdr == IPPROTO_UDP)
			return TRANSPORT_INFO_IPV6_UDP;
	}

	return TRANSPORT_INFO_NOT_IP;
}

/* Send skb on the slave VF device. */
static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
			  struct sk_buff *skb)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	unsigned int len = skb->len;
	int rc;

	skb->dev = vf_netdev;
	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;

	rc = dev_queue_xmit(skb);
	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
		struct netvsc_vf_pcpu_stats *pcpu_stats
			= this_cpu_ptr(ndev_ctx->vf_stats);

		u64_stats_update_begin(&pcpu_stats->syncp);
		pcpu_stats->tx_packets++;
		pcpu_stats->tx_bytes += len;
		u64_stats_update_end(&pcpu_stats->syncp);
	} else {
		this_cpu_inc(ndev_ctx->vf_stats->tx_dropped);
	}

	return rc;
}
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_netvsc_packet *packet = NULL;
	int ret;
	unsigned int num_data_pgs;
	struct rndis_message *rndis_msg;
	struct net_device *vf_netdev;
	u32 rndis_msg_size;
	u32 hash;
	struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];

	/* if VF is present and up then redirect packets
	 * already called with rcu_read_lock_bh
	 */
	vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev);
	if (vf_netdev && netif_running(vf_netdev) &&
	    !netpoll_tx_running(net))
		return netvsc_vf_xmit(net, vf_netdev, skb);

	/* We will need at most two pages to describe the rndis
	 * header. We can only transmit MAX_PAGE_BUFFER_COUNT number
	 * of pages in a single packet. If skb is scattered around
	 * more pages we try linearizing it.
	 */

	num_data_pgs = netvsc_get_slots(skb) + 2;

	if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
		++net_device_ctx->eth_stats.tx_scattered;

		if (skb_linearize(skb))
			goto no_memory;

		num_data_pgs = netvsc_get_slots(skb) + 2;
		if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
			++net_device_ctx->eth_stats.tx_too_big;
			goto drop;
		}
	}

	/*
	 * Place the rndis header in the skb head room and
	 * the skb->cb will be used for hv_netvsc_packet
	 * structure.
	 */
	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
	if (ret)
		goto no_memory;

	/* Use the skb control buffer for building up the packet */
	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
			FIELD_SIZEOF(struct sk_buff, cb));
	packet = (struct hv_netvsc_packet *)skb->cb;

	packet->q_idx = skb_get_queue_mapping(skb);

	packet->total_data_buflen = skb->len;
	packet->total_bytes = skb->len;
	packet->total_packets = 1;

	rndis_msg = (struct rndis_message *)skb->head;

	/* Add the rndis header */
	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
	rndis_msg->msg_len = packet->total_data_buflen;

	rndis_msg->msg.pkt = (struct rndis_packet) {
		.data_offset = sizeof(struct rndis_packet),
		.data_len = packet->total_data_buflen,
		.per_pkt_info_offset = sizeof(struct rndis_packet),
	};

	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);

	hash = skb_get_hash_raw(skb);
	if (hash != 0 && net->real_num_tx_queues > 1) {
		u32 *hash_info;

		rndis_msg_size += NDIS_HASH_PPI_SIZE;
		hash_info = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
					  NBL_HASH_VALUE);
		*hash_info = hash;
	}

	if (skb_vlan_tag_present(skb)) {
		struct ndis_pkt_8021q_info *vlan;

		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
		vlan = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
				     IEEE_8021Q_INFO);

		vlan->value = 0;
		vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
		vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
				VLAN_PRIO_SHIFT;
	}

	if (skb_is_gso(skb)) {
		struct ndis_tcp_lso_info *lso_info;

		rndis_msg_size += NDIS_LSO_PPI_SIZE;
		lso_info = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
					 TCP_LARGESEND_PKTINFO);

		lso_info->value = 0;
		lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
		if (skb->protocol == htons(ETH_P_IP)) {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
			ip_hdr(skb)->tot_len = 0;
			ip_hdr(skb)->check = 0;
			tcp_hdr(skb)->check =
				~csum_tcpudp_magic(ip_hdr(skb)->saddr,
						   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
		} else {
			lso_info->lso_v2_transmit.ip_version =
				NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
			ipv6_hdr(skb)->payload_len = 0;
			tcp_hdr(skb)->check =
				~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
						 &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
		}
		lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
		lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
			struct ndis_tcp_ip_checksum_info *csum_info;

			rndis_msg_size += NDIS_CSUM_PPI_SIZE;
			csum_info = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
						  TCPIP_CHKSUM_PKTINFO);

			csum_info->value = 0;
			csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);

			if (skb->protocol == htons(ETH_P_IP)) {
				csum_info->transmit.is_ipv4 = 1;

				if (ip_hdr(skb)->protocol == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			} else {
				csum_info->transmit.is_ipv6 = 1;

				if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
					csum_info->transmit.tcp_checksum = 1;
				else
					csum_info->transmit.udp_checksum = 1;
			}
		} else {
			/* Can't do offload of this type of checksum */
			if (skb_checksum_help(skb))
				goto drop;
		}
	}

	/* Start filling in the page buffers with the rndis hdr */
	rndis_msg->msg_len += rndis_msg_size;
	packet->total_data_buflen = rndis_msg->msg_len;
	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
					       skb, packet, pb);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	ret = netvsc_send(net, packet, rndis_msg, pb, skb);
	if (likely(ret == 0))
		return NETDEV_TX_OK;

	if (ret == -EAGAIN) {
		++net_device_ctx->eth_stats.tx_busy;
		return NETDEV_TX_BUSY;
	}

	if (ret == -ENOSPC)
		++net_device_ctx->eth_stats.tx_no_space;

drop:
	dev_kfree_skb_any(skb);
	net->stats.tx_dropped++;

	return NETDEV_TX_OK;

no_memory:
	++net_device_ctx->eth_stats.tx_no_memory;
	goto drop;
}
/*
 * netvsc_linkstatus_callback - Link up/down notification
 */
void netvsc_linkstatus_callback(struct net_device *net,
				struct rndis_message *resp)
{
	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_reconfig *event;
	unsigned long flags;

	/* Update the physical link speed when changing to another vSwitch */
	if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
		u32 speed;

		speed = *(u32 *)((void *)indicate
				 + indicate->status_buf_offset) / 10000;
		ndev_ctx->speed = speed;
		return;
	}

	/* Handle these link change statuses below */
	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
		return;

	if (net->reg_state != NETREG_REGISTERED)
		return;

	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event)
		return;
	event->event = indicate->status;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	schedule_delayed_work(&ndev_ctx->dwork, 0);
}
static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
					     struct napi_struct *napi,
					     const struct ndis_tcp_ip_checksum_info *csum_info,
					     const struct ndis_pkt_8021q_info *vlan,
					     void *data, u32 buflen)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, buflen);
	if (!skb)
		return skb;

	/*
	 * Copy to skb. This copy is needed here since the memory pointed by
	 * hv_netvsc_packet cannot be deallocated
	 */
	skb_put_data(skb, data, buflen);

	skb->protocol = eth_type_trans(skb, net);

	/* skb is already created with CHECKSUM_NONE */
	skb_checksum_none_assert(skb);

	/*
	 * In Linux, the IP checksum is always checked.
	 * Do L4 checksum offload if enabled and present.
	 */
	if (csum_info && (net->features & NETIF_F_RXCSUM)) {
		if (csum_info->receive.tcp_checksum_succeeded ||
		    csum_info->receive.udp_checksum_succeeded)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
	}

	if (vlan) {
		u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT);

		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       vlan_tci);
	}

	return skb;
}
/*
 * netvsc_recv_callback -  Callback when we receive a packet from the
 * "wire" on the specified device.
 */
int netvsc_recv_callback(struct net_device *net,
			 struct netvsc_device *net_device,
			 struct vmbus_channel *channel,
			 void *data, u32 len,
			 const struct ndis_tcp_ip_checksum_info *csum_info,
			 const struct ndis_pkt_8021q_info *vlan)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
	struct sk_buff *skb;
	struct netvsc_stats *rx_stats;

	if (net->reg_state != NETREG_REGISTERED)
		return NVSP_STAT_FAIL;

	/* Allocate a skb - TODO direct I/O to pages? */
	skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
				    csum_info, vlan, data, len);
	if (unlikely(!skb)) {
		++net_device_ctx->eth_stats.rx_no_memory;
		return NVSP_STAT_FAIL;
	}

	skb_record_rx_queue(skb, q_idx);

	/*
	 * Even if injecting the packet, record the statistics
	 * on the synthetic device because modifying the VF device
	 * statistics will not work correctly.
	 */
	rx_stats = &nvchan->rx_stats;
	u64_stats_update_begin(&rx_stats->syncp);
	rx_stats->packets++;
	rx_stats->bytes += len;

	if (skb->pkt_type == PACKET_BROADCAST)
		++rx_stats->broadcast;
	else if (skb->pkt_type == PACKET_MULTICAST)
		++rx_stats->multicast;
	u64_stats_update_end(&rx_stats->syncp);

	napi_gro_receive(&nvchan->napi, skb);
	return NVSP_STAT_SUCCESS;
}
static void netvsc_get_drvinfo(struct net_device *net,
			       struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}

static void netvsc_get_channels(struct net_device *net,
				struct ethtool_channels *channel)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);

	if (nvdev) {
		channel->max_combined	= nvdev->max_chn;
		channel->combined_count = nvdev->num_chn;
	}
}

static int netvsc_set_channels(struct net_device *net,
			       struct ethtool_channels *channels)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_device *dev = net_device_ctx->device_ctx;
	struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
	unsigned int orig, count = channels->combined_count;
	struct netvsc_device_info device_info;
	bool was_opened;
	int ret = 0;

	/* We do not support separate count for rx, tx, or other */
	if (count == 0 ||
	    channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
		return -EINVAL;

	if (count > nvdev->max_chn)
		return -EINVAL;

	orig = nvdev->num_chn;
	was_opened = rndis_filter_opened(nvdev);
	if (was_opened)
		rndis_filter_close(nvdev);

	memset(&device_info, 0, sizeof(device_info));
	device_info.num_chn = count;
	device_info.send_sections = nvdev->send_section_cnt;
	device_info.send_section_size = nvdev->send_section_size;
	device_info.recv_sections = nvdev->recv_section_cnt;
	device_info.recv_section_size = nvdev->recv_section_size;

	rndis_filter_device_remove(dev, nvdev);

	nvdev = rndis_filter_device_add(dev, &device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);
		device_info.num_chn = orig;
		nvdev = rndis_filter_device_add(dev, &device_info);

		if (IS_ERR(nvdev)) {
			netdev_err(net, "restoring channel setting failed: %ld\n",
				   PTR_ERR(nvdev));
			return ret;
		}
	}

	if (was_opened)
		rndis_filter_open(nvdev);

	/* We may have missed link change notifications */
	net_device_ctx->last_reconfig = 0;
	schedule_delayed_work(&net_device_ctx->dwork, 0);

	return ret;
}
static bool
netvsc_validate_ethtool_ss_cmd(const struct ethtool_link_ksettings *cmd)
{
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};

	diff1.base.speed = 0;
	diff1.base.duplex = 0;
	/* advertising and cmd are usually set */
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.cmd = 0;
	/* We set port to PORT_OTHER */
	diff2.base.port = PORT_OTHER;

	return !memcmp(&diff1, &diff2, sizeof(diff1));
}

static void netvsc_init_settings(struct net_device *dev)
{
	struct net_device_context *ndc = netdev_priv(dev);

	ndc->l4_hash = HV_DEFAULT_L4HASH;

	ndc->speed = SPEED_UNKNOWN;
	ndc->duplex = DUPLEX_FULL;
}

static int netvsc_get_link_ksettings(struct net_device *dev,
				     struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);

	cmd->base.speed = ndc->speed;
	cmd->base.duplex = ndc->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static int netvsc_set_link_ksettings(struct net_device *dev,
				     const struct ethtool_link_ksettings *cmd)
{
	struct net_device_context *ndc = netdev_priv(dev);
	u32 speed;

	speed = cmd->base.speed;
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->base.duplex) ||
	    !netvsc_validate_ethtool_ss_cmd(cmd))
		return -EINVAL;

	ndc->speed = speed;
	ndc->duplex = cmd->base.duplex;

	return 0;
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct hv_device *hdev = ndevctx->device_ctx;
	int orig_mtu = ndev->mtu;
	struct netvsc_device_info device_info;
	bool was_opened;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	/* Change MTU of underlying VF netdev first. */
	if (vf_netdev) {
		ret = dev_set_mtu(vf_netdev, mtu);
		if (ret)
			return ret;
	}

	netif_device_detach(ndev);
	was_opened = rndis_filter_opened(nvdev);
	if (was_opened)
		rndis_filter_close(nvdev);

	memset(&device_info, 0, sizeof(device_info));
	device_info.num_chn = nvdev->num_chn;
	device_info.send_sections = nvdev->send_section_cnt;
	device_info.send_section_size = nvdev->send_section_size;
	device_info.recv_sections = nvdev->recv_section_cnt;
	device_info.recv_section_size = nvdev->recv_section_size;

	rndis_filter_device_remove(hdev, nvdev);

	ndev->mtu = mtu;

	nvdev = rndis_filter_device_add(hdev, &device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);

		/* Attempt rollback to original MTU */
		ndev->mtu = orig_mtu;
		nvdev = rndis_filter_device_add(hdev, &device_info);

		if (vf_netdev)
			dev_set_mtu(vf_netdev, orig_mtu);

		if (IS_ERR(nvdev)) {
			netdev_err(ndev, "restoring mtu failed: %ld\n",
				   PTR_ERR(nvdev));
			return ret;
		}
	}

	if (was_opened)
		rndis_filter_open(nvdev);

	netif_device_attach(ndev);

	/* We may have missed link change notifications */
	schedule_delayed_work(&ndevctx->dwork, 0);

	return ret;
}
static void netvsc_get_vf_stats(struct net_device *net,
				struct netvsc_vf_pcpu_stats *tot)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	int i;

	memset(tot, 0, sizeof(*tot));

	for_each_possible_cpu(i) {
		const struct netvsc_vf_pcpu_stats *stats
			= per_cpu_ptr(ndev_ctx->vf_stats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			rx_packets = stats->rx_packets;
			tx_packets = stats->tx_packets;
			rx_bytes = stats->rx_bytes;
			tx_bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
		tot->tx_dropped += stats->tx_dropped;
	}
}

static void netvsc_get_stats64(struct net_device *net,
			       struct rtnl_link_stats64 *t)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
	struct netvsc_vf_pcpu_stats vf_tot;
	int i;

	if (!nvdev)
		return;

	netdev_stats_to_stats64(t, &net->stats);

	netvsc_get_vf_stats(net, &vf_tot);
	t->rx_packets += vf_tot.rx_packets;
	t->tx_packets += vf_tot.tx_packets;
	t->rx_bytes   += vf_tot.rx_bytes;
	t->tx_bytes   += vf_tot.tx_bytes;
	t->tx_dropped += vf_tot.tx_dropped;

	for (i = 0; i < nvdev->num_chn; i++) {
		const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
		const struct netvsc_stats *stats;
		u64 packets, bytes, multicast;
		unsigned int start;

		stats = &nvchan->tx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->tx_bytes	+= bytes;
		t->tx_packets	+= packets;

		stats = &nvchan->rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
			multicast = stats->multicast + stats->broadcast;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		t->rx_bytes	+= bytes;
		t->rx_packets	+= packets;
		t->multicast	+= multicast;
	}
}
static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
	struct net_device_context *ndc = netdev_priv(ndev);
	struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	struct sockaddr *addr = p;
	int err;

	err = eth_prepare_mac_addr_change(ndev, p);
	if (err)
		return err;

	if (!nvdev)
		return -ENODEV;

	if (vf_netdev) {
		err = dev_set_mac_address(vf_netdev, addr);
		if (err)
			return err;
	}

	err = rndis_filter_set_device_mac(nvdev, addr->sa_data);
	if (!err) {
		eth_commit_mac_addr_change(ndev, p);
	} else if (vf_netdev) {
		/* rollback change on VF */
		memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN);
		dev_set_mac_address(vf_netdev, addr);
	}

	return err;
}
static const struct {
	char name[ETH_GSTRING_LEN];
	u16 offset;
} netvsc_stats[] = {
	{ "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
	{ "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
	{ "tx_no_space",  offsetof(struct netvsc_ethtool_stats, tx_no_space) },
	{ "tx_too_big",	  offsetof(struct netvsc_ethtool_stats, tx_too_big) },
	{ "tx_busy",	  offsetof(struct netvsc_ethtool_stats, tx_busy) },
	{ "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
	{ "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
	{ "rx_no_memory", offsetof(struct netvsc_ethtool_stats, rx_no_memory) },
	{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
	{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
}, vf_stats[] = {
	{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
	{ "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
	{ "vf_tx_packets", offsetof(struct netvsc_vf_pcpu_stats, tx_packets) },
	{ "vf_tx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, tx_bytes) },
	{ "vf_tx_dropped", offsetof(struct netvsc_vf_pcpu_stats, tx_dropped) },
};

#define NETVSC_GLOBAL_STATS_LEN	ARRAY_SIZE(netvsc_stats)
#define NETVSC_VF_STATS_LEN	ARRAY_SIZE(vf_stats)

/* 4 statistics per queue (rx/tx packets/bytes) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)
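
/* Example (illustrative): with 8 channels, ethtool -S would report
 * NETVSC_GLOBAL_STATS_LEN (10) + NETVSC_VF_STATS_LEN (5) + 8 * 4 = 47
 * counters, matching what netvsc_get_sset_count() below returns.
 */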
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (string_set) {
	case ETH_SS_STATS:
		return NETVSC_GLOBAL_STATS_LEN
			+ NETVSC_VF_STATS_LEN
			+ NETVSC_QUEUE_STATS_LEN(nvdev);
	default:
		return -EINVAL;
	}
}

static void netvsc_get_ethtool_stats(struct net_device *dev,
				     struct ethtool_stats *stats, u64 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	const void *nds = &ndc->eth_stats;
	const struct netvsc_stats *qstats;
	struct netvsc_vf_pcpu_stats sum;
	unsigned int start;
	u64 packets, bytes;
	int i, j;

	if (!nvdev)
		return;

	for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
		data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);

	netvsc_get_vf_stats(dev, &sum);
	for (j = 0; j < NETVSC_VF_STATS_LEN; j++)
		data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);

	for (j = 0; j < nvdev->num_chn; j++) {
		qstats = &nvdev->chan_table[j].tx_stats;

		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;

		qstats = &nvdev->chan_table[j].rx_stats;
		do {
			start = u64_stats_fetch_begin_irq(&qstats->syncp);
			packets = qstats->packets;
			bytes = qstats->bytes;
		} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
		data[i++] = packets;
		data[i++] = bytes;
	}
}
static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
	u8 *p = data;
	int i;

	if (!nvdev)
		return;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++) {
			memcpy(p, netvsc_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < ARRAY_SIZE(vf_stats); i++) {
			memcpy(p, vf_stats[i].name, ETH_GSTRING_LEN);
			p += ETH_GSTRING_LEN;
		}

		for (i = 0; i < nvdev->num_chn; i++) {
			sprintf(p, "tx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "tx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_packets", i);
			p += ETH_GSTRING_LEN;
			sprintf(p, "rx_queue_%u_bytes", i);
			p += ETH_GSTRING_LEN;
		}

		break;
	}
}
static int
netvsc_get_rss_hash_opts(struct net_device_context *ndc,
			 struct ethtool_rxnfc *info)
{
	const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;

	info->data = RXH_IP_SRC | RXH_IP_DST;

	switch (info->flow_type) {
	case TCP_V4_FLOW:
		if (ndc->l4_hash & HV_TCP4_L4HASH)
			info->data |= l4_flag;

		break;

	case TCP_V6_FLOW:
		if (ndc->l4_hash & HV_TCP6_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V4_FLOW:
		if (ndc->l4_hash & HV_UDP4_L4HASH)
			info->data |= l4_flag;

		break;

	case UDP_V6_FLOW:
		if (ndc->l4_hash & HV_UDP6_L4HASH)
			info->data |= l4_flag;

		break;

	case IPV4_FLOW:
	case IPV6_FLOW:
		break;
	default:
		info->data = 0;
		break;
	}

	return 0;
}

static int
netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
		 u32 *rules)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);

	if (!nvdev)
		return -ENODEV;

	switch (info->cmd) {
	case ETHTOOL_GRXRINGS:
		info->data = nvdev->num_chn;
		return 0;

	case ETHTOOL_GRXFH:
		return netvsc_get_rss_hash_opts(ndc, info);
	}
	return -EOPNOTSUPP;
}

static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
				    struct ethtool_rxnfc *info)
{
	if (info->data == (RXH_IP_SRC | RXH_IP_DST |
			   RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash |= HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash |= HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash |= HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash |= HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
		switch (info->flow_type) {
		case TCP_V4_FLOW:
			ndc->l4_hash &= ~HV_TCP4_L4HASH;
			break;

		case TCP_V6_FLOW:
			ndc->l4_hash &= ~HV_TCP6_L4HASH;
			break;

		case UDP_V4_FLOW:
			ndc->l4_hash &= ~HV_UDP4_L4HASH;
			break;

		case UDP_V6_FLOW:
			ndc->l4_hash &= ~HV_UDP6_L4HASH;
			break;

		default:
			return -EOPNOTSUPP;
		}

		return 0;
	}

	return -EOPNOTSUPP;
}

static int
netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info)
{
	struct net_device_context *ndc = netdev_priv(ndev);

	if (info->cmd == ETHTOOL_SRXFH)
		return netvsc_set_rss_hash_opts(ndc, info);

	return -EOPNOTSUPP;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void netvsc_poll_controller(struct net_device *dev)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev;
	int i;

	rcu_read_lock();
	ndev = rcu_dereference(ndc->nvdev);
	if (ndev) {
		for (i = 0; i < ndev->num_chn; i++) {
			struct netvsc_channel *nvchan = &ndev->chan_table[i];

			napi_schedule(&nvchan->napi);
		}
	}
	rcu_read_unlock();
}
#endif

static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
{
	return NETVSC_HASH_KEYLEN;
}

static u32 netvsc_rss_indir_size(struct net_device *dev)
{
	return ITAB_NUM;
}
static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
			   u8 *hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;	/* Toeplitz */

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			indir[i] = rndis_dev->rx_table[i];
	}

	if (key)
		memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);

	return 0;
}

static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
			   const u8 *key, const u8 hfunc)
{
	struct net_device_context *ndc = netdev_priv(dev);
	struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
	struct rndis_device *rndis_dev;
	int i;

	if (!ndev)
		return -ENODEV;

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;

	rndis_dev = ndev->extension;
	if (indir) {
		for (i = 0; i < ITAB_NUM; i++)
			if (indir[i] >= ndev->num_chn)
				return -EINVAL;

		for (i = 0; i < ITAB_NUM; i++)
			rndis_dev->rx_table[i] = indir[i];
	}

	if (!key) {
		if (!indir)
			return 0;

		key = rndis_dev->rss_key;
	}

	return rndis_filter_set_rss_param(rndis_dev, key);
}
/* Hyper-V RNDIS protocol does not have ring in the HW sense.
 * It does have pre-allocated receive area which is divided into sections.
 */
static void __netvsc_get_ringparam(struct netvsc_device *nvdev,
				   struct ethtool_ringparam *ring)
{
	u32 max_buf_size;

	ring->rx_pending = nvdev->recv_section_cnt;
	ring->tx_pending = nvdev->send_section_cnt;

	if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY;
	else
		max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE;

	ring->rx_max_pending = max_buf_size / nvdev->recv_section_size;
	ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE
		/ nvdev->send_section_size;
}
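
/* Example (illustrative): "ethtool -g/-G" values here are receive/send
 * buffer sections rather than hardware descriptors; the maxima are simply
 * the buffer size divided by the section size, as computed above.
 */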
static void netvsc_get_ringparam(struct net_device *ndev,
				 struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);

	if (!nvdev)
		return;

	__netvsc_get_ringparam(nvdev, ring);
}

static int netvsc_set_ringparam(struct net_device *ndev,
				struct ethtool_ringparam *ring)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
	struct hv_device *hdev = ndevctx->device_ctx;
	struct netvsc_device_info device_info;
	struct ethtool_ringparam orig;
	u32 new_tx, new_rx;
	bool was_opened;
	int ret = 0;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	memset(&orig, 0, sizeof(orig));
	__netvsc_get_ringparam(nvdev, &orig);

	new_tx = clamp_t(u32, ring->tx_pending,
			 NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending);
	new_rx = clamp_t(u32, ring->rx_pending,
			 NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending);

	if (new_tx == orig.tx_pending &&
	    new_rx == orig.rx_pending)
		return 0;	 /* no change */

	memset(&device_info, 0, sizeof(device_info));
	device_info.num_chn = nvdev->num_chn;
	device_info.send_sections = new_tx;
	device_info.send_section_size = nvdev->send_section_size;
	device_info.recv_sections = new_rx;
	device_info.recv_section_size = nvdev->recv_section_size;

	netif_device_detach(ndev);
	was_opened = rndis_filter_opened(nvdev);
	if (was_opened)
		rndis_filter_close(nvdev);

	rndis_filter_device_remove(hdev, nvdev);

	nvdev = rndis_filter_device_add(hdev, &device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);

		device_info.send_sections = orig.tx_pending;
		device_info.recv_sections = orig.rx_pending;
		nvdev = rndis_filter_device_add(hdev, &device_info);
		if (IS_ERR(nvdev)) {
			netdev_err(ndev, "restoring ringparam failed: %ld\n",
				   PTR_ERR(nvdev));
			return ret;
		}
	}

	if (was_opened)
		rndis_filter_open(nvdev);
	netif_device_attach(ndev);

	/* We may have missed link change notifications */
	ndevctx->last_reconfig = 0;
	schedule_delayed_work(&ndevctx->dwork, 0);

	return ret;
}
static const struct ethtool_ops ethtool_ops = {
	.get_drvinfo	= netvsc_get_drvinfo,
	.get_link	= ethtool_op_get_link,
	.get_ethtool_stats = netvsc_get_ethtool_stats,
	.get_sset_count = netvsc_get_sset_count,
	.get_strings	= netvsc_get_strings,
	.get_channels   = netvsc_get_channels,
	.set_channels   = netvsc_set_channels,
	.get_ts_info	= ethtool_op_get_ts_info,
	.get_rxnfc	= netvsc_get_rxnfc,
	.set_rxnfc	= netvsc_set_rxnfc,
	.get_rxfh_key_size = netvsc_get_rxfh_key_size,
	.get_rxfh_indir_size = netvsc_rss_indir_size,
	.get_rxfh	= netvsc_get_rxfh,
	.set_rxfh	= netvsc_set_rxfh,
	.get_link_ksettings = netvsc_get_link_ksettings,
	.set_link_ksettings = netvsc_set_link_ksettings,
	.get_ringparam	= netvsc_get_ringparam,
	.set_ringparam	= netvsc_set_ringparam,
};

static const struct net_device_ops device_ops = {
	.ndo_open =			netvsc_open,
	.ndo_stop =			netvsc_close,
	.ndo_start_xmit =		netvsc_start_xmit,
	.ndo_change_rx_flags =		netvsc_change_rx_flags,
	.ndo_set_rx_mode =		netvsc_set_rx_mode,
	.ndo_change_mtu =		netvsc_change_mtu,
	.ndo_validate_addr =		eth_validate_addr,
	.ndo_set_mac_address =		netvsc_set_mac_addr,
	.ndo_select_queue =		netvsc_select_queue,
	.ndo_get_stats64 =		netvsc_get_stats64,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller =		netvsc_poll_controller,
#endif
};
/*
 * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate link
 * down/up sequence. In case of RNDIS_STATUS_MEDIA_CONNECT when carrier is
 * present send GARP packet to network peers with netif_notify_peers().
 */
static void netvsc_link_change(struct work_struct *w)
{
	struct net_device_context *ndev_ctx =
		container_of(w, struct net_device_context, dwork.work);
	struct hv_device *device_obj = ndev_ctx->device_ctx;
	struct net_device *net = hv_get_drvdata(device_obj);
	struct netvsc_device *net_device;
	struct rndis_device *rdev;
	struct netvsc_reconfig *event = NULL;
	bool notify = false, reschedule = false;
	unsigned long flags, next_reconfig, delay;

	/* if changes are happening, come back later */
	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
		return;
	}

	net_device = rtnl_dereference(ndev_ctx->nvdev);
	if (!net_device)
		goto out_unlock;

	rdev = net_device->extension;

	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
	if (time_is_after_jiffies(next_reconfig)) {
		/* link_watch only sends one notification with current state
		 * per second, avoid doing reconfig more frequently. Handle
		 * wrap around.
		 */
		delay = next_reconfig - jiffies;
		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
		schedule_delayed_work(&ndev_ctx->dwork, delay);
		goto out_unlock;
	}
	ndev_ctx->last_reconfig = jiffies;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	if (!list_empty(&ndev_ctx->reconfig_events)) {
		event = list_first_entry(&ndev_ctx->reconfig_events,
					 struct netvsc_reconfig, list);
		list_del(&event->list);
		reschedule = !list_empty(&ndev_ctx->reconfig_events);
	}
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	if (!event)
		goto out_unlock;

	switch (event->event) {
		/* Only the following events are possible due to the check in
		 * netvsc_linkstatus_callback()
		 */
	case RNDIS_STATUS_MEDIA_CONNECT:
		if (rdev->link_state) {
			rdev->link_state = false;
			netif_carrier_on(net);
			netif_tx_wake_all_queues(net);
		} else {
			notify = true;
		}
		kfree(event);
		break;
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netif_tx_stop_all_queues(net);
		}
		kfree(event);
		break;
	case RNDIS_STATUS_NETWORK_CHANGE:
		/* Only makes sense if carrier is present */
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netif_tx_stop_all_queues(net);
			event->event = RNDIS_STATUS_MEDIA_CONNECT;
			spin_lock_irqsave(&ndev_ctx->lock, flags);
			list_add(&event->list, &ndev_ctx->reconfig_events);
			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
			reschedule = true;
		}
		break;
	}

	rtnl_unlock();

	if (notify)
		netdev_notify_peers(net);

	/* link_watch only sends one notification with current state per
	 * second, handle next reconfig event in 2 seconds.
	 */
	if (reschedule)
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);

	return;

out_unlock:
	rtnl_unlock();
}
static struct net_device *get_netvsc_bymac(const u8 *mac)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(&init_net, dev) {
		if (dev->netdev_ops != &device_ops)
			continue;	/* not a netvsc device */

		if (ether_addr_equal(mac, dev->perm_addr))
			return dev;
	}

	return NULL;
}

static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(&init_net, dev) {
		struct net_device_context *net_device_ctx;

		if (dev->netdev_ops != &device_ops)
			continue;	/* not a netvsc device */

		net_device_ctx = netdev_priv(dev);
		if (!rtnl_dereference(net_device_ctx->nvdev))
			continue;	/* device is removed */

		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
			return dev;	/* a match */
	}

	return NULL;
}

/* Called when VF is injecting data into network stack.
 * Change the associated network device from VF to netvsc.
 * note: already called with rcu_read_lock
 */
static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_vf_pcpu_stats *pcpu_stats
		= this_cpu_ptr(ndev_ctx->vf_stats);

	skb->dev = ndev;

	u64_stats_update_begin(&pcpu_stats->syncp);
	pcpu_stats->rx_packets++;
	pcpu_stats->rx_bytes += skb->len;
	u64_stats_update_end(&pcpu_stats->syncp);

	return RX_HANDLER_ANOTHER;
}
static int netvsc_vf_join(struct net_device *vf_netdev,
			  struct net_device *ndev)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	int ret;

	ret = netdev_rx_handler_register(vf_netdev,
					 netvsc_vf_handle_frame, ndev);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not register netvsc VF receive handler (err = %d)\n",
			   ret);
		goto rx_handler_failed;
	}

	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
	if (ret != 0) {
		netdev_err(vf_netdev,
			   "can not set master device %s (err = %d)\n",
			   ndev->name, ret);
		goto upper_link_failed;
	}

	/* set slave flag before open to prevent IPv6 addrconf */
	vf_netdev->flags |= IFF_SLAVE;

	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);

	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);

	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
	return 0;

upper_link_failed:
	netdev_rx_handler_unregister(vf_netdev);
rx_handler_failed:
	return ret;
}

static void __netvsc_vf_setup(struct net_device *ndev,
			      struct net_device *vf_netdev)
{
	int ret;

	/* Align MTU of VF with master */
	ret = dev_set_mtu(vf_netdev, ndev->mtu);
	if (ret)
		netdev_warn(vf_netdev,
			    "unable to change mtu to %u\n", ndev->mtu);

	/* set multicast etc flags on VF */
	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE);
	dev_uc_sync(vf_netdev, ndev);
	dev_mc_sync(vf_netdev, ndev);

	if (netif_running(ndev)) {
		ret = dev_open(vf_netdev);
		if (ret)
			netdev_warn(vf_netdev,
				    "unable to open: %d\n", ret);
	}
}

/* Setup VF as slave of the synthetic device.
 * Runs in workqueue to avoid recursion in netlink callbacks.
 */
static void netvsc_vf_setup(struct work_struct *w)
{
	struct net_device_context *ndev_ctx
		= container_of(w, struct net_device_context, vf_takeover.work);
	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
	struct net_device *vf_netdev;

	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
		return;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		__netvsc_vf_setup(ndev, vf_netdev);

	rtnl_unlock();
}
static int netvsc_register_vf(struct net_device *vf_netdev)
{
	struct net_device *ndev;
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;

	if (vf_netdev->addr_len != ETH_ALEN)
		return NOTIFY_DONE;

	/*
	 * We will use the MAC address to locate the synthetic interface to
	 * associate with the VF interface. If we don't find a matching
	 * synthetic interface, move on.
	 */
	ndev = get_netvsc_bymac(vf_netdev->perm_addr);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
		return NOTIFY_DONE;

	if (netvsc_vf_join(vf_netdev, ndev) != 0)
		return NOTIFY_DONE;

	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);

	dev_hold(vf_netdev);
	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
	return NOTIFY_OK;
}

/* VF up/down change detected, schedule to change data path */
static int netvsc_vf_changed(struct net_device *vf_netdev)
{
	struct net_device_context *net_device_ctx;
	struct netvsc_device *netvsc_dev;
	struct net_device *ndev;
	bool vf_is_up = netif_running(vf_netdev);

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
	if (!netvsc_dev)
		return NOTIFY_DONE;

	netvsc_switch_datapath(ndev, vf_is_up);
	netdev_info(ndev, "Data path switched %s VF: %s\n",
		    vf_is_up ? "to" : "from", vf_netdev->name);

	return NOTIFY_OK;
}

static int netvsc_unregister_vf(struct net_device *vf_netdev)
{
	struct net_device *ndev;
	struct net_device_context *net_device_ctx;

	ndev = get_netvsc_byref(vf_netdev);
	if (!ndev)
		return NOTIFY_DONE;

	net_device_ctx = netdev_priv(ndev);
	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);

	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);

	netdev_rx_handler_unregister(vf_netdev);
	netdev_upper_dev_unlink(vf_netdev, ndev);
	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
	dev_put(vf_netdev);

	return NOTIFY_OK;
}
static int netvsc_probe(struct hv_device *dev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct net_device *net = NULL;
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info device_info;
	struct netvsc_device *nvdev;
	int ret = -ENOMEM;

	net = alloc_etherdev_mq(sizeof(struct net_device_context),
				VRSS_CHANNEL_MAX);
	if (!net)
		goto no_net;

	netif_carrier_off(net);

	netvsc_init_settings(net);

	net_device_ctx = netdev_priv(net);
	net_device_ctx->device_ctx = dev;
	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
	if (netif_msg_probe(net_device_ctx))
		netdev_dbg(net, "netvsc msg_enable: %d\n",
			   net_device_ctx->msg_enable);

	hv_set_drvdata(dev, net);

	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);

	spin_lock_init(&net_device_ctx->lock);
	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
	INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);

	net_device_ctx->vf_stats
		= netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats);
	if (!net_device_ctx->vf_stats)
		goto no_stats;

	net->netdev_ops = &device_ops;
	net->ethtool_ops = &ethtool_ops;
	SET_NETDEV_DEV(net, &dev->device);

	/* We always need headroom for rndis header */
	net->needed_headroom = RNDIS_AND_PPI_SIZE;

	/* Initialize the number of queues to be 1, we may change it if more
	 * channels are offered later.
	 */
	netif_set_real_num_tx_queues(net, 1);
	netif_set_real_num_rx_queues(net, 1);

	/* Notify the netvsc driver of the new device */
	memset(&device_info, 0, sizeof(device_info));
	device_info.num_chn = VRSS_CHANNEL_DEFAULT;
	device_info.send_sections = NETVSC_DEFAULT_TX;
	device_info.send_section_size = NETVSC_SEND_SECTION_SIZE;
	device_info.recv_sections = NETVSC_DEFAULT_RX;
	device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE;

	nvdev = rndis_filter_device_add(dev, &device_info);
	if (IS_ERR(nvdev)) {
		ret = PTR_ERR(nvdev);
		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
		goto rndis_failed;
	}

	memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);

	/* hw_features computed in rndis_netdev_set_hwcaps() */
	net->features = net->hw_features |
		NETIF_F_HIGHDMA | NETIF_F_SG |
		NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
	net->vlan_features = net->features;

	netdev_lockdep_set_classes(net);

	/* MTU range: 68 - 1500 or 65521 */
	net->min_mtu = NETVSC_MTU_MIN;
	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
		net->max_mtu = NETVSC_MTU - ETH_HLEN;
	else
		net->max_mtu = ETH_DATA_LEN;

	ret = register_netdev(net);
	if (ret != 0) {
		pr_err("Unable to register netdev.\n");
		goto register_failed;
	}

	return ret;

register_failed:
	rndis_filter_device_remove(dev, nvdev);
rndis_failed:
	free_percpu(net_device_ctx->vf_stats);
no_stats:
	hv_set_drvdata(dev, NULL);
	free_netdev(net);
no_net:
	return ret;
}
static int netvsc_remove(struct hv_device *dev)
{
	struct net_device_context *ndev_ctx;
	struct net_device *vf_netdev;
	struct net_device *net;

	net = hv_get_drvdata(dev);
	if (net == NULL) {
		dev_err(&dev->device, "No net device to remove\n");
		return 0;
	}

	ndev_ctx = netdev_priv(net);

	netif_device_detach(net);

	cancel_delayed_work_sync(&ndev_ctx->dwork);

	/*
	 * Call to the vsc driver to let it know that the device is being
	 * removed. Also blocks mtu and channel changes.
	 */
	rtnl_lock();
	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		netvsc_unregister_vf(vf_netdev);

	unregister_netdevice(net);

	rndis_filter_device_remove(dev,
				   rtnl_dereference(ndev_ctx->nvdev));
	rtnl_unlock();

	hv_set_drvdata(dev, NULL);

	free_percpu(ndev_ctx->vf_stats);
	free_netdev(net);
	return 0;
}
static const struct hv_vmbus_device_id id_table[] = {
	/* Network guid */
	{ HV_NIC_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);

/* The one and only one */
static struct hv_driver netvsc_drv = {
	.name = KBUILD_MODNAME,
	.id_table = id_table,
	.probe = netvsc_probe,
	.remove = netvsc_remove,
};

/*
 * On Hyper-V, every VF interface is matched with a corresponding
 * synthetic interface. The synthetic interface is presented first
 * to the guest. When the corresponding VF instance is registered,
 * we will take care of switching the data path.
 */
static int netvsc_netdev_event(struct notifier_block *this,
			       unsigned long event, void *ptr)
{
	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);

	/* Skip our own events */
	if (event_dev->netdev_ops == &device_ops)
		return NOTIFY_DONE;

	/* Avoid non-Ethernet type devices */
	if (event_dev->type != ARPHRD_ETHER)
		return NOTIFY_DONE;

	/* Avoid Vlan dev with same MAC registering as VF */
	if (is_vlan_dev(event_dev))
		return NOTIFY_DONE;

	/* Avoid Bonding master dev with same MAC registering as VF */
	if ((event_dev->priv_flags & IFF_BONDING) &&
	    (event_dev->flags & IFF_MASTER))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_REGISTER:
		return netvsc_register_vf(event_dev);
	case NETDEV_UNREGISTER:
		return netvsc_unregister_vf(event_dev);
	case NETDEV_UP:
	case NETDEV_DOWN:
		return netvsc_vf_changed(event_dev);
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block netvsc_netdev_notifier = {
	.notifier_call = netvsc_netdev_event,
};

static void __exit netvsc_drv_exit(void)
{
	unregister_netdevice_notifier(&netvsc_netdev_notifier);
	vmbus_driver_unregister(&netvsc_drv);
}

static int __init netvsc_drv_init(void)
{
	int ret;

	if (ring_size < RING_SIZE_MIN) {
		ring_size = RING_SIZE_MIN;
		pr_info("Increased ring_size to %u (min allowed)\n",
			ring_size);
	}
	netvsc_ring_bytes = ring_size * PAGE_SIZE;
	netvsc_ring_reciprocal = reciprocal_value(netvsc_ring_bytes);

	ret = vmbus_driver_register(&netvsc_drv);
	if (ret)
		return ret;

	register_netdevice_notifier(&netvsc_netdev_notifier);
	return 0;
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V network driver");

module_init(netvsc_drv_init);
module_exit(netvsc_drv_exit);