// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen <hjanssen@microsoft.com>
 */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 #include <linux/kernel.h>
12 #include <linux/sched.h>
13 #include <linux/wait.h>
15 #include <linux/delay.h>
17 #include <linux/slab.h>
18 #include <linux/netdevice.h>
19 #include <linux/if_ether.h>
20 #include <linux/vmalloc.h>
21 #include <linux/rtnetlink.h>
22 #include <linux/prefetch.h>
24 #include <asm/sync_bitops.h>
26 #include "hyperv_net.h"
27 #include "netvsc_trace.h"
30 * Switch the data path from the synthetic interface to the VF
33 void netvsc_switch_datapath(struct net_device
*ndev
, bool vf
)
35 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
36 struct hv_device
*dev
= net_device_ctx
->device_ctx
;
37 struct netvsc_device
*nv_dev
= rtnl_dereference(net_device_ctx
->nvdev
);
38 struct nvsp_message
*init_pkt
= &nv_dev
->channel_init_pkt
;
40 memset(init_pkt
, 0, sizeof(struct nvsp_message
));
41 init_pkt
->hdr
.msg_type
= NVSP_MSG4_TYPE_SWITCH_DATA_PATH
;
43 init_pkt
->msg
.v4_msg
.active_dp
.active_datapath
=
46 init_pkt
->msg
.v4_msg
.active_dp
.active_datapath
=
47 NVSP_DATAPATH_SYNTHETIC
;
49 trace_nvsp_send(ndev
, init_pkt
);
51 vmbus_sendpacket(dev
->channel
, init_pkt
,
52 sizeof(struct nvsp_message
),
53 (unsigned long)init_pkt
,
54 VM_PKT_DATA_INBAND
, 0);
57 /* Worker to setup sub channels on initial setup
58 * Initial hotplug event occurs in softirq context
59 * and can't wait for channels.
61 static void netvsc_subchan_work(struct work_struct
*w
)
63 struct netvsc_device
*nvdev
=
64 container_of(w
, struct netvsc_device
, subchan_work
);
65 struct rndis_device
*rdev
;
68 /* Avoid deadlock with device removal already under RTNL */
69 if (!rtnl_trylock()) {
74 rdev
= nvdev
->extension
;
76 ret
= rndis_set_subchannel(rdev
->ndev
, nvdev
, NULL
);
78 netif_device_attach(rdev
->ndev
);
80 /* fallback to only primary channel */
81 for (i
= 1; i
< nvdev
->num_chn
; i
++)
82 netif_napi_del(&nvdev
->chan_table
[i
].napi
);
92 static struct netvsc_device
*alloc_net_device(void)
94 struct netvsc_device
*net_device
;
96 net_device
= kzalloc(sizeof(struct netvsc_device
), GFP_KERNEL
);
100 init_waitqueue_head(&net_device
->wait_drain
);
101 net_device
->destroy
= false;
102 net_device
->tx_disable
= false;
104 net_device
->max_pkt
= RNDIS_MAX_PKT_DEFAULT
;
105 net_device
->pkt_align
= RNDIS_PKT_ALIGN_DEFAULT
;
107 init_completion(&net_device
->channel_init_wait
);
108 init_waitqueue_head(&net_device
->subchan_open
);
109 INIT_WORK(&net_device
->subchan_work
, netvsc_subchan_work
);
114 static void free_netvsc_device(struct rcu_head
*head
)
116 struct netvsc_device
*nvdev
117 = container_of(head
, struct netvsc_device
, rcu
);
120 kfree(nvdev
->extension
);
121 vfree(nvdev
->recv_buf
);
122 vfree(nvdev
->send_buf
);
123 kfree(nvdev
->send_section_map
);
125 for (i
= 0; i
< VRSS_CHANNEL_MAX
; i
++)
126 vfree(nvdev
->chan_table
[i
].mrc
.slots
);
131 static void free_netvsc_device_rcu(struct netvsc_device
*nvdev
)
133 call_rcu(&nvdev
->rcu
, free_netvsc_device
);
136 static void netvsc_revoke_recv_buf(struct hv_device
*device
,
137 struct netvsc_device
*net_device
,
138 struct net_device
*ndev
)
140 struct nvsp_message
*revoke_packet
;
144 * If we got a section count, it means we received a
145 * SendReceiveBufferComplete msg (ie sent
146 * NvspMessage1TypeSendReceiveBuffer msg) therefore, we need
147 * to send a revoke msg here
149 if (net_device
->recv_section_cnt
) {
150 /* Send the revoke receive buffer */
151 revoke_packet
= &net_device
->revoke_packet
;
152 memset(revoke_packet
, 0, sizeof(struct nvsp_message
));
154 revoke_packet
->hdr
.msg_type
=
155 NVSP_MSG1_TYPE_REVOKE_RECV_BUF
;
156 revoke_packet
->msg
.v1_msg
.
157 revoke_recv_buf
.id
= NETVSC_RECEIVE_BUFFER_ID
;
159 trace_nvsp_send(ndev
, revoke_packet
);
161 ret
= vmbus_sendpacket(device
->channel
,
163 sizeof(struct nvsp_message
),
164 (unsigned long)revoke_packet
,
165 VM_PKT_DATA_INBAND
, 0);
166 /* If the failure is because the channel is rescinded;
167 * ignore the failure since we cannot send on a rescinded
168 * channel. This would allow us to properly cleanup
169 * even when the channel is rescinded.
171 if (device
->channel
->rescind
)
174 * If we failed here, we might as well return and
175 * have a leak rather than continue and a bugchk
178 netdev_err(ndev
, "unable to send "
179 "revoke receive buffer to netvsp\n");
182 net_device
->recv_section_cnt
= 0;
186 static void netvsc_revoke_send_buf(struct hv_device
*device
,
187 struct netvsc_device
*net_device
,
188 struct net_device
*ndev
)
190 struct nvsp_message
*revoke_packet
;
193 /* Deal with the send buffer we may have setup.
194 * If we got a send section size, it means we received a
195 * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent
196 * NVSP_MSG1_TYPE_SEND_SEND_BUF msg) therefore, we need
197 * to send a revoke msg here
199 if (net_device
->send_section_cnt
) {
200 /* Send the revoke receive buffer */
201 revoke_packet
= &net_device
->revoke_packet
;
202 memset(revoke_packet
, 0, sizeof(struct nvsp_message
));
204 revoke_packet
->hdr
.msg_type
=
205 NVSP_MSG1_TYPE_REVOKE_SEND_BUF
;
206 revoke_packet
->msg
.v1_msg
.revoke_send_buf
.id
=
207 NETVSC_SEND_BUFFER_ID
;
209 trace_nvsp_send(ndev
, revoke_packet
);
211 ret
= vmbus_sendpacket(device
->channel
,
213 sizeof(struct nvsp_message
),
214 (unsigned long)revoke_packet
,
215 VM_PKT_DATA_INBAND
, 0);
217 /* If the failure is because the channel is rescinded;
218 * ignore the failure since we cannot send on a rescinded
219 * channel. This would allow us to properly cleanup
220 * even when the channel is rescinded.
222 if (device
->channel
->rescind
)
225 /* If we failed here, we might as well return and
226 * have a leak rather than continue and a bugchk
229 netdev_err(ndev
, "unable to send "
230 "revoke send buffer to netvsp\n");
233 net_device
->send_section_cnt
= 0;
237 static void netvsc_teardown_recv_gpadl(struct hv_device
*device
,
238 struct netvsc_device
*net_device
,
239 struct net_device
*ndev
)
243 if (net_device
->recv_buf_gpadl_handle
) {
244 ret
= vmbus_teardown_gpadl(device
->channel
,
245 net_device
->recv_buf_gpadl_handle
);
247 /* If we failed here, we might as well return and have a leak
248 * rather than continue and a bugchk
252 "unable to teardown receive buffer's gpadl\n");
255 net_device
->recv_buf_gpadl_handle
= 0;
259 static void netvsc_teardown_send_gpadl(struct hv_device
*device
,
260 struct netvsc_device
*net_device
,
261 struct net_device
*ndev
)
265 if (net_device
->send_buf_gpadl_handle
) {
266 ret
= vmbus_teardown_gpadl(device
->channel
,
267 net_device
->send_buf_gpadl_handle
);
269 /* If we failed here, we might as well return and have a leak
270 * rather than continue and a bugchk
274 "unable to teardown send buffer's gpadl\n");
277 net_device
->send_buf_gpadl_handle
= 0;
281 int netvsc_alloc_recv_comp_ring(struct netvsc_device
*net_device
, u32 q_idx
)
283 struct netvsc_channel
*nvchan
= &net_device
->chan_table
[q_idx
];
284 int node
= cpu_to_node(nvchan
->channel
->target_cpu
);
287 size
= net_device
->recv_completion_cnt
* sizeof(struct recv_comp_data
);
288 nvchan
->mrc
.slots
= vzalloc_node(size
, node
);
289 if (!nvchan
->mrc
.slots
)
290 nvchan
->mrc
.slots
= vzalloc(size
);
292 return nvchan
->mrc
.slots
? 0 : -ENOMEM
;
295 static int netvsc_init_buf(struct hv_device
*device
,
296 struct netvsc_device
*net_device
,
297 const struct netvsc_device_info
*device_info
)
299 struct nvsp_1_message_send_receive_buffer_complete
*resp
;
300 struct net_device
*ndev
= hv_get_drvdata(device
);
301 struct nvsp_message
*init_packet
;
302 unsigned int buf_size
;
306 /* Get receive buffer area. */
307 buf_size
= device_info
->recv_sections
* device_info
->recv_section_size
;
308 buf_size
= roundup(buf_size
, PAGE_SIZE
);
310 /* Legacy hosts only allow smaller receive buffer */
311 if (net_device
->nvsp_version
<= NVSP_PROTOCOL_VERSION_2
)
312 buf_size
= min_t(unsigned int, buf_size
,
313 NETVSC_RECEIVE_BUFFER_SIZE_LEGACY
);
315 net_device
->recv_buf
= vzalloc(buf_size
);
316 if (!net_device
->recv_buf
) {
318 "unable to allocate receive buffer of size %u\n",
324 net_device
->recv_buf_size
= buf_size
;
327 * Establish the gpadl handle for this buffer on this
328 * channel. Note: This call uses the vmbus connection rather
329 * than the channel to establish the gpadl handle.
331 ret
= vmbus_establish_gpadl(device
->channel
, net_device
->recv_buf
,
333 &net_device
->recv_buf_gpadl_handle
);
336 "unable to establish receive buffer's gpadl\n");
340 /* Notify the NetVsp of the gpadl handle */
341 init_packet
= &net_device
->channel_init_pkt
;
342 memset(init_packet
, 0, sizeof(struct nvsp_message
));
343 init_packet
->hdr
.msg_type
= NVSP_MSG1_TYPE_SEND_RECV_BUF
;
344 init_packet
->msg
.v1_msg
.send_recv_buf
.
345 gpadl_handle
= net_device
->recv_buf_gpadl_handle
;
346 init_packet
->msg
.v1_msg
.
347 send_recv_buf
.id
= NETVSC_RECEIVE_BUFFER_ID
;
349 trace_nvsp_send(ndev
, init_packet
);
351 /* Send the gpadl notification request */
352 ret
= vmbus_sendpacket(device
->channel
, init_packet
,
353 sizeof(struct nvsp_message
),
354 (unsigned long)init_packet
,
356 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED
);
359 "unable to send receive buffer's gpadl to netvsp\n");
363 wait_for_completion(&net_device
->channel_init_wait
);
365 /* Check the response */
366 resp
= &init_packet
->msg
.v1_msg
.send_recv_buf_complete
;
367 if (resp
->status
!= NVSP_STAT_SUCCESS
) {
369 "Unable to complete receive buffer initialization with NetVsp - status %d\n",
375 /* Parse the response */
376 netdev_dbg(ndev
, "Receive sections: %u sub_allocs: size %u count: %u\n",
377 resp
->num_sections
, resp
->sections
[0].sub_alloc_size
,
378 resp
->sections
[0].num_sub_allocs
);
380 /* There should only be one section for the entire receive buffer */
381 if (resp
->num_sections
!= 1 || resp
->sections
[0].offset
!= 0) {
386 net_device
->recv_section_size
= resp
->sections
[0].sub_alloc_size
;
387 net_device
->recv_section_cnt
= resp
->sections
[0].num_sub_allocs
;
389 /* Setup receive completion ring */
390 net_device
->recv_completion_cnt
391 = round_up(net_device
->recv_section_cnt
+ 1,
392 PAGE_SIZE
/ sizeof(u64
));
393 ret
= netvsc_alloc_recv_comp_ring(net_device
, 0);
397 /* Now setup the send buffer. */
398 buf_size
= device_info
->send_sections
* device_info
->send_section_size
;
399 buf_size
= round_up(buf_size
, PAGE_SIZE
);
401 net_device
->send_buf
= vzalloc(buf_size
);
402 if (!net_device
->send_buf
) {
403 netdev_err(ndev
, "unable to allocate send buffer of size %u\n",
409 /* Establish the gpadl handle for this buffer on this
410 * channel. Note: This call uses the vmbus connection rather
411 * than the channel to establish the gpadl handle.
413 ret
= vmbus_establish_gpadl(device
->channel
, net_device
->send_buf
,
415 &net_device
->send_buf_gpadl_handle
);
418 "unable to establish send buffer's gpadl\n");
422 /* Notify the NetVsp of the gpadl handle */
423 init_packet
= &net_device
->channel_init_pkt
;
424 memset(init_packet
, 0, sizeof(struct nvsp_message
));
425 init_packet
->hdr
.msg_type
= NVSP_MSG1_TYPE_SEND_SEND_BUF
;
426 init_packet
->msg
.v1_msg
.send_send_buf
.gpadl_handle
=
427 net_device
->send_buf_gpadl_handle
;
428 init_packet
->msg
.v1_msg
.send_send_buf
.id
= NETVSC_SEND_BUFFER_ID
;
430 trace_nvsp_send(ndev
, init_packet
);
432 /* Send the gpadl notification request */
433 ret
= vmbus_sendpacket(device
->channel
, init_packet
,
434 sizeof(struct nvsp_message
),
435 (unsigned long)init_packet
,
437 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED
);
440 "unable to send send buffer's gpadl to netvsp\n");
444 wait_for_completion(&net_device
->channel_init_wait
);
446 /* Check the response */
447 if (init_packet
->msg
.v1_msg
.
448 send_send_buf_complete
.status
!= NVSP_STAT_SUCCESS
) {
449 netdev_err(ndev
, "Unable to complete send buffer "
450 "initialization with NetVsp - status %d\n",
451 init_packet
->msg
.v1_msg
.
452 send_send_buf_complete
.status
);
457 /* Parse the response */
458 net_device
->send_section_size
= init_packet
->msg
.
459 v1_msg
.send_send_buf_complete
.section_size
;
461 /* Section count is simply the size divided by the section size. */
462 net_device
->send_section_cnt
= buf_size
/ net_device
->send_section_size
;
464 netdev_dbg(ndev
, "Send section size: %d, Section count:%d\n",
465 net_device
->send_section_size
, net_device
->send_section_cnt
);
467 /* Setup state for managing the send buffer. */
468 map_words
= DIV_ROUND_UP(net_device
->send_section_cnt
, BITS_PER_LONG
);
470 net_device
->send_section_map
= kcalloc(map_words
, sizeof(ulong
), GFP_KERNEL
);
471 if (net_device
->send_section_map
== NULL
) {
479 netvsc_revoke_recv_buf(device
, net_device
, ndev
);
480 netvsc_revoke_send_buf(device
, net_device
, ndev
);
481 netvsc_teardown_recv_gpadl(device
, net_device
, ndev
);
482 netvsc_teardown_send_gpadl(device
, net_device
, ndev
);
488 /* Negotiate NVSP protocol version */
489 static int negotiate_nvsp_ver(struct hv_device
*device
,
490 struct netvsc_device
*net_device
,
491 struct nvsp_message
*init_packet
,
494 struct net_device
*ndev
= hv_get_drvdata(device
);
497 memset(init_packet
, 0, sizeof(struct nvsp_message
));
498 init_packet
->hdr
.msg_type
= NVSP_MSG_TYPE_INIT
;
499 init_packet
->msg
.init_msg
.init
.min_protocol_ver
= nvsp_ver
;
500 init_packet
->msg
.init_msg
.init
.max_protocol_ver
= nvsp_ver
;
501 trace_nvsp_send(ndev
, init_packet
);
503 /* Send the init request */
504 ret
= vmbus_sendpacket(device
->channel
, init_packet
,
505 sizeof(struct nvsp_message
),
506 (unsigned long)init_packet
,
508 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED
);
513 wait_for_completion(&net_device
->channel_init_wait
);
515 if (init_packet
->msg
.init_msg
.init_complete
.status
!=
519 if (nvsp_ver
== NVSP_PROTOCOL_VERSION_1
)
522 /* NVSPv2 or later: Send NDIS config */
523 memset(init_packet
, 0, sizeof(struct nvsp_message
));
524 init_packet
->hdr
.msg_type
= NVSP_MSG2_TYPE_SEND_NDIS_CONFIG
;
525 init_packet
->msg
.v2_msg
.send_ndis_config
.mtu
= ndev
->mtu
+ ETH_HLEN
;
526 init_packet
->msg
.v2_msg
.send_ndis_config
.capability
.ieee8021q
= 1;
528 if (nvsp_ver
>= NVSP_PROTOCOL_VERSION_5
) {
529 init_packet
->msg
.v2_msg
.send_ndis_config
.capability
.sriov
= 1;
531 /* Teaming bit is needed to receive link speed updates */
532 init_packet
->msg
.v2_msg
.send_ndis_config
.capability
.teaming
= 1;
535 if (nvsp_ver
>= NVSP_PROTOCOL_VERSION_61
)
536 init_packet
->msg
.v2_msg
.send_ndis_config
.capability
.rsc
= 1;
538 trace_nvsp_send(ndev
, init_packet
);
540 ret
= vmbus_sendpacket(device
->channel
, init_packet
,
541 sizeof(struct nvsp_message
),
542 (unsigned long)init_packet
,
543 VM_PKT_DATA_INBAND
, 0);
548 static int netvsc_connect_vsp(struct hv_device
*device
,
549 struct netvsc_device
*net_device
,
550 const struct netvsc_device_info
*device_info
)
552 struct net_device
*ndev
= hv_get_drvdata(device
);
553 static const u32 ver_list
[] = {
554 NVSP_PROTOCOL_VERSION_1
, NVSP_PROTOCOL_VERSION_2
,
555 NVSP_PROTOCOL_VERSION_4
, NVSP_PROTOCOL_VERSION_5
,
556 NVSP_PROTOCOL_VERSION_6
, NVSP_PROTOCOL_VERSION_61
558 struct nvsp_message
*init_packet
;
559 int ndis_version
, i
, ret
;
561 init_packet
= &net_device
->channel_init_pkt
;
563 /* Negotiate the latest NVSP protocol supported */
564 for (i
= ARRAY_SIZE(ver_list
) - 1; i
>= 0; i
--)
565 if (negotiate_nvsp_ver(device
, net_device
, init_packet
,
567 net_device
->nvsp_version
= ver_list
[i
];
576 pr_debug("Negotiated NVSP version:%x\n", net_device
->nvsp_version
);
578 /* Send the ndis version */
579 memset(init_packet
, 0, sizeof(struct nvsp_message
));
581 if (net_device
->nvsp_version
<= NVSP_PROTOCOL_VERSION_4
)
582 ndis_version
= 0x00060001;
584 ndis_version
= 0x0006001e;
586 init_packet
->hdr
.msg_type
= NVSP_MSG1_TYPE_SEND_NDIS_VER
;
587 init_packet
->msg
.v1_msg
.
588 send_ndis_ver
.ndis_major_ver
=
589 (ndis_version
& 0xFFFF0000) >> 16;
590 init_packet
->msg
.v1_msg
.
591 send_ndis_ver
.ndis_minor_ver
=
592 ndis_version
& 0xFFFF;
594 trace_nvsp_send(ndev
, init_packet
);
596 /* Send the init request */
597 ret
= vmbus_sendpacket(device
->channel
, init_packet
,
598 sizeof(struct nvsp_message
),
599 (unsigned long)init_packet
,
600 VM_PKT_DATA_INBAND
, 0);
605 ret
= netvsc_init_buf(device
, net_device
, device_info
);
612 * netvsc_device_remove - Callback when the root bus device is removed
614 void netvsc_device_remove(struct hv_device
*device
)
616 struct net_device
*ndev
= hv_get_drvdata(device
);
617 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
618 struct netvsc_device
*net_device
619 = rtnl_dereference(net_device_ctx
->nvdev
);
623 * Revoke receive buffer. If host is pre-Win2016 then tear down
624 * receive buffer GPADL. Do the same for send buffer.
626 netvsc_revoke_recv_buf(device
, net_device
, ndev
);
627 if (vmbus_proto_version
< VERSION_WIN10
)
628 netvsc_teardown_recv_gpadl(device
, net_device
, ndev
);
630 netvsc_revoke_send_buf(device
, net_device
, ndev
);
631 if (vmbus_proto_version
< VERSION_WIN10
)
632 netvsc_teardown_send_gpadl(device
, net_device
, ndev
);
634 RCU_INIT_POINTER(net_device_ctx
->nvdev
, NULL
);
636 /* And disassociate NAPI context from device */
637 for (i
= 0; i
< net_device
->num_chn
; i
++)
638 netif_napi_del(&net_device
->chan_table
[i
].napi
);
641 * At this point, no one should be accessing net_device
644 netdev_dbg(ndev
, "net device safe to remove\n");
646 /* Now, we can close the channel safely */
647 vmbus_close(device
->channel
);
650 * If host is Win2016 or higher then we do the GPADL tear down
651 * here after VMBus is closed.
653 if (vmbus_proto_version
>= VERSION_WIN10
) {
654 netvsc_teardown_recv_gpadl(device
, net_device
, ndev
);
655 netvsc_teardown_send_gpadl(device
, net_device
, ndev
);
658 /* Release all resources */
659 free_netvsc_device_rcu(net_device
);
662 #define RING_AVAIL_PERCENT_HIWATER 20
663 #define RING_AVAIL_PERCENT_LOWATER 10
665 static inline void netvsc_free_send_slot(struct netvsc_device
*net_device
,
668 sync_change_bit(index
, net_device
->send_section_map
);
671 static void netvsc_send_tx_complete(struct net_device
*ndev
,
672 struct netvsc_device
*net_device
,
673 struct vmbus_channel
*channel
,
674 const struct vmpacket_descriptor
*desc
,
677 struct sk_buff
*skb
= (struct sk_buff
*)(unsigned long)desc
->trans_id
;
678 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
682 /* Notify the layer above us */
684 const struct hv_netvsc_packet
*packet
685 = (struct hv_netvsc_packet
*)skb
->cb
;
686 u32 send_index
= packet
->send_buf_index
;
687 struct netvsc_stats
*tx_stats
;
689 if (send_index
!= NETVSC_INVALID_INDEX
)
690 netvsc_free_send_slot(net_device
, send_index
);
691 q_idx
= packet
->q_idx
;
693 tx_stats
= &net_device
->chan_table
[q_idx
].tx_stats
;
695 u64_stats_update_begin(&tx_stats
->syncp
);
696 tx_stats
->packets
+= packet
->total_packets
;
697 tx_stats
->bytes
+= packet
->total_bytes
;
698 u64_stats_update_end(&tx_stats
->syncp
);
700 napi_consume_skb(skb
, budget
);
704 atomic_dec_return(&net_device
->chan_table
[q_idx
].queue_sends
);
706 if (unlikely(net_device
->destroy
)) {
707 if (queue_sends
== 0)
708 wake_up(&net_device
->wait_drain
);
710 struct netdev_queue
*txq
= netdev_get_tx_queue(ndev
, q_idx
);
712 if (netif_tx_queue_stopped(txq
) && !net_device
->tx_disable
&&
713 (hv_get_avail_to_write_percent(&channel
->outbound
) >
714 RING_AVAIL_PERCENT_HIWATER
|| queue_sends
< 1)) {
715 netif_tx_wake_queue(txq
);
716 ndev_ctx
->eth_stats
.wake_queue
++;
721 static void netvsc_send_completion(struct net_device
*ndev
,
722 struct netvsc_device
*net_device
,
723 struct vmbus_channel
*incoming_channel
,
724 const struct vmpacket_descriptor
*desc
,
727 const struct nvsp_message
*nvsp_packet
= hv_pkt_data(desc
);
729 switch (nvsp_packet
->hdr
.msg_type
) {
730 case NVSP_MSG_TYPE_INIT_COMPLETE
:
731 case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE
:
732 case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE
:
733 case NVSP_MSG5_TYPE_SUBCHANNEL
:
734 /* Copy the response back */
735 memcpy(&net_device
->channel_init_pkt
, nvsp_packet
,
736 sizeof(struct nvsp_message
));
737 complete(&net_device
->channel_init_wait
);
740 case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE
:
741 netvsc_send_tx_complete(ndev
, net_device
, incoming_channel
,
747 "Unknown send completion type %d received!!\n",
748 nvsp_packet
->hdr
.msg_type
);
752 static u32
netvsc_get_next_send_section(struct netvsc_device
*net_device
)
754 unsigned long *map_addr
= net_device
->send_section_map
;
757 for_each_clear_bit(i
, map_addr
, net_device
->send_section_cnt
) {
758 if (sync_test_and_set_bit(i
, map_addr
) == 0)
762 return NETVSC_INVALID_INDEX
;
765 static void netvsc_copy_to_send_buf(struct netvsc_device
*net_device
,
766 unsigned int section_index
,
768 struct hv_netvsc_packet
*packet
,
769 struct rndis_message
*rndis_msg
,
770 struct hv_page_buffer
*pb
,
773 char *start
= net_device
->send_buf
;
774 char *dest
= start
+ (section_index
* net_device
->send_section_size
)
778 u32 page_count
= packet
->cp_partial
? packet
->rmsg_pgcnt
:
779 packet
->page_buf_cnt
;
783 remain
= packet
->total_data_buflen
& (net_device
->pkt_align
- 1);
784 if (xmit_more
&& remain
) {
785 padding
= net_device
->pkt_align
- remain
;
786 rndis_msg
->msg_len
+= padding
;
787 packet
->total_data_buflen
+= padding
;
790 for (i
= 0; i
< page_count
; i
++) {
791 char *src
= phys_to_virt(pb
[i
].pfn
<< PAGE_SHIFT
);
792 u32 offset
= pb
[i
].offset
;
795 memcpy(dest
, (src
+ offset
), len
);
800 memset(dest
, 0, padding
);
803 static inline int netvsc_send_pkt(
804 struct hv_device
*device
,
805 struct hv_netvsc_packet
*packet
,
806 struct netvsc_device
*net_device
,
807 struct hv_page_buffer
*pb
,
810 struct nvsp_message nvmsg
;
811 struct nvsp_1_message_send_rndis_packet
*rpkt
=
812 &nvmsg
.msg
.v1_msg
.send_rndis_pkt
;
813 struct netvsc_channel
* const nvchan
=
814 &net_device
->chan_table
[packet
->q_idx
];
815 struct vmbus_channel
*out_channel
= nvchan
->channel
;
816 struct net_device
*ndev
= hv_get_drvdata(device
);
817 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
818 struct netdev_queue
*txq
= netdev_get_tx_queue(ndev
, packet
->q_idx
);
821 u32 ring_avail
= hv_get_avail_to_write_percent(&out_channel
->outbound
);
823 nvmsg
.hdr
.msg_type
= NVSP_MSG1_TYPE_SEND_RNDIS_PKT
;
825 rpkt
->channel_type
= 0; /* 0 is RMC_DATA */
827 rpkt
->channel_type
= 1; /* 1 is RMC_CONTROL */
829 rpkt
->send_buf_section_index
= packet
->send_buf_index
;
830 if (packet
->send_buf_index
== NETVSC_INVALID_INDEX
)
831 rpkt
->send_buf_section_size
= 0;
833 rpkt
->send_buf_section_size
= packet
->total_data_buflen
;
837 if (out_channel
->rescind
)
840 trace_nvsp_send_pkt(ndev
, out_channel
, rpkt
);
842 if (packet
->page_buf_cnt
) {
843 if (packet
->cp_partial
)
844 pb
+= packet
->rmsg_pgcnt
;
846 ret
= vmbus_sendpacket_pagebuffer(out_channel
,
847 pb
, packet
->page_buf_cnt
,
848 &nvmsg
, sizeof(nvmsg
),
851 ret
= vmbus_sendpacket(out_channel
,
852 &nvmsg
, sizeof(nvmsg
),
853 req_id
, VM_PKT_DATA_INBAND
,
854 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED
);
858 atomic_inc_return(&nvchan
->queue_sends
);
860 if (ring_avail
< RING_AVAIL_PERCENT_LOWATER
) {
861 netif_tx_stop_queue(txq
);
862 ndev_ctx
->eth_stats
.stop_queue
++;
864 } else if (ret
== -EAGAIN
) {
865 netif_tx_stop_queue(txq
);
866 ndev_ctx
->eth_stats
.stop_queue
++;
869 "Unable to send packet pages %u len %u, ret %d\n",
870 packet
->page_buf_cnt
, packet
->total_data_buflen
,
874 if (netif_tx_queue_stopped(txq
) &&
875 atomic_read(&nvchan
->queue_sends
) < 1 &&
876 !net_device
->tx_disable
) {
877 netif_tx_wake_queue(txq
);
878 ndev_ctx
->eth_stats
.wake_queue
++;
886 /* Move packet out of multi send data (msd), and clear msd */
887 static inline void move_pkt_msd(struct hv_netvsc_packet
**msd_send
,
888 struct sk_buff
**msd_skb
,
889 struct multi_send_data
*msdp
)
891 *msd_skb
= msdp
->skb
;
892 *msd_send
= msdp
->pkt
;
898 /* RCU already held by caller */
899 int netvsc_send(struct net_device
*ndev
,
900 struct hv_netvsc_packet
*packet
,
901 struct rndis_message
*rndis_msg
,
902 struct hv_page_buffer
*pb
,
905 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
906 struct netvsc_device
*net_device
907 = rcu_dereference_bh(ndev_ctx
->nvdev
);
908 struct hv_device
*device
= ndev_ctx
->device_ctx
;
910 struct netvsc_channel
*nvchan
;
911 u32 pktlen
= packet
->total_data_buflen
, msd_len
= 0;
912 unsigned int section_index
= NETVSC_INVALID_INDEX
;
913 struct multi_send_data
*msdp
;
914 struct hv_netvsc_packet
*msd_send
= NULL
, *cur_send
= NULL
;
915 struct sk_buff
*msd_skb
= NULL
;
916 bool try_batch
, xmit_more
;
918 /* If device is rescinded, return error and packet will get dropped. */
919 if (unlikely(!net_device
|| net_device
->destroy
))
922 nvchan
= &net_device
->chan_table
[packet
->q_idx
];
923 packet
->send_buf_index
= NETVSC_INVALID_INDEX
;
924 packet
->cp_partial
= false;
926 /* Send control message directly without accessing msd (Multi-Send
927 * Data) field which may be changed during data packet processing.
930 return netvsc_send_pkt(device
, packet
, net_device
, pb
, skb
);
932 /* batch packets in send buffer if possible */
935 msd_len
= msdp
->pkt
->total_data_buflen
;
937 try_batch
= msd_len
> 0 && msdp
->count
< net_device
->max_pkt
;
938 if (try_batch
&& msd_len
+ pktlen
+ net_device
->pkt_align
<
939 net_device
->send_section_size
) {
940 section_index
= msdp
->pkt
->send_buf_index
;
942 } else if (try_batch
&& msd_len
+ packet
->rmsg_size
<
943 net_device
->send_section_size
) {
944 section_index
= msdp
->pkt
->send_buf_index
;
945 packet
->cp_partial
= true;
947 } else if (pktlen
+ net_device
->pkt_align
<
948 net_device
->send_section_size
) {
949 section_index
= netvsc_get_next_send_section(net_device
);
950 if (unlikely(section_index
== NETVSC_INVALID_INDEX
)) {
951 ++ndev_ctx
->eth_stats
.tx_send_full
;
953 move_pkt_msd(&msd_send
, &msd_skb
, msdp
);
958 /* Keep aggregating only if stack says more data is coming
959 * and not doing mixed modes send and not flow blocked
961 xmit_more
= netdev_xmit_more() &&
962 !packet
->cp_partial
&&
963 !netif_xmit_stopped(netdev_get_tx_queue(ndev
, packet
->q_idx
));
965 if (section_index
!= NETVSC_INVALID_INDEX
) {
966 netvsc_copy_to_send_buf(net_device
,
967 section_index
, msd_len
,
968 packet
, rndis_msg
, pb
, xmit_more
);
970 packet
->send_buf_index
= section_index
;
972 if (packet
->cp_partial
) {
973 packet
->page_buf_cnt
-= packet
->rmsg_pgcnt
;
974 packet
->total_data_buflen
= msd_len
+ packet
->rmsg_size
;
976 packet
->page_buf_cnt
= 0;
977 packet
->total_data_buflen
+= msd_len
;
981 packet
->total_packets
+= msdp
->pkt
->total_packets
;
982 packet
->total_bytes
+= msdp
->pkt
->total_bytes
;
986 dev_consume_skb_any(msdp
->skb
);
999 move_pkt_msd(&msd_send
, &msd_skb
, msdp
);
1004 int m_ret
= netvsc_send_pkt(device
, msd_send
, net_device
,
1008 netvsc_free_send_slot(net_device
,
1009 msd_send
->send_buf_index
);
1010 dev_kfree_skb_any(msd_skb
);
1015 ret
= netvsc_send_pkt(device
, cur_send
, net_device
, pb
, skb
);
1017 if (ret
!= 0 && section_index
!= NETVSC_INVALID_INDEX
)
1018 netvsc_free_send_slot(net_device
, section_index
);
1023 /* Send pending recv completions */
1024 static int send_recv_completions(struct net_device
*ndev
,
1025 struct netvsc_device
*nvdev
,
1026 struct netvsc_channel
*nvchan
)
1028 struct multi_recv_comp
*mrc
= &nvchan
->mrc
;
1029 struct recv_comp_msg
{
1030 struct nvsp_message_header hdr
;
1033 struct recv_comp_msg msg
= {
1034 .hdr
.msg_type
= NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE
,
1038 while (mrc
->first
!= mrc
->next
) {
1039 const struct recv_comp_data
*rcd
1040 = mrc
->slots
+ mrc
->first
;
1042 msg
.status
= rcd
->status
;
1043 ret
= vmbus_sendpacket(nvchan
->channel
, &msg
, sizeof(msg
),
1044 rcd
->tid
, VM_PKT_COMP
, 0);
1045 if (unlikely(ret
)) {
1046 struct net_device_context
*ndev_ctx
= netdev_priv(ndev
);
1048 ++ndev_ctx
->eth_stats
.rx_comp_busy
;
1052 if (++mrc
->first
== nvdev
->recv_completion_cnt
)
1056 /* receive completion ring has been emptied */
1057 if (unlikely(nvdev
->destroy
))
1058 wake_up(&nvdev
->wait_drain
);
1063 /* Count how many receive completions are outstanding */
1064 static void recv_comp_slot_avail(const struct netvsc_device
*nvdev
,
1065 const struct multi_recv_comp
*mrc
,
1066 u32
*filled
, u32
*avail
)
1068 u32 count
= nvdev
->recv_completion_cnt
;
1070 if (mrc
->next
>= mrc
->first
)
1071 *filled
= mrc
->next
- mrc
->first
;
1073 *filled
= (count
- mrc
->first
) + mrc
->next
;
1075 *avail
= count
- *filled
- 1;
1078 /* Add receive complete to ring to send to host. */
1079 static void enq_receive_complete(struct net_device
*ndev
,
1080 struct netvsc_device
*nvdev
, u16 q_idx
,
1081 u64 tid
, u32 status
)
1083 struct netvsc_channel
*nvchan
= &nvdev
->chan_table
[q_idx
];
1084 struct multi_recv_comp
*mrc
= &nvchan
->mrc
;
1085 struct recv_comp_data
*rcd
;
1088 recv_comp_slot_avail(nvdev
, mrc
, &filled
, &avail
);
1090 if (unlikely(filled
> NAPI_POLL_WEIGHT
)) {
1091 send_recv_completions(ndev
, nvdev
, nvchan
);
1092 recv_comp_slot_avail(nvdev
, mrc
, &filled
, &avail
);
1095 if (unlikely(!avail
)) {
1096 netdev_err(ndev
, "Recv_comp full buf q:%hd, tid:%llx\n",
1101 rcd
= mrc
->slots
+ mrc
->next
;
1103 rcd
->status
= status
;
1105 if (++mrc
->next
== nvdev
->recv_completion_cnt
)
1109 static int netvsc_receive(struct net_device
*ndev
,
1110 struct netvsc_device
*net_device
,
1111 struct netvsc_channel
*nvchan
,
1112 const struct vmpacket_descriptor
*desc
,
1113 const struct nvsp_message
*nvsp
)
1115 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
1116 struct vmbus_channel
*channel
= nvchan
->channel
;
1117 const struct vmtransfer_page_packet_header
*vmxferpage_packet
1118 = container_of(desc
, const struct vmtransfer_page_packet_header
, d
);
1119 u16 q_idx
= channel
->offermsg
.offer
.sub_channel_index
;
1120 char *recv_buf
= net_device
->recv_buf
;
1121 u32 status
= NVSP_STAT_SUCCESS
;
1125 /* Make sure this is a valid nvsp packet */
1126 if (unlikely(nvsp
->hdr
.msg_type
!= NVSP_MSG1_TYPE_SEND_RNDIS_PKT
)) {
1127 netif_err(net_device_ctx
, rx_err
, ndev
,
1128 "Unknown nvsp packet type received %u\n",
1129 nvsp
->hdr
.msg_type
);
1133 if (unlikely(vmxferpage_packet
->xfer_pageset_id
!= NETVSC_RECEIVE_BUFFER_ID
)) {
1134 netif_err(net_device_ctx
, rx_err
, ndev
,
1135 "Invalid xfer page set id - expecting %x got %x\n",
1136 NETVSC_RECEIVE_BUFFER_ID
,
1137 vmxferpage_packet
->xfer_pageset_id
);
1141 count
= vmxferpage_packet
->range_cnt
;
1143 /* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
1144 for (i
= 0; i
< count
; i
++) {
1145 u32 offset
= vmxferpage_packet
->ranges
[i
].byte_offset
;
1146 u32 buflen
= vmxferpage_packet
->ranges
[i
].byte_count
;
1150 if (unlikely(offset
+ buflen
> net_device
->recv_buf_size
)) {
1151 nvchan
->rsc
.cnt
= 0;
1152 status
= NVSP_STAT_FAIL
;
1153 netif_err(net_device_ctx
, rx_err
, ndev
,
1154 "Packet offset:%u + len:%u too big\n",
1160 data
= recv_buf
+ offset
;
1162 nvchan
->rsc
.is_last
= (i
== count
- 1);
1164 trace_rndis_recv(ndev
, q_idx
, data
);
1166 /* Pass it to the upper layer */
1167 ret
= rndis_filter_receive(ndev
, net_device
,
1168 nvchan
, data
, buflen
);
1170 if (unlikely(ret
!= NVSP_STAT_SUCCESS
))
1171 status
= NVSP_STAT_FAIL
;
1174 enq_receive_complete(ndev
, net_device
, q_idx
,
1175 vmxferpage_packet
->d
.trans_id
, status
);
/* netvsc_send_table - copy the send indirection table carried by an NVSP
 * message into the per-device context.
 *
 * @ndev:  net device; its netdev_priv() area holds the tx_table to fill
 * @nvmsg: NVSP message whose msg.v5_msg.send_table supplies count and offset
 *
 * Reads send_table.count and logs "Received wrong send-table size" through
 * netdev_err() when it differs from VRSS_SEND_TAB_SIZE. The table itself is
 * located at send_table.offset bytes past the address of the send_table
 * member; count entries are then copied into net_device_ctx->tx_table.
 *
 * NOTE(review): embedded source lines 1184-1186 and 1190-1192 are absent from
 * this view (presumably the count/tab/i declarations and the bail-out after
 * the size error) - confirm against the full file.
 * NOTE(review): no range check on send_table.offset is visible in this
 * function - verify that the offset is validated before tab[] is read.
 */
1180 static void netvsc_send_table(struct net_device
*ndev
,
1181 const struct nvsp_message
*nvmsg
)
1183 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
1187 count
= nvmsg
->msg
.v5_msg
.send_table
.count
;
/* Only a table of exactly VRSS_SEND_TAB_SIZE entries passes this check. */
1188 if (count
!= VRSS_SEND_TAB_SIZE
) {
1189 netdev_err(ndev
, "Received wrong send-table size:%u\n", count
);
/* Table address = address of the send_table member + send_table.offset. */
1193 tab
= (u32
*)((unsigned long)&nvmsg
->msg
.v5_msg
.send_table
+
1194 nvmsg
->msg
.v5_msg
.send_table
.offset
);
/* Copy every entry into the device context's tx_table. */
1196 for (i
= 0; i
< count
; i
++)
1197 net_device_ctx
->tx_table
[i
] = tab
[i
];
/* netvsc_send_vf - record the VF association carried by an NVSP message.
 *
 * @ndev:  net device; its netdev_priv() area receives the VF state
 * @nvmsg: NVSP message whose msg.v4_msg.vf_assoc supplies allocated/serial
 *
 * Stores vf_assoc.allocated in net_device_ctx->vf_alloc and vf_assoc.serial
 * in net_device_ctx->vf_serial, then logs "VF slot <serial> added" when
 * vf_alloc is non-zero and "VF slot <serial> removed" otherwise.
 */
1200 static void netvsc_send_vf(struct net_device
*ndev
,
1201 const struct nvsp_message
*nvmsg
)
1203 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
1205 net_device_ctx
->vf_alloc
= nvmsg
->msg
.v4_msg
.vf_assoc
.allocated
;
1206 net_device_ctx
->vf_serial
= nvmsg
->msg
.v4_msg
.vf_assoc
.serial
;
/* Log the values just stored so the message matches the recorded state. */
1207 netdev_info(ndev
, "VF slot %u %s\n",
1208 net_device_ctx
->vf_serial
,
1209 net_device_ctx
->vf_alloc
? "added" : "removed");
/* netvsc_receive_inband - dispatch an in-band NVSP message by its type.
 *
 * @ndev:  net device the message is addressed to
 * @nvmsg: NVSP message; hdr.msg_type selects the handler
 *
 * NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE is passed to netvsc_send_table() and
 * NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION to netvsc_send_vf(). No other message
 * type has a visible handler here.
 *
 * NOTE(review): the lines following each handler call (embedded lines
 * 1218-1219 and 1222 onwards) are absent from this view - presumably break
 * statements and the closing braces; confirm against the full file.
 */
1212 static void netvsc_receive_inband(struct net_device
*ndev
,
1213 const struct nvsp_message
*nvmsg
)
1215 switch (nvmsg
->hdr
.msg_type
) {
1216 case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE
:
1217 netvsc_send_table(ndev
, nvmsg
);
1220 case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION
:
1221 netvsc_send_vf(ndev
, nvmsg
);
/* netvsc_process_raw_pkt - handle one VMBus packet descriptor.
 *
 * @device:     Hyper-V device the channel belongs to
 * @nvchan:     netvsc channel the descriptor was read from
 * @net_device: netvsc device state
 * @ndev:       associated net device
 * @desc:       packet descriptor; its payload is read with hv_pkt_data()
 *
 * Traces the NVSP message, then switches on desc->type:
 *  - one case (label not visible) calls netvsc_send_completion();
 *  - VM_PKT_DATA_USING_XFER_PAGES returns the result of netvsc_receive();
 *  - VM_PKT_DATA_INBAND calls netvsc_receive_inband();
 *  - an "unhandled packet type" error is logged for what is presumably the
 *    default case.
 *
 * Returns an int; netvsc_poll() adds it to work_done.
 *
 * NOTE(review): the final parameter line (embedded line 1231) is absent from
 * this view; the caller in netvsc_poll() passes budget as a sixth argument.
 * The case label at line 1239, the trailing arguments of the
 * netvsc_send_completion()/netvsc_receive() calls and the return value for
 * the non-receive cases are likewise not visible - confirm against the full
 * file.
 */
1226 static int netvsc_process_raw_pkt(struct hv_device
*device
,
1227 struct netvsc_channel
*nvchan
,
1228 struct netvsc_device
*net_device
,
1229 struct net_device
*ndev
,
1230 const struct vmpacket_descriptor
*desc
,
1233 struct vmbus_channel
*channel
= nvchan
->channel
;
1234 const struct nvsp_message
*nvmsg
= hv_pkt_data(desc
);
1236 trace_nvsp_recv(ndev
, channel
, nvmsg
);
/* Dispatch on the VMBus packet type carried in the descriptor. */
1238 switch (desc
->type
) {
1240 netvsc_send_completion(ndev
, net_device
, channel
,
1244 case VM_PKT_DATA_USING_XFER_PAGES
:
1245 return netvsc_receive(ndev
, net_device
, nvchan
,
1249 case VM_PKT_DATA_INBAND
:
1250 netvsc_receive_inband(ndev
, nvmsg
);
1254 netdev_err(ndev
, "unhandled packet type %d, tid %llx\n",
1255 desc
->type
, desc
->trans_id
);
/* netvsc_channel_to_device - find the hv_device that owns a VMBus channel.
 *
 * @channel: VMBus channel to look up
 *
 * Returns the device_obj of channel->primary_channel when that pointer is
 * non-NULL, otherwise the device_obj of @channel itself.
 */
1262 static struct hv_device
*netvsc_channel_to_device(struct vmbus_channel
*channel
)
1264 struct vmbus_channel
*primary
= channel
->primary_channel
;
1266 return primary
? primary
->device_obj
: channel
->device_obj
;
/* netvsc_poll - NAPI poll handler for one netvsc channel.
 *
 * @napi:   NAPI context embedded in a struct netvsc_channel
 * @budget: work limit for this poll round
 *
 * Recovers the netvsc_channel with container_of(), then walks the inbound
 * packet iterator: each descriptor is handed to netvsc_process_raw_pkt() and
 * its return value is added to work_done, until the iterator yields no
 * descriptor or work_done reaches @budget. Pending receive completions are
 * then pushed with send_recv_completions().
 *
 * The channel is rescheduled only when all of the following hold, evaluated
 * in this order: work_done < budget, napi_complete_done() returns non-zero,
 * either send_recv_completions() returned non-zero or hv_end_read() returns
 * non-zero, and napi_schedule_prep() succeeds. In that case hv_begin_read()
 * is called before __napi_schedule().
 *
 * Returns min(work_done, budget).
 *
 * NOTE(review): embedded lines 1281-1283 (presumably the work_done/ret
 * declarations) and 1285 (the condition guarding hv_pkt_iter_first()) are
 * absent from this view, as are the terminators of two block comments -
 * confirm against the full file.
 */
1269 /* Network processing softirq
1270 * Process data in incoming ring buffer from host
1271 * Stops when ring is empty or budget is met or exceeded.
1273 int netvsc_poll(struct napi_struct
*napi
, int budget
)
1275 struct netvsc_channel
*nvchan
1276 = container_of(napi
, struct netvsc_channel
, napi
);
1277 struct netvsc_device
*net_device
= nvchan
->net_device
;
1278 struct vmbus_channel
*channel
= nvchan
->channel
;
1279 struct hv_device
*device
= netvsc_channel_to_device(channel
);
1280 struct net_device
*ndev
= hv_get_drvdata(device
);
1284 /* If starting a new interval */
1286 nvchan
->desc
= hv_pkt_iter_first(channel
);
/* Consume descriptors until none remain or the budget is reached. */
1288 while (nvchan
->desc
&& work_done
< budget
) {
1289 work_done
+= netvsc_process_raw_pkt(device
, nvchan
, net_device
,
1290 ndev
, nvchan
->desc
, budget
);
1291 nvchan
->desc
= hv_pkt_iter_next(channel
, nvchan
->desc
);
1294 /* Send any pending receive completions */
1295 ret
= send_recv_completions(ndev
, net_device
, nvchan
);
1297 /* If it did not exhaust NAPI budget this time
1298 * and not doing busy poll
1299 * then re-enable host interrupts
1300 * and reschedule if ring is not empty
1301 * or sending receive completion failed.
1303 if (work_done
< budget
&&
1304 napi_complete_done(napi
, work_done
) &&
1305 (ret
|| hv_end_read(&channel
->inbound
)) &&
1306 napi_schedule_prep(napi
)) {
1307 hv_begin_read(&channel
->inbound
);
1308 __napi_schedule(napi
);
1311 /* Driver may overshoot since multiple packets per descriptor */
1312 return min(work_done
, budget
);
/* netvsc_channel_cb - channel callback that defers work to NAPI.
 *
 * @context: opaque pointer, used here as a struct netvsc_channel *
 *
 * Prefetches the inbound ring buffer at rbi->priv_read_index, then, when
 * napi_schedule_prep() succeeds, schedules the channel's NAPI context with
 * __napi_schedule_irqoff().
 *
 * NOTE(review): the statement between napi_schedule_prep() and
 * __napi_schedule_irqoff() (embedded lines 1329-1330) is absent from this
 * view; the surviving comment says it disables interrupts from the host -
 * confirm against the full file.
 */
1315 /* Call back when data is available in host ring buffer.
1316 * Processing is deferred until network softirq (NAPI)
1318 void netvsc_channel_cb(void *context
)
1320 struct netvsc_channel
*nvchan
= context
;
1321 struct vmbus_channel
*channel
= nvchan
->channel
;
1322 struct hv_ring_buffer_info
*rbi
= &channel
->inbound
;
1324 /* preload first vmpacket descriptor */
1325 prefetch(hv_get_ring_buffer(rbi
) + rbi
->priv_read_index
);
1327 if (napi_schedule_prep(&nvchan
->napi
)) {
1328 /* disable interrupts from host */
1331 __napi_schedule_irqoff(&nvchan
->napi
);
/* netvsc_device_add - allocate and bring up the netvsc device state for a
 * Hyper-V device.
 *
 * @device:      Hyper-V device; hv_get_drvdata() yields its net_device
 * @device_info: passed through to netvsc_connect_vsp()
 *
 * Visible sequence:
 *  1. alloc_net_device(); an ERR_PTR(-ENOMEM) return follows it.
 *  2. Zero all VRSS_SEND_TAB_SIZE entries of net_device_ctx->tx_table.
 *  3. set_channel_read_mode(device->channel, HV_CALL_ISR).
 *  4. Point every chan_table[] slot (VRSS_CHANNEL_MAX of them) at the primary
 *     channel and at net_device, and initialise its tx/rx stats sync points.
 *  5. netif_napi_add() on chan_table[0] with netvsc_poll as the handler.
 *  6. vmbus_open() with netvsc_channel_cb as callback and chan_table as its
 *     context.
 *  7. napi_enable() on chan_table[0], then netvsc_connect_vsp().
 *  8. rcu_assign_pointer() publishes net_device in net_device_ctx->nvdev.
 *
 * The trailing statements undo those steps: nvdev is reset to NULL, NAPI is
 * disabled, the channel is closed, the NAPI context is deleted, the device is
 * released with free_netvsc_device() and ERR_PTR(ret) is returned.
 *
 * NOTE(review): the failure checks, goto labels and the success return are
 * absent from this view (e.g. embedded lines 1348, 1383-1384, 1386-1388,
 * 1396-1397, 1399-1401, 1406-1409) - presumably the function returns
 * net_device on success; confirm against the full file.
 */
1336 * netvsc_device_add - Callback when the device belonging to this
1339 struct netvsc_device
*netvsc_device_add(struct hv_device
*device
,
1340 const struct netvsc_device_info
*device_info
)
1343 struct netvsc_device
*net_device
;
1344 struct net_device
*ndev
= hv_get_drvdata(device
);
1345 struct net_device_context
*net_device_ctx
= netdev_priv(ndev
);
1347 net_device
= alloc_net_device();
1349 return ERR_PTR(-ENOMEM
);
/* Start with every send-table entry set to 0. */
1351 for (i
= 0; i
< VRSS_SEND_TAB_SIZE
; i
++)
1352 net_device_ctx
->tx_table
[i
] = 0;
1354 /* Because the device uses NAPI, all the interrupt batching and
1355 * control is done via Net softirq, not the channel handling
1357 set_channel_read_mode(device
->channel
, HV_CALL_ISR
);
1359 /* If we're reopening the device we may have multiple queues, fill the
1360 * chn_table with the default channel to use it before subchannels are
1362 * Initialize the channel state before we open;
1363 * we can be interrupted as soon as we open the channel.
1366 for (i
= 0; i
< VRSS_CHANNEL_MAX
; i
++) {
1367 struct netvsc_channel
*nvchan
= &net_device
->chan_table
[i
];
1369 nvchan
->channel
= device
->channel
;
1370 nvchan
->net_device
= net_device
;
1371 u64_stats_init(&nvchan
->tx_stats
.syncp
);
1372 u64_stats_init(&nvchan
->rx_stats
.syncp
);
1375 /* Enable NAPI handler before init callbacks */
1376 netif_napi_add(ndev
, &net_device
->chan_table
[0].napi
,
1377 netvsc_poll
, NAPI_POLL_WEIGHT
);
1379 /* Open the channel */
1380 ret
= vmbus_open(device
->channel
, netvsc_ring_bytes
,
1381 netvsc_ring_bytes
, NULL
, 0,
1382 netvsc_channel_cb
, net_device
->chan_table
);
1385 netdev_err(ndev
, "unable to open channel: %d\n", ret
);
1389 /* Channel is opened */
1390 netdev_dbg(ndev
, "hv_netvsc channel opened successfully\n");
1392 napi_enable(&net_device
->chan_table
[0].napi
);
1394 /* Connect with the NetVsp */
1395 ret
= netvsc_connect_vsp(device
, net_device
, device_info
);
1398 "unable to connect to NetVSP - %d\n", ret
);
1402 /* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
1405 rcu_assign_pointer(net_device_ctx
->nvdev
, net_device
);
1410 RCU_INIT_POINTER(net_device_ctx
->nvdev
, NULL
);
1411 napi_disable(&net_device
->chan_table
[0].napi
);
1413 /* Now, we can close the channel safely */
1414 vmbus_close(device
->channel
);
1417 netif_napi_del(&net_device
->chan_table
[0].napi
);
1418 free_netvsc_device(&net_device
->rcu
);
1420 return ERR_PTR(ret
);