// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
#include <linux/filter.h>

#include <asm/sync_bitops.h>
#include <asm/mshyperv.h>

#include "hyperv_net.h"
#include "netvsc_trace.h"
/* Switch the data path from the synthetic interface to the VF
 * interface.
 */
int netvsc_switch_datapath(struct net_device *ndev, bool vf)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct hv_device *dev = net_device_ctx->device_ctx;
	struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
	struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
	int ret, retry = 0;

	/* Block sending traffic to VF if it's about to be gone */
	if (!vf)
		net_device_ctx->data_path_is_vf = vf;

	memset(init_pkt, 0, sizeof(struct nvsp_message));
	init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
	if (vf)
		init_pkt->msg.v4_msg.active_dp.active_datapath =
			NVSP_DATAPATH_VF;
	else
		init_pkt->msg.v4_msg.active_dp.active_datapath =
			NVSP_DATAPATH_SYNTHETIC;

again:
	trace_nvsp_send(ndev, init_pkt);

	ret = vmbus_sendpacket(dev->channel, init_pkt,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);

	/* If failed to switch to/from VF, let data_path_is_vf stay false,
	 * so we use synthetic path to send data.
	 */
	if (ret) {
		if (ret != -EAGAIN) {
			netdev_err(ndev,
				   "Unable to send sw datapath msg, err: %d\n",
				   ret);
			return ret;
		}

		if (retry++ < RETRY_MAX) {
			usleep_range(RETRY_US_LO, RETRY_US_HI);
			goto again;
		}

		netdev_err(ndev,
			   "Retry failed to send sw datapath msg, err: %d\n",
			   ret);
		return ret;
	}

	wait_for_completion(&nv_dev->channel_init_wait);
	net_device_ctx->data_path_is_vf = vf;

	return 0;
}
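/* The switch above is acknowledged asynchronously: netvsc_send_completion()
 * completes channel_init_wait when the host responds to
 * NVSP_MSG4_TYPE_SWITCH_DATA_PATH, so data_path_is_vf is only committed to
 * the new value after the host has confirmed the switch.
 */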
/* Worker to setup sub channels on initial setup
 * Initial hotplug event occurs in softirq context
 * and can't wait for channels.
 */
static void netvsc_subchan_work(struct work_struct *w)
{
	struct netvsc_device *nvdev =
		container_of(w, struct netvsc_device, subchan_work);
	struct rndis_device *rdev;
	int i, ret;

	/* Avoid deadlock with device removal already under RTNL */
	if (!rtnl_trylock()) {
		schedule_work(w);
		return;
	}

	rdev = nvdev->extension;
	if (rdev) {
		ret = rndis_set_subchannel(rdev->ndev, nvdev, NULL);
		if (ret == 0) {
			netif_device_attach(rdev->ndev);
		} else {
			/* fallback to only primary channel */
			for (i = 1; i < nvdev->num_chn; i++)
				netif_napi_del(&nvdev->chan_table[i].napi);

			nvdev->max_chn = 1;
			nvdev->num_chn = 1;
		}
	}

	rtnl_unlock();
}
static struct netvsc_device *alloc_net_device(void)
{
	struct netvsc_device *net_device;

	net_device = kzalloc(sizeof(struct netvsc_device), GFP_KERNEL);
	if (!net_device)
		return NULL;

	init_waitqueue_head(&net_device->wait_drain);
	net_device->destroy = false;
	net_device->tx_disable = true;

	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;

	init_completion(&net_device->channel_init_wait);
	init_waitqueue_head(&net_device->subchan_open);
	INIT_WORK(&net_device->subchan_work, netvsc_subchan_work);

	return net_device;
}
static void free_netvsc_device(struct rcu_head *head)
{
	struct netvsc_device *nvdev
		= container_of(head, struct netvsc_device, rcu);
	int i;

	kfree(nvdev->extension);

	if (!nvdev->recv_buf_gpadl_handle.decrypted)
		vfree(nvdev->recv_buf);
	if (!nvdev->send_buf_gpadl_handle.decrypted)
		vfree(nvdev->send_buf);
	bitmap_free(nvdev->send_section_map);

	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		xdp_rxq_info_unreg(&nvdev->chan_table[i].xdp_rxq);
		kfree(nvdev->chan_table[i].recv_buf);
		vfree(nvdev->chan_table[i].mrc.slots);
	}

	kfree(nvdev);
}

static void free_netvsc_device_rcu(struct netvsc_device *nvdev)
{
	call_rcu(&nvdev->rcu, free_netvsc_device);
}
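/* The netvsc_device is freed via call_rcu() because the transmit path looks
 * it up with rcu_dereference_bh() (see netvsc_send()); deferring the free
 * lets in-flight senders finish before the memory goes away.
 */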
static void netvsc_revoke_recv_buf(struct hv_device *device,
				   struct netvsc_device *net_device,
				   struct net_device *ndev)
{
	struct nvsp_message *revoke_packet;
	int ret;

	/*
	 * If we got a section count, it means we received a
	 * SendReceiveBufferComplete msg (ie sent
	 * NvspMessage1TypeSendReceiveBuffer msg) therefore, we need
	 * to send a revoke msg here
	 */
	if (net_device->recv_section_cnt) {
		/* Send the revoke receive buffer */
		revoke_packet = &net_device->revoke_packet;
		memset(revoke_packet, 0, sizeof(struct nvsp_message));

		revoke_packet->hdr.msg_type =
			NVSP_MSG1_TYPE_REVOKE_RECV_BUF;
		revoke_packet->msg.v1_msg.
			revoke_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;

		trace_nvsp_send(ndev, revoke_packet);

		ret = vmbus_sendpacket(device->channel,
				       revoke_packet,
				       sizeof(struct nvsp_message),
				       VMBUS_RQST_ID_NO_RESPONSE,
				       VM_PKT_DATA_INBAND, 0);
		/* If the failure is because the channel is rescinded,
		 * ignore the failure since we cannot send on a rescinded
		 * channel. This would allow us to properly cleanup
		 * even when the channel is rescinded.
		 */
		if (device->channel->rescind)
			ret = 0;
		/*
		 * If we failed here, we might as well return and
		 * have a leak rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev, "unable to send "
				   "revoke receive buffer to netvsp\n");
			return;
		}
		net_device->recv_section_cnt = 0;
	}
}
static void netvsc_revoke_send_buf(struct hv_device *device,
				   struct netvsc_device *net_device,
				   struct net_device *ndev)
{
	struct nvsp_message *revoke_packet;
	int ret;

	/* Deal with the send buffer we may have setup.
	 * If we got a send section size, it means we received a
	 * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent
	 * NVSP_MSG1_TYPE_SEND_SEND_BUF msg) therefore, we need
	 * to send a revoke msg here
	 */
	if (net_device->send_section_cnt) {
		/* Send the revoke send buffer */
		revoke_packet = &net_device->revoke_packet;
		memset(revoke_packet, 0, sizeof(struct nvsp_message));

		revoke_packet->hdr.msg_type =
			NVSP_MSG1_TYPE_REVOKE_SEND_BUF;
		revoke_packet->msg.v1_msg.revoke_send_buf.id =
			NETVSC_SEND_BUFFER_ID;

		trace_nvsp_send(ndev, revoke_packet);

		ret = vmbus_sendpacket(device->channel,
				       revoke_packet,
				       sizeof(struct nvsp_message),
				       VMBUS_RQST_ID_NO_RESPONSE,
				       VM_PKT_DATA_INBAND, 0);

		/* If the failure is because the channel is rescinded,
		 * ignore the failure since we cannot send on a rescinded
		 * channel. This would allow us to properly cleanup
		 * even when the channel is rescinded.
		 */
		if (device->channel->rescind)
			ret = 0;

		/* If we failed here, we might as well return and
		 * have a leak rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev, "unable to send "
				   "revoke send buffer to netvsp\n");
			return;
		}
		net_device->send_section_cnt = 0;
	}
}
static void netvsc_teardown_recv_gpadl(struct hv_device *device,
				       struct netvsc_device *net_device,
				       struct net_device *ndev)
{
	int ret;

	if (net_device->recv_buf_gpadl_handle.gpadl_handle) {
		ret = vmbus_teardown_gpadl(device->channel,
					   &net_device->recv_buf_gpadl_handle);

		/* If we failed here, we might as well return and have a leak
		 * rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev,
				   "unable to teardown receive buffer's gpadl\n");
			return;
		}
	}
}
static void netvsc_teardown_send_gpadl(struct hv_device *device,
				       struct netvsc_device *net_device,
				       struct net_device *ndev)
{
	int ret;

	if (net_device->send_buf_gpadl_handle.gpadl_handle) {
		ret = vmbus_teardown_gpadl(device->channel,
					   &net_device->send_buf_gpadl_handle);

		/* If we failed here, we might as well return and have a leak
		 * rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev,
				   "unable to teardown send buffer's gpadl\n");
			return;
		}
	}
}
int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx)
{
	struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
	int node = cpu_to_node(nvchan->channel->target_cpu);
	size_t size;

	size = net_device->recv_completion_cnt * sizeof(struct recv_comp_data);
	nvchan->mrc.slots = vzalloc_node(size, node);
	if (!nvchan->mrc.slots)
		nvchan->mrc.slots = vzalloc(size);

	return nvchan->mrc.slots ? 0 : -ENOMEM;
}
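/* The completion ring holds recv_completion_cnt entries, which is
 * recv_section_cnt + 1 (see netvsc_init_buf()): one slot is always left
 * empty so that mrc.first == mrc.next unambiguously means "ring empty".
 * The allocation prefers the node of the channel's target CPU and falls
 * back to any node if that fails.
 */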
static int netvsc_init_buf(struct hv_device *device,
			   struct netvsc_device *net_device,
			   const struct netvsc_device_info *device_info)
{
	struct nvsp_1_message_send_receive_buffer_complete *resp;
	struct net_device *ndev = hv_get_drvdata(device);
	struct nvsp_message *init_packet;
	unsigned int buf_size;
	int i, ret = 0;

	/* Get receive buffer area. */
	buf_size = device_info->recv_sections * device_info->recv_section_size;
	buf_size = roundup(buf_size, PAGE_SIZE);

	/* Legacy hosts only allow smaller receive buffer */
	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
		buf_size = min_t(unsigned int, buf_size,
				 NETVSC_RECEIVE_BUFFER_SIZE_LEGACY);

	net_device->recv_buf = vzalloc(buf_size);
	if (!net_device->recv_buf) {
		netdev_err(ndev,
			   "unable to allocate receive buffer of size %u\n",
			   buf_size);
		ret = -ENOMEM;
		goto cleanup;
	}

	net_device->recv_buf_size = buf_size;

	/*
	 * Establish the gpadl handle for this buffer on this
	 * channel. Note: This call uses the vmbus connection rather
	 * than the channel to establish the gpadl handle.
	 */
	ret = vmbus_establish_gpadl(device->channel, net_device->recv_buf,
				    buf_size,
				    &net_device->recv_buf_gpadl_handle);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to establish receive buffer's gpadl\n");
		goto cleanup;
	}

	/* Notify the NetVsp of the gpadl handle */
	init_packet = &net_device->channel_init_pkt;
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_RECV_BUF;
	init_packet->msg.v1_msg.send_recv_buf.gpadl_handle =
		net_device->recv_buf_gpadl_handle.gpadl_handle;
	init_packet->msg.v1_msg.send_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;

	trace_nvsp_send(ndev, init_packet);

	/* Send the gpadl notification request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to send receive buffer's gpadl to netvsp\n");
		goto cleanup;
	}

	wait_for_completion(&net_device->channel_init_wait);

	/* Check the response */
	resp = &init_packet->msg.v1_msg.send_recv_buf_complete;
	if (resp->status != NVSP_STAT_SUCCESS) {
		netdev_err(ndev,
			   "Unable to complete receive buffer initialization with NetVsp - status %d\n",
			   resp->status);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Parse the response */
	netdev_dbg(ndev, "Receive sections: %u sub_allocs: size %u count: %u\n",
		   resp->num_sections, resp->sections[0].sub_alloc_size,
		   resp->sections[0].num_sub_allocs);

	/* There should only be one section for the entire receive buffer */
	if (resp->num_sections != 1 || resp->sections[0].offset != 0) {
		ret = -EINVAL;
		goto cleanup;
	}

	net_device->recv_section_size = resp->sections[0].sub_alloc_size;
	net_device->recv_section_cnt = resp->sections[0].num_sub_allocs;

	/* Ensure buffer will not overflow */
	if (net_device->recv_section_size < NETVSC_MTU_MIN ||
	    (u64)net_device->recv_section_size *
	    (u64)net_device->recv_section_cnt > (u64)buf_size) {
		netdev_err(ndev, "invalid recv_section_size %u\n",
			   net_device->recv_section_size);
		ret = -EINVAL;
		goto cleanup;
	}

	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		struct netvsc_channel *nvchan = &net_device->chan_table[i];

		nvchan->recv_buf = kzalloc(net_device->recv_section_size, GFP_KERNEL);
		if (nvchan->recv_buf == NULL) {
			ret = -ENOMEM;
			goto cleanup;
		}
	}

	/* Setup receive completion ring.
	 * Add 1 to the recv_section_cnt because at least one entry in a
	 * ring buffer has to be empty.
	 */
	net_device->recv_completion_cnt = net_device->recv_section_cnt + 1;
	ret = netvsc_alloc_recv_comp_ring(net_device, 0);
	if (ret)
		goto cleanup;

	/* Now setup the send buffer. */
	buf_size = device_info->send_sections * device_info->send_section_size;
	buf_size = round_up(buf_size, PAGE_SIZE);

	net_device->send_buf = vzalloc(buf_size);
	if (!net_device->send_buf) {
		netdev_err(ndev, "unable to allocate send buffer of size %u\n",
			   buf_size);
		ret = -ENOMEM;
		goto cleanup;
	}
	net_device->send_buf_size = buf_size;

	/* Establish the gpadl handle for this buffer on this
	 * channel. Note: This call uses the vmbus connection rather
	 * than the channel to establish the gpadl handle.
	 */
	ret = vmbus_establish_gpadl(device->channel, net_device->send_buf,
				    buf_size,
				    &net_device->send_buf_gpadl_handle);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to establish send buffer's gpadl\n");
		goto cleanup;
	}

	/* Notify the NetVsp of the gpadl handle */
	init_packet = &net_device->channel_init_pkt;
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_SEND_BUF;
	init_packet->msg.v1_msg.send_send_buf.gpadl_handle =
		net_device->send_buf_gpadl_handle.gpadl_handle;
	init_packet->msg.v1_msg.send_send_buf.id = NETVSC_SEND_BUFFER_ID;

	trace_nvsp_send(ndev, init_packet);

	/* Send the gpadl notification request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to send send buffer's gpadl to netvsp\n");
		goto cleanup;
	}

	wait_for_completion(&net_device->channel_init_wait);

	/* Check the response */
	if (init_packet->msg.v1_msg.
	    send_send_buf_complete.status != NVSP_STAT_SUCCESS) {
		netdev_err(ndev, "Unable to complete send buffer "
			   "initialization with NetVsp - status %d\n",
			   init_packet->msg.v1_msg.
			   send_send_buf_complete.status);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Parse the response */
	net_device->send_section_size = init_packet->msg.
		v1_msg.send_send_buf_complete.section_size;
	if (net_device->send_section_size < NETVSC_MTU_MIN) {
		netdev_err(ndev, "invalid send_section_size %u\n",
			   net_device->send_section_size);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Section count is simply the size divided by the section size. */
	net_device->send_section_cnt = buf_size / net_device->send_section_size;

	netdev_dbg(ndev, "Send section size: %d, Section count:%d\n",
		   net_device->send_section_size, net_device->send_section_cnt);

	/* Setup state for managing the send buffer. */
	net_device->send_section_map = bitmap_zalloc(net_device->send_section_cnt,
						     GFP_KERNEL);
	if (!net_device->send_section_map) {
		ret = -ENOMEM;
		goto cleanup;
	}

	return 0;

cleanup:
	netvsc_revoke_recv_buf(device, net_device, ndev);
	netvsc_revoke_send_buf(device, net_device, ndev);
	netvsc_teardown_recv_gpadl(device, net_device, ndev);
	netvsc_teardown_send_gpadl(device, net_device, ndev);

	return ret;
}
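/* netvsc_init_buf() thus sets up, in order: the receive buffer and its
 * GPADL, the per-channel receive staging buffers and completion ring, the
 * send buffer and its GPADL, and finally the send-section bitmap. Any
 * failure funnels through the cleanup label, which revokes and tears down
 * whatever was already established.
 */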
/* Negotiate NVSP protocol version */
static int negotiate_nvsp_ver(struct hv_device *device,
			      struct netvsc_device *net_device,
			      struct nvsp_message *init_packet,
			      u32 nvsp_ver)
{
	struct net_device *ndev = hv_get_drvdata(device);
	int ret;

	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG_TYPE_INIT;
	init_packet->msg.init_msg.init.min_protocol_ver = nvsp_ver;
	init_packet->msg.init_msg.init.max_protocol_ver = nvsp_ver;
	trace_nvsp_send(ndev, init_packet);

	/* Send the init request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret != 0)
		return ret;

	wait_for_completion(&net_device->channel_init_wait);

	if (init_packet->msg.init_msg.init_complete.status !=
	    NVSP_STAT_SUCCESS)
		return -EINVAL;

	if (nvsp_ver == NVSP_PROTOCOL_VERSION_1)
		return 0;

	/* NVSPv2 or later: Send NDIS config */
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG2_TYPE_SEND_NDIS_CONFIG;
	init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu + ETH_HLEN;
	init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;

	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
		if (hv_is_isolation_supported())
			netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n");
		else
			init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;

		/* Teaming bit is needed to receive link speed updates */
		init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
	}

	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61)
		init_packet->msg.v2_msg.send_ndis_config.capability.rsc = 1;

	trace_nvsp_send(ndev, init_packet);

	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       VMBUS_RQST_ID_NO_RESPONSE,
			       VM_PKT_DATA_INBAND, 0);

	return ret;
}
static int netvsc_connect_vsp(struct hv_device *device,
			      struct netvsc_device *net_device,
			      const struct netvsc_device_info *device_info)
{
	struct net_device *ndev = hv_get_drvdata(device);
	static const u32 ver_list[] = {
		NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
		NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
		NVSP_PROTOCOL_VERSION_6, NVSP_PROTOCOL_VERSION_61
	};
	struct nvsp_message *init_packet;
	int ndis_version, i, ret;

	init_packet = &net_device->channel_init_pkt;

	/* Negotiate the latest NVSP protocol supported */
	for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--)
		if (negotiate_nvsp_ver(device, net_device, init_packet,
				       ver_list[i]) == 0) {
			net_device->nvsp_version = ver_list[i];
			break;
		}

	if (i < 0) {
		ret = -EPROTO;
		goto cleanup;
	}

	if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
		netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
			   net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
		ret = -EPROTO;
		goto cleanup;
	}

	pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);

	/* Send the ndis version */
	memset(init_packet, 0, sizeof(struct nvsp_message));

	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_4)
		ndis_version = 0x00060001;
	else
		ndis_version = 0x0006001e;

	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_NDIS_VER;
	init_packet->msg.v1_msg.send_ndis_ver.ndis_major_ver =
		(ndis_version & 0xFFFF0000) >> 16;
	init_packet->msg.v1_msg.send_ndis_ver.ndis_minor_ver =
		ndis_version & 0xFFFF;

	trace_nvsp_send(ndev, init_packet);

	/* Send the init request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       VMBUS_RQST_ID_NO_RESPONSE,
			       VM_PKT_DATA_INBAND, 0);
	if (ret != 0)
		goto cleanup;

	ret = netvsc_init_buf(device, net_device, device_info);

cleanup:
	return ret;
}
/*
 * netvsc_device_remove - Callback when the root bus device is removed
 */
void netvsc_device_remove(struct hv_device *device)
{
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct netvsc_device *net_device
		= rtnl_dereference(net_device_ctx->nvdev);
	int i;

	/*
	 * Revoke receive buffer. If host is pre-Win2016 then tear down
	 * receive buffer GPADL. Do the same for send buffer.
	 */
	netvsc_revoke_recv_buf(device, net_device, ndev);
	if (vmbus_proto_version < VERSION_WIN10)
		netvsc_teardown_recv_gpadl(device, net_device, ndev);

	netvsc_revoke_send_buf(device, net_device, ndev);
	if (vmbus_proto_version < VERSION_WIN10)
		netvsc_teardown_send_gpadl(device, net_device, ndev);

	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);

	/* Disable NAPI and disassociate its context from the device. */
	for (i = 0; i < net_device->num_chn; i++) {
		/* See also vmbus_reset_channel_cb(). */
		/* only disable enabled NAPI channel */
		if (i < ndev->real_num_rx_queues) {
			netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_TX,
					     NULL);
			netif_queue_set_napi(ndev, i, NETDEV_QUEUE_TYPE_RX,
					     NULL);
			napi_disable(&net_device->chan_table[i].napi);
		}

		netif_napi_del(&net_device->chan_table[i].napi);
	}

	/*
	 * At this point, no one should be accessing net_device
	 * except in here.
	 */
	netdev_dbg(ndev, "net device safe to remove\n");

	/* Now, we can close the channel safely */
	vmbus_close(device->channel);

	/*
	 * If host is Win2016 or higher then we do the GPADL tear down
	 * here after VMBus is closed.
	 */
	if (vmbus_proto_version >= VERSION_WIN10) {
		netvsc_teardown_recv_gpadl(device, net_device, ndev);
		netvsc_teardown_send_gpadl(device, net_device, ndev);
	}

	/* Release all resources */
	free_netvsc_device_rcu(net_device);
}
#define RING_AVAIL_PERCENT_HIWATER 20
#define RING_AVAIL_PERCENT_LOWATER 10
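/* Transmit flow control based on VMBus ring occupancy: netvsc_send_pkt()
 * stops a tx queue when the available outbound ring space drops below
 * RING_AVAIL_PERCENT_LOWATER, and netvsc_send_tx_complete() wakes it again
 * once the space rises above RING_AVAIL_PERCENT_HIWATER or the queue has
 * drained.
 */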
static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
					 u32 index)
{
	sync_change_bit(index, net_device->send_section_map);
}
static void netvsc_send_tx_complete(struct net_device *ndev,
				    struct netvsc_device *net_device,
				    struct vmbus_channel *channel,
				    const struct vmpacket_descriptor *desc,
				    int budget)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct sk_buff *skb;
	u16 q_idx = 0;
	int queue_sends;
	u64 cmd_rqst;

	cmd_rqst = channel->request_addr_callback(channel, desc->trans_id);
	if (cmd_rqst == VMBUS_RQST_ERROR) {
		netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
		return;
	}

	skb = (struct sk_buff *)(unsigned long)cmd_rqst;

	/* Notify the layer above us */
	if (likely(skb)) {
		struct hv_netvsc_packet *packet
			= (struct hv_netvsc_packet *)skb->cb;
		u32 send_index = packet->send_buf_index;
		struct netvsc_stats_tx *tx_stats;

		if (send_index != NETVSC_INVALID_INDEX)
			netvsc_free_send_slot(net_device, send_index);
		q_idx = packet->q_idx;

		tx_stats = &net_device->chan_table[q_idx].tx_stats;

		u64_stats_update_begin(&tx_stats->syncp);
		tx_stats->packets += packet->total_packets;
		tx_stats->bytes += packet->total_bytes;
		u64_stats_update_end(&tx_stats->syncp);

		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
		napi_consume_skb(skb, budget);
	}

	queue_sends =
		atomic_dec_return(&net_device->chan_table[q_idx].queue_sends);

	if (unlikely(net_device->destroy)) {
		if (queue_sends == 0)
			wake_up(&net_device->wait_drain);
	} else {
		struct netdev_queue *txq = netdev_get_tx_queue(ndev, q_idx);

		if (netif_tx_queue_stopped(txq) && !net_device->tx_disable &&
		    (hv_get_avail_to_write_percent(&channel->outbound) >
		     RING_AVAIL_PERCENT_HIWATER || queue_sends < 1)) {
			netif_tx_wake_queue(txq);
			ndev_ctx->eth_stats.wake_queue++;
		}
	}
}
static void netvsc_send_completion(struct net_device *ndev,
				   struct netvsc_device *net_device,
				   struct vmbus_channel *incoming_channel,
				   const struct vmpacket_descriptor *desc,
				   int budget)
{
	const struct nvsp_message *nvsp_packet;
	u32 msglen = hv_pkt_datalen(desc);
	struct nvsp_message *pkt_rqst;
	u64 cmd_rqst;
	u32 status;

	/* First check if this is a VMBUS completion without data payload */
	if (!msglen) {
		cmd_rqst = incoming_channel->request_addr_callback(incoming_channel,
								   desc->trans_id);
		if (cmd_rqst == VMBUS_RQST_ERROR) {
			netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
			return;
		}

		pkt_rqst = (struct nvsp_message *)(uintptr_t)cmd_rqst;
		switch (pkt_rqst->hdr.msg_type) {
		case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
			complete(&net_device->channel_init_wait);
			break;

		default:
			netdev_err(ndev, "Unexpected VMBUS completion!!\n");
		}
		return;
	}

	/* Ensure packet is big enough to read header fields */
	if (msglen < sizeof(struct nvsp_message_header)) {
		netdev_err(ndev, "nvsp_message length too small: %u\n", msglen);
		return;
	}

	nvsp_packet = hv_pkt_data(desc);
	switch (nvsp_packet->hdr.msg_type) {
	case NVSP_MSG_TYPE_INIT_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_message_init_complete)) {
			netdev_err(ndev, "nvsp_msg length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_receive_buffer_complete)) {
			netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_send_buffer_complete)) {
			netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG5_TYPE_SUBCHANNEL:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_5_subchannel_complete)) {
			netdev_err(ndev, "nvsp_msg5 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_rndis_packet_complete)) {
			if (net_ratelimit())
				netdev_err(ndev, "nvsp_rndis_pkt_complete length too small: %u\n",
					   msglen);
			return;
		}

		/* If status indicates an error, output a message so we know
		 * there's a problem. But process the completion anyway so the
		 * resources are released.
		 */
		status = nvsp_packet->msg.v1_msg.send_rndis_pkt_complete.status;
		if (status != NVSP_STAT_SUCCESS && net_ratelimit())
			netdev_err(ndev, "nvsp_rndis_pkt_complete error status: %x\n",
				   status);

		netvsc_send_tx_complete(ndev, net_device, incoming_channel,
					desc, budget);
		return;

	default:
		netdev_err(ndev,
			   "Unknown send completion type %d received!!\n",
			   nvsp_packet->hdr.msg_type);
		return;
	}

	/* Copy the response back */
	memcpy(&net_device->channel_init_pkt, nvsp_packet,
	       sizeof(struct nvsp_message));
	complete(&net_device->channel_init_wait);
}
static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
{
	unsigned long *map_addr = net_device->send_section_map;
	unsigned int i;

	for_each_clear_bit(i, map_addr, net_device->send_section_cnt) {
		if (sync_test_and_set_bit(i, map_addr) == 0)
			return i;
	}

	return NETVSC_INVALID_INDEX;
}
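/* Send-buffer sections are tracked in send_section_map, one bit per section.
 * Allocation scans for a clear bit and claims it with an atomic
 * sync_test_and_set_bit(), so concurrent transmit queues can race for the
 * same slot safely; the loser simply keeps scanning. netvsc_free_send_slot()
 * clears the bit again when the transmit completes.
 */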
static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
				    unsigned int section_index,
				    u32 pend_size,
				    struct hv_netvsc_packet *packet,
				    struct rndis_message *rndis_msg,
				    struct hv_page_buffer *pb,
				    bool xmit_more)
{
	char *start = net_device->send_buf;
	char *dest = start + (section_index * net_device->send_section_size)
		     + pend_size;
	int i;
	u32 padding = 0;
	u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
		packet->page_buf_cnt;
	u32 remain;

	/* Add padding */
	remain = packet->total_data_buflen & (net_device->pkt_align - 1);
	if (xmit_more && remain) {
		padding = net_device->pkt_align - remain;
		rndis_msg->msg_len += padding;
		packet->total_data_buflen += padding;
	}

	for (i = 0; i < page_count; i++) {
		char *src = phys_to_virt(pb[i].pfn << HV_HYP_PAGE_SHIFT);
		u32 offset = pb[i].offset;
		u32 len = pb[i].len;

		memcpy(dest, (src + offset), len);
		dest += len;
	}

	if (padding)
		memset(dest, 0, padding);
}
void netvsc_dma_unmap(struct hv_device *hv_dev,
		      struct hv_netvsc_packet *packet)
{
	int i;

	if (!hv_is_isolation_supported())
		return;

	if (!packet->dma_range)
		return;

	for (i = 0; i < packet->page_buf_cnt; i++)
		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
				 packet->dma_range[i].mapping_size,
				 DMA_TO_DEVICE);

	kfree(packet->dma_range);
}
/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
 * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
 * VM.
 *
 * In isolation VM, netvsc send buffer has been marked visible to
 * host and so the data copied to send buffer doesn't need to use
 * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
 * may not be copied to send buffer and so these pages need to be
 * mapped with swiotlb bounce buffer. netvsc_dma_map() does that.
 * The pfns in the struct hv_page_buffer need to be converted to the
 * bounce buffer's pfn. The loop here is necessary because the
 * entries in the page buffer array are not necessarily full
 * pages of data. Each entry in the array has a separate offset and
 * len that may be non-zero, even for entries in the middle of the
 * array. And the entries are not physically contiguous. So each
 * entry must be individually mapped rather than as a contiguous unit,
 * which is why dma_map_sg() is not used here.
 */
static int netvsc_dma_map(struct hv_device *hv_dev,
			  struct hv_netvsc_packet *packet,
			  struct hv_page_buffer *pb)
{
	u32 page_count = packet->page_buf_cnt;
	dma_addr_t dma;
	int i;

	if (!hv_is_isolation_supported())
		return 0;

	packet->dma_range = kcalloc(page_count,
				    sizeof(*packet->dma_range),
				    GFP_ATOMIC);
	if (!packet->dma_range)
		return -ENOMEM;

	for (i = 0; i < page_count; i++) {
		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
					 + pb[i].offset);
		u32 len = pb[i].len;

		dma = dma_map_single(&hv_dev->device, src, len,
				     DMA_TO_DEVICE);
		if (dma_mapping_error(&hv_dev->device, dma)) {
			kfree(packet->dma_range);
			return -ENOMEM;
		}

		/* pb[].offset and pb[].len are not changed during dma mapping
		 * and so not reassign.
		 */
		packet->dma_range[i].dma = dma;
		packet->dma_range[i].mapping_size = len;
		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
	}

	return 0;
}
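/* Every successful netvsc_dma_map() is paired with a netvsc_dma_unmap(),
 * either from netvsc_send_tx_complete() on completion or from
 * netvsc_send_pkt() when vmbus_sendpacket_pagebuffer() fails; that releases
 * the bounce-buffer mappings and frees packet->dma_range.
 */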
static inline int netvsc_send_pkt(
	struct hv_device *device,
	struct hv_netvsc_packet *packet,
	struct netvsc_device *net_device,
	struct hv_page_buffer *pb,
	struct sk_buff *skb)
{
	struct nvsp_message nvmsg;
	struct nvsp_1_message_send_rndis_packet *rpkt =
		&nvmsg.msg.v1_msg.send_rndis_pkt;
	struct netvsc_channel * const nvchan =
		&net_device->chan_table[packet->q_idx];
	struct vmbus_channel *out_channel = nvchan->channel;
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
	u64 req_id;
	int ret;
	u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);

	memset(&nvmsg, 0, sizeof(struct nvsp_message));
	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
	if (skb)
		rpkt->channel_type = 0;		/* 0 is RMC_DATA */
	else
		rpkt->channel_type = 1;		/* 1 is RMC_CONTROL */

	rpkt->send_buf_section_index = packet->send_buf_index;
	if (packet->send_buf_index == NETVSC_INVALID_INDEX)
		rpkt->send_buf_section_size = 0;
	else
		rpkt->send_buf_section_size = packet->total_data_buflen;

	req_id = (ulong)skb;

	if (out_channel->rescind)
		return -ENODEV;

	trace_nvsp_send_pkt(ndev, out_channel, rpkt);

	packet->dma_range = NULL;
	if (packet->page_buf_cnt) {
		if (packet->cp_partial)
			pb += packet->rmsg_pgcnt;

		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
		if (ret) {
			ret = -EAGAIN;
			goto exit;
		}

		ret = vmbus_sendpacket_pagebuffer(out_channel,
						  pb, packet->page_buf_cnt,
						  &nvmsg, sizeof(nvmsg),
						  req_id);

		if (ret)
			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
	} else {
		ret = vmbus_sendpacket(out_channel,
				       &nvmsg, sizeof(nvmsg),
				       req_id, VM_PKT_DATA_INBAND,
				       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	}

exit:
	if (ret == 0) {
		atomic_inc_return(&nvchan->queue_sends);

		if (ring_avail < RING_AVAIL_PERCENT_LOWATER) {
			netif_tx_stop_queue(txq);
			ndev_ctx->eth_stats.stop_queue++;
		}
	} else if (ret == -EAGAIN) {
		netif_tx_stop_queue(txq);
		ndev_ctx->eth_stats.stop_queue++;
	} else {
		netdev_err(ndev,
			   "Unable to send packet pages %u len %u, ret %d\n",
			   packet->page_buf_cnt, packet->total_data_buflen,
			   ret);
	}

	if (netif_tx_queue_stopped(txq) &&
	    atomic_read(&nvchan->queue_sends) < 1 &&
	    !net_device->tx_disable) {
		netif_tx_wake_queue(txq);
		ndev_ctx->eth_stats.wake_queue++;
		if (ret == -EAGAIN)
			ret = -ENOSPC;
	}

	return ret;
}
/* Move packet out of multi send data (msd), and clear msd */
static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
				struct sk_buff **msd_skb,
				struct multi_send_data *msdp)
{
	*msd_skb = msdp->skb;
	*msd_send = msdp->pkt;
	msdp->skb = NULL;
	msdp->pkt = NULL;
	msdp->count = 0;
}
/* RCU already held by caller */
/* Batching/bouncing logic is designed to attempt to optimize
 * performance.
 *
 * For small, non-LSO packets we copy the packet to a send buffer
 * which is pre-registered with the Hyper-V side. This enables the
 * hypervisor to avoid remapping the aperture to access the packet
 * descriptor and data.
 *
 * If we already started using a buffer and the netdev is transmitting
 * a burst of packets, keep on copying into the buffer until it is
 * full or we are done collecting a burst. If there is an existing
 * buffer with space for the RNDIS descriptor but not the packet, copy
 * the RNDIS descriptor to the buffer, keeping the packet in place.
 *
 * If we do batching and send more than one packet using a single
 * NetVSC message, free the SKBs of the packets copied, except for the
 * last packet. This is done to streamline the handling of the case
 * where the last packet only had the RNDIS descriptor copied to the
 * send buffer, with the data pointers included in the NetVSC message.
 */
int netvsc_send(struct net_device *ndev,
		struct hv_netvsc_packet *packet,
		struct rndis_message *rndis_msg,
		struct hv_page_buffer *pb,
		struct sk_buff *skb,
		bool xdp_tx)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_device *net_device
		= rcu_dereference_bh(ndev_ctx->nvdev);
	struct hv_device *device = ndev_ctx->device_ctx;
	int ret = 0;
	struct netvsc_channel *nvchan;
	u32 pktlen = packet->total_data_buflen, msd_len = 0;
	unsigned int section_index = NETVSC_INVALID_INDEX;
	struct multi_send_data *msdp;
	struct hv_netvsc_packet *msd_send = NULL, *cur_send = NULL;
	struct sk_buff *msd_skb = NULL;
	bool try_batch, xmit_more;

	/* If device is rescinded, return error and packet will get dropped. */
	if (unlikely(!net_device || net_device->destroy))
		return -ENODEV;

	nvchan = &net_device->chan_table[packet->q_idx];
	packet->send_buf_index = NETVSC_INVALID_INDEX;
	packet->cp_partial = false;

	/* Send a control message or XDP packet directly without accessing
	 * msd (Multi-Send Data) field which may be changed during data packet
	 * processing.
	 */
	if (!skb || xdp_tx)
		return netvsc_send_pkt(device, packet, net_device, pb, skb);

	/* batch packets in send buffer if possible */
	msdp = &nvchan->msd;
	if (msdp->pkt)
		msd_len = msdp->pkt->total_data_buflen;

	try_batch = msd_len > 0 && msdp->count < net_device->max_pkt;
	if (try_batch && msd_len + pktlen + net_device->pkt_align <
	    net_device->send_section_size) {
		section_index = msdp->pkt->send_buf_index;

	} else if (try_batch && msd_len + packet->rmsg_size <
		   net_device->send_section_size) {
		section_index = msdp->pkt->send_buf_index;
		packet->cp_partial = true;

	} else if (pktlen + net_device->pkt_align <
		   net_device->send_section_size) {
		section_index = netvsc_get_next_send_section(net_device);
		if (unlikely(section_index == NETVSC_INVALID_INDEX)) {
			++ndev_ctx->eth_stats.tx_send_full;
		} else {
			move_pkt_msd(&msd_send, &msd_skb, msdp);
			msd_len = 0;
		}
	}

	/* Keep aggregating only if stack says more data is coming
	 * and not doing mixed modes send and not flow blocked
	 */
	xmit_more = netdev_xmit_more() &&
		!packet->cp_partial &&
		!netif_xmit_stopped(netdev_get_tx_queue(ndev, packet->q_idx));

	if (section_index != NETVSC_INVALID_INDEX) {
		netvsc_copy_to_send_buf(net_device,
					section_index, msd_len,
					packet, rndis_msg, pb, xmit_more);

		packet->send_buf_index = section_index;

		if (packet->cp_partial) {
			packet->page_buf_cnt -= packet->rmsg_pgcnt;
			packet->total_data_buflen = msd_len + packet->rmsg_size;
		} else {
			packet->page_buf_cnt = 0;
			packet->total_data_buflen += msd_len;
		}

		if (msdp->pkt) {
			packet->total_packets += msdp->pkt->total_packets;
			packet->total_bytes += msdp->pkt->total_bytes;
		}

		if (msdp->skb)
			dev_consume_skb_any(msdp->skb);

		if (xmit_more) {
			msdp->skb = skb;
			msdp->pkt = packet;
			msdp->count++;
		} else {
			cur_send = packet;
			msdp->skb = NULL;
			msdp->pkt = NULL;
			msdp->count = 0;
		}
	} else {
		move_pkt_msd(&msd_send, &msd_skb, msdp);
		cur_send = packet;
	}

	if (msd_send) {
		int m_ret = netvsc_send_pkt(device, msd_send, net_device,
					    NULL, msd_skb);

		if (m_ret != 0) {
			netvsc_free_send_slot(net_device,
					      msd_send->send_buf_index);
			dev_kfree_skb_any(msd_skb);
		}
	}

	if (cur_send)
		ret = netvsc_send_pkt(device, cur_send, net_device, pb, skb);

	if (ret != 0 && section_index != NETVSC_INVALID_INDEX)
		netvsc_free_send_slot(net_device, section_index);

	return ret;
}
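/* Three outcomes are possible in netvsc_send(): the packet is appended to
 * the current send-buffer section (full copy), only its RNDIS descriptor is
 * copied while the data stays in place (cp_partial), or a fresh section is
 * claimed. Any previously batched packet (msd_send) is flushed first, and
 * the current packet either joins the batch (xmit_more) or is sent
 * immediately; on failure the claimed section is returned to the bitmap.
 */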
/* Send pending recv completions */
static int send_recv_completions(struct net_device *ndev,
				 struct netvsc_device *nvdev,
				 struct netvsc_channel *nvchan)
{
	struct multi_recv_comp *mrc = &nvchan->mrc;
	struct recv_comp_msg {
		struct nvsp_message_header hdr;
		u32 status;
	} __packed;
	struct recv_comp_msg msg = {
		.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE,
	};
	int ret;

	while (mrc->first != mrc->next) {
		const struct recv_comp_data *rcd
			= mrc->slots + mrc->first;

		msg.status = rcd->status;
		ret = vmbus_sendpacket(nvchan->channel, &msg, sizeof(msg),
				       rcd->tid, VM_PKT_COMP, 0);
		if (unlikely(ret)) {
			struct net_device_context *ndev_ctx = netdev_priv(ndev);

			++ndev_ctx->eth_stats.rx_comp_busy;
			return ret;
		}

		if (++mrc->first == nvdev->recv_completion_cnt)
			mrc->first = 0;
	}

	/* receive completion ring has been emptied */
	if (unlikely(nvdev->destroy))
		wake_up(&nvdev->wait_drain);

	return 0;
}
1352 static void recv_comp_slot_avail(const struct netvsc_device
*nvdev
,
1353 const struct multi_recv_comp
*mrc
,
1354 u32
*filled
, u32
*avail
)
1356 u32 count
= nvdev
->recv_completion_cnt
;
1358 if (mrc
->next
>= mrc
->first
)
1359 *filled
= mrc
->next
- mrc
->first
;
1361 *filled
= (count
- mrc
->first
) + mrc
->next
;
1363 *avail
= count
- *filled
- 1;
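/* mrc.first and mrc.next are classic ring-buffer indices: first is where
 * send_recv_completions() drains, next is where enq_receive_complete()
 * fills, and the "- 1" above reserves one slot so that a full ring is never
 * confused with an empty one.
 */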
/* Add receive complete to ring to send to host. */
static void enq_receive_complete(struct net_device *ndev,
				 struct netvsc_device *nvdev, u16 q_idx,
				 u64 tid, u32 status)
{
	struct netvsc_channel *nvchan = &nvdev->chan_table[q_idx];
	struct multi_recv_comp *mrc = &nvchan->mrc;
	struct recv_comp_data *rcd;
	u32 filled, avail;

	recv_comp_slot_avail(nvdev, mrc, &filled, &avail);

	if (unlikely(filled > NAPI_POLL_WEIGHT)) {
		send_recv_completions(ndev, nvdev, nvchan);
		recv_comp_slot_avail(nvdev, mrc, &filled, &avail);
	}

	if (unlikely(!avail)) {
		netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n",
			   q_idx, tid);
		return;
	}

	rcd = mrc->slots + mrc->next;
	rcd->tid = tid;
	rcd->status = status;

	if (++mrc->next == nvdev->recv_completion_cnt)
		mrc->next = 0;
}
static int netvsc_receive(struct net_device *ndev,
			  struct netvsc_device *net_device,
			  struct netvsc_channel *nvchan,
			  const struct vmpacket_descriptor *desc)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct vmbus_channel *channel = nvchan->channel;
	const struct vmtransfer_page_packet_header *vmxferpage_packet
		= container_of(desc, const struct vmtransfer_page_packet_header, d);
	const struct nvsp_message *nvsp = hv_pkt_data(desc);
	u32 msglen = hv_pkt_datalen(desc);
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	char *recv_buf = net_device->recv_buf;
	u32 status = NVSP_STAT_SUCCESS;
	int i;
	int count = 0;

	/* Ensure packet is big enough to read header fields */
	if (msglen < sizeof(struct nvsp_message_header)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "invalid nvsp header, length too small: %u\n",
			  msglen);
		return 0;
	}

	/* Make sure this is a valid nvsp packet */
	if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Unknown nvsp packet type received %u\n",
			  nvsp->hdr.msg_type);
		return 0;
	}

	/* Validate xfer page pkt header */
	if ((desc->offset8 << 3) < sizeof(struct vmtransfer_page_packet_header)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Invalid xfer page pkt, offset too small: %u\n",
			  desc->offset8 << 3);
		return 0;
	}

	if (unlikely(vmxferpage_packet->xfer_pageset_id != NETVSC_RECEIVE_BUFFER_ID)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Invalid xfer page set id - expecting %x got %x\n",
			  NETVSC_RECEIVE_BUFFER_ID,
			  vmxferpage_packet->xfer_pageset_id);
		return 0;
	}

	count = vmxferpage_packet->range_cnt;

	/* Check count for a valid value */
	if (NETVSC_XFER_HEADER_SIZE(count) > desc->offset8 << 3) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Range count is not valid: %d\n",
			  count);
		return 0;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
	for (i = 0; i < count; i++) {
		u32 offset = vmxferpage_packet->ranges[i].byte_offset;
		u32 buflen = vmxferpage_packet->ranges[i].byte_count;
		void *data;
		int ret;

		if (unlikely(offset > net_device->recv_buf_size ||
			     buflen > net_device->recv_buf_size - offset)) {
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
			netif_err(net_device_ctx, rx_err, ndev,
				  "Packet offset:%u + len:%u too big\n",
				  offset, buflen);

			continue;
		}

		/* We're going to copy (sections of) the packet into nvchan->recv_buf;
		 * make sure that nvchan->recv_buf is large enough to hold the packet.
		 */
		if (unlikely(buflen > net_device->recv_section_size)) {
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
			netif_err(net_device_ctx, rx_err, ndev,
				  "Packet too big: buflen=%u recv_section_size=%u\n",
				  buflen, net_device->recv_section_size);

			continue;
		}

		data = recv_buf + offset;

		nvchan->rsc.is_last = (i == count - 1);

		trace_rndis_recv(ndev, q_idx, data);

		/* Pass it to the upper layer */
		ret = rndis_filter_receive(ndev, net_device,
					   nvchan, data, buflen);

		if (unlikely(ret != NVSP_STAT_SUCCESS)) {
			/* Drop incomplete packet */
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
		}
	}

	enq_receive_complete(ndev, net_device, q_idx,
			     vmxferpage_packet->d.trans_id, status);

	return count;
}
static void netvsc_send_table(struct net_device *ndev,
			      struct netvsc_device *nvscdev,
			      const struct nvsp_message *nvmsg,
			      u32 msglen)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	u32 count, offset, *tab;
	int i;

	/* Ensure packet is big enough to read send_table fields */
	if (msglen < sizeof(struct nvsp_message_header) +
		     sizeof(struct nvsp_5_send_indirect_table)) {
		netdev_err(ndev, "nvsp_v5_msg length too small: %u\n", msglen);
		return;
	}

	count = nvmsg->msg.v5_msg.send_table.count;
	offset = nvmsg->msg.v5_msg.send_table.offset;

	if (count != VRSS_SEND_TAB_SIZE) {
		netdev_err(ndev, "Received wrong send-table size:%u\n", count);
		return;
	}

	/* If negotiated version <= NVSP_PROTOCOL_VERSION_6, the offset may be
	 * wrong due to a host bug. So fix the offset here.
	 */
	if (nvscdev->nvsp_version <= NVSP_PROTOCOL_VERSION_6 &&
	    msglen >= sizeof(struct nvsp_message_header) +
	    sizeof(union nvsp_6_message_uber) + count * sizeof(u32))
		offset = sizeof(struct nvsp_message_header) +
			 sizeof(union nvsp_6_message_uber);

	/* Boundary check for all versions */
	if (msglen < count * sizeof(u32) || offset > msglen - count * sizeof(u32)) {
		netdev_err(ndev, "Received send-table offset too big:%u\n",
			   offset);
		return;
	}

	tab = (void *)nvmsg + offset;

	for (i = 0; i < count; i++)
		net_device_ctx->tx_table[i] = tab[i];
}
static void netvsc_send_vf(struct net_device *ndev,
			   const struct nvsp_message *nvmsg,
			   u32 msglen)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);

	/* Ensure packet is big enough to read its fields */
	if (msglen < sizeof(struct nvsp_message_header) +
		     sizeof(struct nvsp_4_send_vf_association)) {
		netdev_err(ndev, "nvsp_v4_msg length too small: %u\n", msglen);
		return;
	}

	net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
	net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;

	if (net_device_ctx->vf_alloc)
		complete(&net_device_ctx->vf_add);

	netdev_info(ndev, "VF slot %u %s\n",
		    net_device_ctx->vf_serial,
		    net_device_ctx->vf_alloc ? "added" : "removed");
}
1581 struct netvsc_device
*nvscdev
,
1582 const struct vmpacket_descriptor
*desc
)
1584 const struct nvsp_message
*nvmsg
= hv_pkt_data(desc
);
1585 u32 msglen
= hv_pkt_datalen(desc
);
1587 /* Ensure packet is big enough to read header fields */
1588 if (msglen
< sizeof(struct nvsp_message_header
)) {
1589 netdev_err(ndev
, "inband nvsp_message length too small: %u\n", msglen
);
1593 switch (nvmsg
->hdr
.msg_type
) {
1594 case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE
:
1595 netvsc_send_table(ndev
, nvscdev
, nvmsg
, msglen
);
1598 case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION
:
1599 if (hv_is_isolation_supported())
1600 netdev_err(ndev
, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
1602 netvsc_send_vf(ndev
, nvmsg
, msglen
);
static int netvsc_process_raw_pkt(struct hv_device *device,
				  struct netvsc_channel *nvchan,
				  struct netvsc_device *net_device,
				  struct net_device *ndev,
				  const struct vmpacket_descriptor *desc,
				  int budget)
{
	struct vmbus_channel *channel = nvchan->channel;
	const struct nvsp_message *nvmsg = hv_pkt_data(desc);

	trace_nvsp_recv(ndev, channel, nvmsg);

	switch (desc->type) {
	case VM_PKT_COMP:
		netvsc_send_completion(ndev, net_device, channel, desc, budget);
		break;

	case VM_PKT_DATA_USING_XFER_PAGES:
		return netvsc_receive(ndev, net_device, nvchan, desc);

	case VM_PKT_DATA_INBAND:
		netvsc_receive_inband(ndev, net_device, desc);
		break;

	default:
		netdev_err(ndev, "unhandled packet type %d, tid %llx\n",
			   desc->type, desc->trans_id);
		break;
	}

	return 0;
}
*netvsc_channel_to_device(struct vmbus_channel
*channel
)
1642 struct vmbus_channel
*primary
= channel
->primary_channel
;
1644 return primary
? primary
->device_obj
: channel
->device_obj
;
/* Network processing softirq
 * Process data in incoming ring buffer from host
 * Stops when ring is empty or budget is met or exceeded.
 */
int netvsc_poll(struct napi_struct *napi, int budget)
{
	struct netvsc_channel *nvchan
		= container_of(napi, struct netvsc_channel, napi);
	struct netvsc_device *net_device = nvchan->net_device;
	struct vmbus_channel *channel = nvchan->channel;
	struct hv_device *device = netvsc_channel_to_device(channel);
	struct net_device *ndev = hv_get_drvdata(device);
	int work_done = 0;
	int ret;

	/* If starting a new interval */
	if (!nvchan->desc)
		nvchan->desc = hv_pkt_iter_first(channel);

	nvchan->xdp_flush = false;

	while (nvchan->desc && work_done < budget) {
		work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
						    ndev, nvchan->desc, budget);
		nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
	}

	if (nvchan->xdp_flush)
		xdp_do_flush();

	/* Send any pending receive completions */
	ret = send_recv_completions(ndev, net_device, nvchan);

	/* If it did not exhaust NAPI budget this time
	 * and not doing busy poll
	 * then re-enable host interrupts
	 * and reschedule if ring is not empty
	 * or sending receive completion failed.
	 */
	if (work_done < budget &&
	    napi_complete_done(napi, work_done) &&
	    (ret || hv_end_read(&channel->inbound)) &&
	    napi_schedule_prep(napi)) {
		hv_begin_read(&channel->inbound);
		__napi_schedule(napi);
	}

	/* Driver may overshoot since multiple packets per descriptor */
	return min(work_done, budget);
}
/* Call back when data is available in host ring buffer.
 * Processing is deferred until network softirq (NAPI)
 */
void netvsc_channel_cb(void *context)
{
	struct netvsc_channel *nvchan = context;
	struct vmbus_channel *channel = nvchan->channel;
	struct hv_ring_buffer_info *rbi = &channel->inbound;

	/* preload first vmpacket descriptor */
	prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);

	if (napi_schedule_prep(&nvchan->napi)) {
		/* disable interrupts from host */
		hv_begin_read(rbi);

		__napi_schedule_irqoff(&nvchan->napi);
	}
}
/*
 * netvsc_device_add - Callback when the device belonging to this
 * driver is added
 */
struct netvsc_device *netvsc_device_add(struct hv_device *device,
				const struct netvsc_device_info *device_info)
{
	int i, ret = 0;
	struct netvsc_device *net_device;
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *net_device_ctx = netdev_priv(ndev);

	net_device = alloc_net_device();
	if (!net_device)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
		net_device_ctx->tx_table[i] = 0;

	/* Because the device uses NAPI, all the interrupt batching and
	 * control is done via Net softirq, not the channel handling
	 */
	set_channel_read_mode(device->channel, HV_CALL_ISR);

	/* If we're reopening the device we may have multiple queues, fill the
	 * chn_table with the default channel to use it before subchannels are
	 * opened.
	 * Initialize the channel state before we open;
	 * we can be interrupted as soon as we open the channel.
	 */
	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		struct netvsc_channel *nvchan = &net_device->chan_table[i];

		nvchan->channel = device->channel;
		nvchan->net_device = net_device;
		u64_stats_init(&nvchan->tx_stats.syncp);
		u64_stats_init(&nvchan->rx_stats.syncp);

		ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0);

		if (ret) {
			netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);
			goto cleanup2;
		}

		ret = xdp_rxq_info_reg_mem_model(&nvchan->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);

		if (ret) {
			netdev_err(ndev, "xdp reg_mem_model fail: %d\n", ret);
			goto cleanup2;
		}
	}

	/* Enable NAPI handler before init callbacks */
	netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll);

	/* Open the channel */
	device->channel->next_request_id_callback = vmbus_next_request_id;
	device->channel->request_addr_callback = vmbus_request_addr;
	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;

	ret = vmbus_open(device->channel, netvsc_ring_bytes,
			 netvsc_ring_bytes, NULL, 0,
			 netvsc_channel_cb, net_device->chan_table);

	if (ret != 0) {
		netdev_err(ndev, "unable to open channel: %d\n", ret);
		goto cleanup;
	}

	/* Channel is opened */
	netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");

	napi_enable(&net_device->chan_table[0].napi);
	netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX,
			     &net_device->chan_table[0].napi);
	netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX,
			     &net_device->chan_table[0].napi);

	/* Connect with the NetVsp */
	ret = netvsc_connect_vsp(device, net_device, device_info);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to connect to NetVSP - %d\n", ret);
		goto close;
	}

	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
	 * populated.
	 */
	rcu_assign_pointer(net_device_ctx->nvdev, net_device);

	return net_device;

close:
	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
	netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_TX, NULL);
	netif_queue_set_napi(ndev, 0, NETDEV_QUEUE_TYPE_RX, NULL);
	napi_disable(&net_device->chan_table[0].napi);

	/* Now, we can close the channel safely */
	vmbus_close(device->channel);

cleanup:
	netif_napi_del(&net_device->chan_table[0].napi);

cleanup2:
	free_netvsc_device(&net_device->rcu);

	return ERR_PTR(ret);
}