2 * Network-device interface management.
4 * Copyright (c) 2004-2005, Keir Fraser
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License version 2
8 * as published by the Free Software Foundation; or, when distributed
9 * separately from the Linux kernel or incorporated into other
10 * software packages, subject to the following license:
12 * Permission is hereby granted, free of charge, to any person obtaining a copy
13 * of this source file (the "Software"), to deal in the Software without
14 * restriction, including without limitation the rights to use, copy, modify,
15 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16 * and to permit persons to whom the Software is furnished to do so, subject to
17 * the following conditions:
19 * The above copyright notice and this permission notice shall be included in
20 * all copies or substantial portions of the Software.
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
33 #include <linux/kthread.h>
34 #include <linux/ethtool.h>
35 #include <linux/rtnetlink.h>
36 #include <linux/if_vlan.h>
37 #include <linux/vmalloc.h>
39 #include <xen/events.h>
40 #include <asm/xen/hypercall.h>
41 #include <xen/balloon.h>
/* Sizing constants for the backend interface.
 * XENVIF_QUEUE_LENGTH is the value xenvif_alloc() stores in
 * dev->tx_queue_len. The use of XENVIF_NAPI_WEIGHT is not visible in
 * this listing. XENVIF_RX_QUEUE_BYTES is half of XEN_NETIF_RX_RING_SIZE
 * multiplied by PAGE_SIZE and becomes queue->rx_queue_max in
 * xenvif_init_queue().
 */
43 #define XENVIF_QUEUE_LENGTH 32
44 #define XENVIF_NAPI_WEIGHT 64
46 /* Number of bytes allowed on the internal guest Rx queue. */
47 #define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE)
/* xenvif_skb_zerocopy_prepare - flag an skb as zerocopy and account for it.
 * queue: queue whose inflight_packets counter is incremented.
 *
 * ORs SKBTX_DEV_ZEROCOPY into the tx_flags of the skb's shared info and
 * then atomically increments queue->inflight_packets.
 * NOTE(review): the declaration of the skb parameter and the function
 * braces are not visible in this listing - confirm against the header.
 */
49 /* This function is used to set SKBTX_DEV_ZEROCOPY as well as
50 * increasing the inflight counter. We need to increase the inflight
51 * counter because core driver calls into xenvif_zerocopy_callback
52 * which calls xenvif_skb_zerocopy_complete.
54 void xenvif_skb_zerocopy_prepare(struct xenvif_queue
*queue
,
57 skb_shinfo(skb
)->tx_flags
|= SKBTX_DEV_ZEROCOPY
;
58 atomic_inc(&queue
->inflight_packets
);
/* xenvif_skb_zerocopy_complete - drop the accounting for one zerocopy skb.
 * queue: queue whose inflight_packets counter is decremented.
 *
 * Atomically decrements queue->inflight_packets and afterwards wakes
 * any waiter on queue->dealloc_wq. The ordering of the two steps is
 * deliberate; the in-body comment gives the reason.
 */
61 void xenvif_skb_zerocopy_complete(struct xenvif_queue
*queue
)
63 atomic_dec(&queue
->inflight_packets
);
65 /* Wake the dealloc thread _after_ decrementing inflight_packets so
66 * that if kthread_stop() has already been called, the dealloc thread
67 * does not wait forever with nothing to wake it.
69 wake_up(&queue
->dealloc_wq
);
/* xenvif_schedulable - test whether the interface can currently do work.
 * vif: interface to test.
 *
 * The visible conditions are that the net device is running and that
 * VIF_STATUS_CONNECTED is set in vif->status.
 * NOTE(review): the expression ends in a trailing "&&" here, so at least
 * one further condition is missing from this listing.
 */
72 int xenvif_schedulable(struct xenvif
*vif
)
74 return netif_running(vif
->dev
) &&
75 test_bit(VIF_STATUS_CONNECTED
, &vif
->status
) &&
/* xenvif_tx_interrupt - interrupt handler used for the Tx event channel.
 * irq: interrupt number; not referenced by the visible code.
 * dev_id: the struct xenvif_queue the handler was bound with.
 *
 * Schedules the queue's NAPI instance when the Tx ring reports
 * unconsumed requests. The return statement is not visible in this
 * listing.
 */
79 static irqreturn_t
xenvif_tx_interrupt(int irq
, void *dev_id
)
81 struct xenvif_queue
*queue
= dev_id
;
83 if (RING_HAS_UNCONSUMED_REQUESTS(&queue
->tx
))
84 napi_schedule(&queue
->napi
);
/* xenvif_poll - NAPI poll callback for one queue.
 * napi: NAPI context embedded in a struct xenvif_queue.
 * budget: handed to xenvif_tx_action() and compared with its result.
 *
 * Recovers the owning queue with container_of(). A vif marked disabled
 * takes a dedicated branch whose body is not visible in this listing.
 * Otherwise Tx work is run through xenvif_tx_action(); when that
 * returns less than the budget,
 * xenvif_napi_schedule_or_enable_events() is called for the queue.
 * NOTE(review): the declaration of work_done and the return statements
 * are not visible here.
 */
89 static int xenvif_poll(struct napi_struct
*napi
, int budget
)
91 struct xenvif_queue
*queue
=
92 container_of(napi
, struct xenvif_queue
, napi
);
95 /* This vif is rogue, we pretend there is nothing to do
96 * for this vif to deschedule it from NAPI. But this interface
97 * will be turned off in thread context later.
99 if (unlikely(queue
->vif
->disabled
)) {
104 work_done
= xenvif_tx_action(queue
, budget
);
106 if (work_done
< budget
) {
108 xenvif_napi_schedule_or_enable_events(queue
);
/* xenvif_rx_interrupt - interrupt handler used for the Rx event channel.
 * irq: interrupt number; not referenced by the visible code.
 * dev_id: the struct xenvif_queue the handler was bound with.
 *
 * Kicks the queue's thread through xenvif_kick_thread(). The return
 * statement is not visible in this listing.
 */
114 static irqreturn_t
xenvif_rx_interrupt(int irq
, void *dev_id
)
116 struct xenvif_queue
*queue
= dev_id
;
118 xenvif_kick_thread(queue
);
/* xenvif_interrupt - combined handler for a shared Tx/Rx event channel.
 * irq, dev_id: forwarded unchanged to both per-direction handlers.
 *
 * Runs xenvif_tx_interrupt() followed by xenvif_rx_interrupt().
 * xenvif_connect() installs this handler when tx_evtchn == rx_evtchn.
 * The return statement is not visible in this listing.
 */
123 irqreturn_t
xenvif_interrupt(int irq
, void *dev_id
)
125 xenvif_tx_interrupt(irq
, dev_id
);
126 xenvif_rx_interrupt(irq
, dev_id
);
/* xenvif_queue_stopped - report whether the netdev Tx queue is stopped.
 * queue: backend queue; queue->id selects the netdev Tx queue.
 *
 * Returns the result of netif_tx_queue_stopped() for the Tx queue of
 * queue->vif->dev that has index queue->id.
 */
131 int xenvif_queue_stopped(struct xenvif_queue
*queue
)
133 struct net_device
*dev
= queue
->vif
->dev
;
134 unsigned int id
= queue
->id
;
135 return netif_tx_queue_stopped(netdev_get_tx_queue(dev
, id
));
/* xenvif_wake_queue - wake the netdev Tx queue that backs this queue.
 * queue: backend queue; queue->id selects the netdev Tx queue.
 *
 * Calls netif_tx_wake_queue() on the Tx queue of queue->vif->dev that
 * has index queue->id.
 */
138 void xenvif_wake_queue(struct xenvif_queue
*queue
)
140 struct net_device
*dev
= queue
->vif
->dev
;
141 unsigned int id
= queue
->id
;
142 netif_tx_wake_queue(netdev_get_tx_queue(dev
, id
));
/* xenvif_start_xmit - ndo_start_xmit hook: queue an skb for the guest.
 * skb: packet to deliver; skb->dev must equal dev (BUG_ON otherwise).
 * dev: net device whose private data is the struct xenvif.
 *
 * Visible flow:
 *  - the queue index comes from skb_get_queue_mapping(); an index that
 *    is >= num_queues triggers a rate-limited warning;
 *  - the packet is not accepted when the queue has no Rx task or no
 *    dealloc task, or when the vif is not schedulable;
 *  - with multicast_control set, PACKET_MULTICAST frames are checked
 *    against xenvif_mcast_match() on the Ethernet destination;
 *  - the skb's control block gets expires = jiffies + drain_timeout,
 *    the skb is appended with xenvif_rx_queue_tail() and the queue's
 *    thread is kicked;
 *  - a separate path increments dev->stats.tx_dropped.
 *
 * NOTE(review): the warning's format string ends in "\n." so the full
 * stop is printed after the newline; ".\n" looks intended.
 * NOTE(review): the declaration of index, the branch bodies, the labels
 * and the return statements are not visible in this listing.
 */
145 static int xenvif_start_xmit(struct sk_buff
*skb
, struct net_device
*dev
)
147 struct xenvif
*vif
= netdev_priv(dev
);
148 struct xenvif_queue
*queue
= NULL
;
149 unsigned int num_queues
= vif
->num_queues
;
151 struct xenvif_rx_cb
*cb
;
153 BUG_ON(skb
->dev
!= dev
);
155 /* Drop the packet if queues are not set up */
159 /* Obtain the queue to be used to transmit this packet */
160 index
= skb_get_queue_mapping(skb
);
161 if (index
>= num_queues
) {
162 pr_warn_ratelimited("Invalid queue %hu for packet on interface %s\n.",
163 index
, vif
->dev
->name
);
166 queue
= &vif
->queues
[index
];
168 /* Drop the packet if queue is not ready */
169 if (queue
->task
== NULL
||
170 queue
->dealloc_task
== NULL
||
171 !xenvif_schedulable(vif
))
174 if (vif
->multicast_control
&& skb
->pkt_type
== PACKET_MULTICAST
) {
175 struct ethhdr
*eth
= (struct ethhdr
*)skb
->data
;
177 if (!xenvif_mcast_match(vif
, eth
->h_dest
))
181 cb
= XENVIF_RX_CB(skb
);
182 cb
->expires
= jiffies
+ vif
->drain_timeout
;
184 xenvif_rx_queue_tail(queue
, skb
);
185 xenvif_kick_thread(queue
);
190 vif
->dev
->stats
.tx_dropped
++;
/* xenvif_get_stats - ndo_get_stats hook: aggregate per-queue counters.
 * dev: net device whose private data is the struct xenvif.
 *
 * Sums rx_bytes, rx_packets, tx_bytes and tx_packets over the first
 * vif->num_queues entries of vif->queues, stores the totals in
 * vif->dev->stats and returns a pointer to that structure.
 * A NULL vif->queues is tested first; the action taken for it and the
 * declaration of index are not visible in this listing.
 */
195 static struct net_device_stats
*xenvif_get_stats(struct net_device
*dev
)
197 struct xenvif
*vif
= netdev_priv(dev
);
198 struct xenvif_queue
*queue
= NULL
;
199 unsigned int num_queues
= vif
->num_queues
;
200 unsigned long rx_bytes
= 0;
201 unsigned long rx_packets
= 0;
202 unsigned long tx_bytes
= 0;
203 unsigned long tx_packets
= 0;
206 if (vif
->queues
== NULL
)
209 /* Aggregate tx and rx stats from each queue */
210 for (index
= 0; index
< num_queues
; ++index
) {
211 queue
= &vif
->queues
[index
];
212 rx_bytes
+= queue
->stats
.rx_bytes
;
213 rx_packets
+= queue
->stats
.rx_packets
;
214 tx_bytes
+= queue
->stats
.tx_bytes
;
215 tx_packets
+= queue
->stats
.tx_packets
;
219 vif
->dev
->stats
.rx_bytes
= rx_bytes
;
220 vif
->dev
->stats
.rx_packets
= rx_packets
;
221 vif
->dev
->stats
.tx_bytes
= tx_bytes
;
222 vif
->dev
->stats
.tx_packets
= tx_packets
;
224 return &vif
->dev
->stats
;
/* xenvif_up - start processing on every queue of the interface.
 * vif: interface whose queues are brought up.
 *
 * For each of the vif->num_queues queues, in this order: enable NAPI,
 * enable the Tx interrupt, enable the Rx interrupt when it is a
 * different irq from the Tx one, then call
 * xenvif_napi_schedule_or_enable_events().
 */
227 static void xenvif_up(struct xenvif
*vif
)
229 struct xenvif_queue
*queue
= NULL
;
230 unsigned int num_queues
= vif
->num_queues
;
231 unsigned int queue_index
;
233 for (queue_index
= 0; queue_index
< num_queues
; ++queue_index
) {
234 queue
= &vif
->queues
[queue_index
];
235 napi_enable(&queue
->napi
);
236 enable_irq(queue
->tx_irq
);
237 if (queue
->tx_irq
!= queue
->rx_irq
)
238 enable_irq(queue
->rx_irq
);
239 xenvif_napi_schedule_or_enable_events(queue
);
/* xenvif_down - stop processing on every queue of the interface.
 * vif: interface whose queues are taken down.
 *
 * For each of the vif->num_queues queues, in this order: disable the Tx
 * interrupt, disable the Rx interrupt when it is a different irq from
 * the Tx one, disable NAPI, then synchronously delete the
 * credit_timeout timer.
 */
243 static void xenvif_down(struct xenvif
*vif
)
245 struct xenvif_queue
*queue
= NULL
;
246 unsigned int num_queues
= vif
->num_queues
;
247 unsigned int queue_index
;
249 for (queue_index
= 0; queue_index
< num_queues
; ++queue_index
) {
250 queue
= &vif
->queues
[queue_index
];
251 disable_irq(queue
->tx_irq
);
252 if (queue
->tx_irq
!= queue
->rx_irq
)
253 disable_irq(queue
->rx_irq
);
254 napi_disable(&queue
->napi
);
255 del_timer_sync(&queue
->credit_timeout
);
/* xenvif_open - ndo_open hook.
 * dev: net device being opened.
 *
 * Tests VIF_STATUS_CONNECTED in vif->status and starts all netdev Tx
 * queues with netif_tx_start_all_queues().
 * NOTE(review): the statement guarded by the status test and the return
 * statement are not visible in this listing.
 */
259 static int xenvif_open(struct net_device
*dev
)
261 struct xenvif
*vif
= netdev_priv(dev
);
262 if (test_bit(VIF_STATUS_CONNECTED
, &vif
->status
))
264 netif_tx_start_all_queues(dev
);
/* xenvif_close - ndo_stop hook.
 * dev: net device being closed.
 *
 * Tests VIF_STATUS_CONNECTED in vif->status and stops all netdev Tx
 * queues with netif_tx_stop_all_queues().
 * NOTE(review): the statement guarded by the status test and the return
 * statement are not visible in this listing.
 */
268 static int xenvif_close(struct net_device
*dev
)
270 struct xenvif
*vif
= netdev_priv(dev
);
271 if (test_bit(VIF_STATUS_CONNECTED
, &vif
->status
))
273 netif_tx_stop_all_queues(dev
);
/* xenvif_change_mtu - ndo_change_mtu hook.
 * dev: net device whose MTU is being changed.
 * mtu: requested MTU.
 *
 * Computes an upper bound: 65535 - VLAN_ETH_HLEN when the vif can do
 * scatter-gather (vif->can_sg), ETH_DATA_LEN otherwise.
 * NOTE(review): the comparison against the bound, the assignment of the
 * new MTU and the return statements are not visible in this listing.
 */
277 static int xenvif_change_mtu(struct net_device
*dev
, int mtu
)
279 struct xenvif
*vif
= netdev_priv(dev
);
280 int max
= vif
->can_sg
? 65535 - VLAN_ETH_HLEN
: ETH_DATA_LEN
;
/* xenvif_fix_features - ndo_fix_features hook: mask unsupported offloads.
 * dev: net device being queried.
 * features: feature set proposed by the core.
 *
 * Clears bits from the proposed feature set:
 *  - NETIF_F_TSO when GSO_BIT(TCPV4) is set in neither vif->gso_mask
 *    nor vif->gso_prefix_mask;
 *  - NETIF_F_TSO6 likewise for GSO_BIT(TCPV6);
 *  - NETIF_F_SG, NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM under conditions
 *    that are not visible in this listing.
 * The return statement is not visible either.
 */
288 static netdev_features_t
xenvif_fix_features(struct net_device
*dev
,
289 netdev_features_t features
)
291 struct xenvif
*vif
= netdev_priv(dev
);
294 features
&= ~NETIF_F_SG
;
295 if (~(vif
->gso_mask
| vif
->gso_prefix_mask
) & GSO_BIT(TCPV4
))
296 features
&= ~NETIF_F_TSO
;
297 if (~(vif
->gso_mask
| vif
->gso_prefix_mask
) & GSO_BIT(TCPV6
))
298 features
&= ~NETIF_F_TSO6
;
300 features
&= ~NETIF_F_IP_CSUM
;
302 features
&= ~NETIF_F_IPV6_CSUM
;
/* Table of the driver-specific ethtool statistics.
 * Each entry pairs a name of at most ETH_GSTRING_LEN bytes with the
 * offsetof() of a counter inside struct xenvif_stats. The table is
 * read by xenvif_get_sset_count(), xenvif_get_ethtool_stats() and
 * xenvif_get_strings().
 * Counters referenced: rx_gso_checksum_fixup, tx_zerocopy_sent,
 * tx_zerocopy_success, tx_zerocopy_fail, tx_frag_overflow.
 * NOTE(review): the offset member, the array name, several name strings
 * and the braces are not visible in this listing.
 */
307 static const struct xenvif_stat
{
308 char name
[ETH_GSTRING_LEN
];
312 "rx_gso_checksum_fixup",
313 offsetof(struct xenvif_stats
, rx_gso_checksum_fixup
)
315 /* If (sent != success + fail), there are probably packets never
320 offsetof(struct xenvif_stats
, tx_zerocopy_sent
),
323 "tx_zerocopy_success",
324 offsetof(struct xenvif_stats
, tx_zerocopy_success
),
328 offsetof(struct xenvif_stats
, tx_zerocopy_fail
)
330 /* Number of packets exceeding MAX_SKB_FRAG slots. You should use
331 * a guest with the same MAX_SKB_FRAG
335 offsetof(struct xenvif_stats
, tx_frag_overflow
)
/* xenvif_get_sset_count - ethtool get_sset_count hook.
 * dev: net device; not referenced by the visible code.
 * string_set: which string set is being queried.
 *
 * Switches on string_set and, for one case, returns the number of
 * entries in the xenvif_stats table.
 * NOTE(review): the case labels and the fall-back return are not
 * visible in this listing.
 */
339 static int xenvif_get_sset_count(struct net_device
*dev
, int string_set
)
341 switch (string_set
) {
343 return ARRAY_SIZE(xenvif_stats
);
/* xenvif_get_ethtool_stats - ethtool get_ethtool_stats hook.
 * dev: net device whose private data is the struct xenvif.
 * stats: ethtool request descriptor; not referenced by the visible code.
 * data: output array of u64 values.
 *
 * For every entry of the xenvif_stats table, walks all
 * vif->num_queues queues and accumulates the unsigned long found at the
 * entry's offset inside each queue's stats structure.
 * NOTE(review): the declaration of i and the store of the accumulated
 * value into data are not visible in this listing.
 */
349 static void xenvif_get_ethtool_stats(struct net_device
*dev
,
350 struct ethtool_stats
*stats
, u64
* data
)
352 struct xenvif
*vif
= netdev_priv(dev
);
353 unsigned int num_queues
= vif
->num_queues
;
355 unsigned int queue_index
;
357 for (i
= 0; i
< ARRAY_SIZE(xenvif_stats
); i
++) {
358 unsigned long accum
= 0;
359 for (queue_index
= 0; queue_index
< num_queues
; ++queue_index
) {
360 void *vif_stats
= &vif
->queues
[queue_index
].stats
;
361 accum
+= *(unsigned long *)(vif_stats
+ xenvif_stats
[i
].offset
);
/* xenvif_get_strings - ethtool get_strings hook.
 * dev: net device; not referenced by the visible code.
 * stringset: which string set is being queried.
 * data: output buffer, written in ETH_GSTRING_LEN-sized slots.
 *
 * Copies the name of every xenvif_stats entry into consecutive
 * ETH_GSTRING_LEN-byte slots of data.
 * NOTE(review): the declaration of i and whatever selects on stringset
 * are not visible in this listing.
 */
367 static void xenvif_get_strings(struct net_device
*dev
, u32 stringset
, u8
* data
)
373 for (i
= 0; i
< ARRAY_SIZE(xenvif_stats
); i
++)
374 memcpy(data
+ i
* ETH_GSTRING_LEN
,
375 xenvif_stats
[i
].name
, ETH_GSTRING_LEN
);
/* ethtool operations for the backend interface.
 * Link state uses the generic ethtool_op_get_link(); the statistics
 * hooks are the three xenvif_get_* functions defined in this file.
 * xenvif_alloc() installs this table as dev->ethtool_ops.
 */
380 static const struct ethtool_ops xenvif_ethtool_ops
= {
381 .get_link
= ethtool_op_get_link
,
383 .get_sset_count
= xenvif_get_sset_count
,
384 .get_ethtool_stats
= xenvif_get_ethtool_stats
,
385 .get_strings
= xenvif_get_strings
,
/* net_device operations for the backend interface.
 * Transmit, stats, open/stop, MTU and feature fix-up are handled by the
 * xenvif_* functions in this file; MAC address handling uses the
 * generic eth_mac_addr() and eth_validate_addr() helpers.
 * xenvif_alloc() installs this table as dev->netdev_ops.
 */
388 static const struct net_device_ops xenvif_netdev_ops
= {
389 .ndo_start_xmit
= xenvif_start_xmit
,
390 .ndo_get_stats
= xenvif_get_stats
,
391 .ndo_open
= xenvif_open
,
392 .ndo_stop
= xenvif_close
,
393 .ndo_change_mtu
= xenvif_change_mtu
,
394 .ndo_fix_features
= xenvif_fix_features
,
395 .ndo_set_mac_address
= eth_mac_addr
,
396 .ndo_validate_addr
= eth_validate_addr
,
/* xenvif_alloc - allocate and register the net device for one vif.
 * parent: device set as the netdev's parent through SET_NETDEV_DEV().
 * domid: domain id, used in the "vif%u.%u" interface name.
 *
 * Visible steps:
 *  - build the interface name from domid and handle;
 *  - allocate a multi-queue netdev sized for xenvif_max_queues, with
 *    struct xenvif as private data;
 *  - fill in the private data: handle, disabled = false, the drain and
 *    stall timeouts (converted from milliseconds), the lock and the
 *    fe_mcast_addr list head;
 *  - install the netdev and ethtool operation tables, advertise SG,
 *    IPv4/IPv6 checksum, TSO and TSO6 as hw_features and add RXCSUM to
 *    the active features, and set tx_queue_len;
 *  - give the device the dummy MAC FE:FF:FF:FF:FF:FF, switch the
 *    carrier off and register the device;
 *  - take a module reference with __module_get().
 *
 * The one visible return is ERR_PTR(-ENOMEM), next to the "Could not
 * allocate netdev" warning.
 * NOTE(review): the remaining parameters (including handle), the
 * declarations of err and vif, the error checks and the other return
 * statements are not visible in this listing.
 */
399 struct xenvif
*xenvif_alloc(struct device
*parent
, domid_t domid
,
403 struct net_device
*dev
;
405 char name
[IFNAMSIZ
] = {};
407 snprintf(name
, IFNAMSIZ
- 1, "vif%u.%u", domid
, handle
);
408 /* Allocate a netdev with the max. supported number of queues.
409 * When the guest selects the desired number, it will be updated
410 * via netif_set_real_num_*_queues().
412 dev
= alloc_netdev_mq(sizeof(struct xenvif
), name
, NET_NAME_UNKNOWN
,
413 ether_setup
, xenvif_max_queues
);
415 pr_warn("Could not allocate netdev for %s\n", name
);
416 return ERR_PTR(-ENOMEM
);
419 SET_NETDEV_DEV(dev
, parent
);
421 vif
= netdev_priv(dev
);
424 vif
->handle
= handle
;
428 vif
->disabled
= false;
429 vif
->drain_timeout
= msecs_to_jiffies(rx_drain_timeout_msecs
);
430 vif
->stall_timeout
= msecs_to_jiffies(rx_stall_timeout_msecs
);
432 /* Start out with no queues. */
436 spin_lock_init(&vif
->lock
);
437 INIT_LIST_HEAD(&vif
->fe_mcast_addr
);
439 dev
->netdev_ops
= &xenvif_netdev_ops
;
440 dev
->hw_features
= NETIF_F_SG
|
441 NETIF_F_IP_CSUM
| NETIF_F_IPV6_CSUM
|
442 NETIF_F_TSO
| NETIF_F_TSO6
;
443 dev
->features
= dev
->hw_features
| NETIF_F_RXCSUM
;
444 dev
->ethtool_ops
= &xenvif_ethtool_ops
;
446 dev
->tx_queue_len
= XENVIF_QUEUE_LENGTH
;
449 * Initialise a dummy MAC address. We choose the numerically
450 * largest non-broadcast address to prevent the address getting
451 * stolen by an Ethernet bridge for STP purposes.
452 * (FE:FF:FF:FF:FF:FF)
454 eth_broadcast_addr(dev
->dev_addr
);
455 dev
->dev_addr
[0] &= ~0x01;
457 netif_carrier_off(dev
);
459 err
= register_netdev(dev
);
461 netdev_warn(dev
, "Could not register device: err=%d\n", err
);
466 netdev_dbg(dev
, "Successfully created xenvif\n");
468 __module_get(THIS_MODULE
);
/* xenvif_init_queue - initialise the per-queue state.
 * queue: queue to initialise.
 *
 * Visible steps:
 *  - credit state: credit_bytes and remaining_credit are set to ~0UL,
 *    credit_usec to 0, the credit_timeout timer is initialised with
 *    xenvif_tx_credit_callback, and credit_window_start is set to the
 *    current 64-bit jiffies value;
 *  - rx_queue_max is set to XENVIF_RX_QUEUE_BYTES and both skb queues
 *    (rx_queue, tx_queue) are initialised;
 *  - the pending ring is filled with the identity mapping 0 ..
 *    MAX_PENDING_REQS - 1, with pending_cons = 0 and
 *    pending_prod = MAX_PENDING_REQS;
 *  - callback_lock and response_lock are initialised;
 *  - MAX_PENDING_REQS pages are requested from gnttab_alloc_pages();
 *  - every pending_tx_info slot gets a ubuf_info whose callback is
 *    xenvif_zerocopy_callback, and every grant_tx_handle slot is set to
 *    NETBACK_INVALID_HANDLE.
 *
 * NOTE(review): the declarations of i and err, the second argument of
 * gnttab_alloc_pages(), the remaining ubuf_info fields, the error check
 * and the return statements are not visible in this listing.
 */
473 int xenvif_init_queue(struct xenvif_queue
*queue
)
477 queue
->credit_bytes
= queue
->remaining_credit
= ~0UL;
478 queue
->credit_usec
= 0UL;
479 init_timer(&queue
->credit_timeout
);
480 queue
->credit_timeout
.function
= xenvif_tx_credit_callback
;
481 queue
->credit_window_start
= get_jiffies_64();
483 queue
->rx_queue_max
= XENVIF_RX_QUEUE_BYTES
;
485 skb_queue_head_init(&queue
->rx_queue
);
486 skb_queue_head_init(&queue
->tx_queue
);
488 queue
->pending_cons
= 0;
489 queue
->pending_prod
= MAX_PENDING_REQS
;
490 for (i
= 0; i
< MAX_PENDING_REQS
; ++i
)
491 queue
->pending_ring
[i
] = i
;
493 spin_lock_init(&queue
->callback_lock
);
494 spin_lock_init(&queue
->response_lock
);
496 /* If ballooning is disabled, this will consume real memory, so you
497 * better enable it. The long term solution would be to use just a
498 * bunch of valid page descriptors, without dependency on ballooning
500 err
= gnttab_alloc_pages(MAX_PENDING_REQS
,
503 netdev_err(queue
->vif
->dev
, "Could not reserve mmap_pages\n");
507 for (i
= 0; i
< MAX_PENDING_REQS
; i
++) {
508 queue
->pending_tx_info
[i
].callback_struct
= (struct ubuf_info
)
509 { .callback
= xenvif_zerocopy_callback
,
512 queue
->grant_tx_handle
[i
] = NETBACK_INVALID_HANDLE
;
/* xenvif_carrier_on - mark the vif connected.
 * vif: interface that has just been connected.
 *
 * When the vif cannot do scatter-gather and the current MTU exceeds
 * ETH_DATA_LEN, the MTU is lowered to ETH_DATA_LEN. The feature set is
 * then re-evaluated with netdev_update_features() and
 * VIF_STATUS_CONNECTED is set in vif->status.
 * NOTE(review): the statement executed when the device is running, and
 * any locking around this sequence, are not visible in this listing.
 */
518 void xenvif_carrier_on(struct xenvif
*vif
)
521 if (!vif
->can_sg
&& vif
->dev
->mtu
> ETH_DATA_LEN
)
522 dev_set_mtu(vif
->dev
, ETH_DATA_LEN
);
523 netdev_update_features(vif
->dev
);
524 set_bit(VIF_STATUS_CONNECTED
, &vif
->status
);
525 if (netif_running(vif
->dev
))
/* xenvif_connect - connect one queue to the frontend.
 * queue: queue to connect; must not yet have a Tx irq or dealloc task
 *        (both are BUG_ON conditions).
 * tx_ring_ref, rx_ring_ref: ring references passed to
 *        xenvif_map_frontend_rings().
 * tx_evtchn, rx_evtchn: event channels to bind; equal values select the
 *        single shared interrupt.
 *
 * Visible steps:
 *  - map the frontend rings;
 *  - initialise the wq and dealloc_wq wait queues and reset
 *    inflight_packets to 0;
 *  - register the NAPI instance with xenvif_poll;
 *  - bind the event channel(s): one irq handled by xenvif_interrupt
 *    when both channels are equal, otherwise separate "<name>-tx" and
 *    "<name>-rx" irqs handled by xenvif_tx_interrupt and
 *    xenvif_rx_interrupt; each bound irq is disabled straight away;
 *  - set queue->stalled to true;
 *  - create the "<name>-guest-rx" and "<name>-dealloc" kernel threads,
 *    taking a task reference with get_task_struct() after the first,
 *    then wake both.
 * The tail of the function unbinds the Rx irq, unbinds the Tx irq,
 * unmaps the rings and drops a module reference; this appears to be the
 * error unwind.
 *
 * NOTE(review): the declaration of err, the error checks, the goto
 * labels, the assignment of queue->task and the return statements are
 * not visible in this listing.
 */
530 int xenvif_connect(struct xenvif_queue
*queue
, unsigned long tx_ring_ref
,
531 unsigned long rx_ring_ref
, unsigned int tx_evtchn
,
532 unsigned int rx_evtchn
)
534 struct task_struct
*task
;
537 BUG_ON(queue
->tx_irq
);
539 BUG_ON(queue
->dealloc_task
);
541 err
= xenvif_map_frontend_rings(queue
, tx_ring_ref
, rx_ring_ref
);
545 init_waitqueue_head(&queue
->wq
);
546 init_waitqueue_head(&queue
->dealloc_wq
);
547 atomic_set(&queue
->inflight_packets
, 0);
549 netif_napi_add(queue
->vif
->dev
, &queue
->napi
, xenvif_poll
,
552 if (tx_evtchn
== rx_evtchn
) {
553 /* feature-split-event-channels == 0 */
554 err
= bind_interdomain_evtchn_to_irqhandler(
555 queue
->vif
->domid
, tx_evtchn
, xenvif_interrupt
, 0,
559 queue
->tx_irq
= queue
->rx_irq
= err
;
560 disable_irq(queue
->tx_irq
);
562 /* feature-split-event-channels == 1 */
563 snprintf(queue
->tx_irq_name
, sizeof(queue
->tx_irq_name
),
564 "%s-tx", queue
->name
);
565 err
= bind_interdomain_evtchn_to_irqhandler(
566 queue
->vif
->domid
, tx_evtchn
, xenvif_tx_interrupt
, 0,
567 queue
->tx_irq_name
, queue
);
571 disable_irq(queue
->tx_irq
);
573 snprintf(queue
->rx_irq_name
, sizeof(queue
->rx_irq_name
),
574 "%s-rx", queue
->name
);
575 err
= bind_interdomain_evtchn_to_irqhandler(
576 queue
->vif
->domid
, rx_evtchn
, xenvif_rx_interrupt
, 0,
577 queue
->rx_irq_name
, queue
);
581 disable_irq(queue
->rx_irq
);
584 queue
->stalled
= true;
586 task
= kthread_create(xenvif_kthread_guest_rx
,
587 (void *)queue
, "%s-guest-rx", queue
->name
);
589 pr_warn("Could not allocate kthread for %s\n", queue
->name
);
594 get_task_struct(task
);
596 task
= kthread_create(xenvif_dealloc_kthread
,
597 (void *)queue
, "%s-dealloc", queue
->name
);
599 pr_warn("Could not allocate kthread for %s\n", queue
->name
);
603 queue
->dealloc_task
= task
;
605 wake_up_process(queue
->task
);
606 wake_up_process(queue
->dealloc_task
);
611 unbind_from_irqhandler(queue
->rx_irq
, queue
);
614 unbind_from_irqhandler(queue
->tx_irq
, queue
);
617 xenvif_unmap_frontend_rings(queue
);
619 module_put(THIS_MODULE
);
/* xenvif_carrier_off - mark the vif disconnected.
 * vif: interface being disconnected.
 *
 * Atomically clears VIF_STATUS_CONNECTED; only when the bit was set
 * does it switch the carrier off and then test whether the device is
 * running.
 * NOTE(review): the statement executed for a running device, and any
 * locking around this sequence, are not visible in this listing.
 */
623 void xenvif_carrier_off(struct xenvif
*vif
)
625 struct net_device
*dev
= vif
->dev
;
628 if (test_and_clear_bit(VIF_STATUS_CONNECTED
, &vif
->status
)) {
629 netif_carrier_off(dev
); /* discard queued packets */
630 if (netif_running(dev
))
/* xenvif_disconnect - tear down the frontend connection of every queue.
 * vif: interface to disconnect.
 *
 * Calls xenvif_carrier_off() first. Then, for each of the
 * vif->num_queues queues: removes the NAPI instance, stops the Rx
 * thread and drops its task reference, stops the dealloc thread (when
 * present) and clears the pointer, unbinds the interrupt(s) - a single
 * unbind when Tx and Rx share one irq, otherwise one per direction -
 * and unmaps the frontend rings. Finally the multicast address list is
 * freed with xenvif_mcast_addr_list_free().
 * NOTE(review): the guards around the Rx-thread and irq teardown are
 * only partly visible in this listing.
 */
636 void xenvif_disconnect(struct xenvif
*vif
)
638 struct xenvif_queue
*queue
= NULL
;
639 unsigned int num_queues
= vif
->num_queues
;
640 unsigned int queue_index
;
642 xenvif_carrier_off(vif
);
644 for (queue_index
= 0; queue_index
< num_queues
; ++queue_index
) {
645 queue
= &vif
->queues
[queue_index
];
647 netif_napi_del(&queue
->napi
);
650 kthread_stop(queue
->task
);
651 put_task_struct(queue
->task
);
655 if (queue
->dealloc_task
) {
656 kthread_stop(queue
->dealloc_task
);
657 queue
->dealloc_task
= NULL
;
661 if (queue
->tx_irq
== queue
->rx_irq
)
662 unbind_from_irqhandler(queue
->tx_irq
, queue
);
664 unbind_from_irqhandler(queue
->tx_irq
, queue
);
665 unbind_from_irqhandler(queue
->rx_irq
, queue
);
670 xenvif_unmap_frontend_rings(queue
);
673 xenvif_mcast_addr_list_free(vif
);
/* xenvif_deinit_queue - release the pages held by one queue.
 * queue: queue whose mmap_pages array is released.
 *
 * Hands MAX_PENDING_REQS pages from queue->mmap_pages back through
 * gnttab_free_pages(); this is the only teardown step visible here.
 */
676 /* Reverse the relevant parts of xenvif_init_queue().
677 * Used for queue teardown from xenvif_free(), and on the
678 * error handling paths in xenbus.c:connect().
680 void xenvif_deinit_queue(struct xenvif_queue
*queue
)
682 gnttab_free_pages(MAX_PENDING_REQS
, queue
->mmap_pages
);
/* xenvif_free - unregister and free a vif.
 * vif: interface to destroy; its net device is freed here, so the
 *      pointer must not be used afterwards.
 *
 * Unregisters the net device, calls xenvif_deinit_queue() on each of
 * the vif->num_queues queues, frees the net device and drops a module
 * reference with module_put().
 * NOTE(review): upstream lines 696-701, between the loop and
 * free_netdev(), are missing from this listing; whatever releases the
 * queue array is presumably there - confirm.
 */
685 void xenvif_free(struct xenvif
*vif
)
687 struct xenvif_queue
*queue
= NULL
;
688 unsigned int num_queues
= vif
->num_queues
;
689 unsigned int queue_index
;
691 unregister_netdev(vif
->dev
);
693 for (queue_index
= 0; queue_index
< num_queues
; ++queue_index
) {
694 queue
= &vif
->queues
[queue_index
];
695 xenvif_deinit_queue(queue
);
702 free_netdev(vif
->dev
);
704 module_put(THIS_MODULE
);