// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 */
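
/* veth devices are created in pairs: frames transmitted on one device show up
 * as received frames on its peer. Pairs are typically created from user space
 * with iproute2, e.g. "ip link add veth0 type veth peer name veth1" (the
 * device names here are only an example).
 */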
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <linux/skbuff_ref.h>
#include <net/page_pool/helpers.h>
#define DRV_NAME "veth"
#define DRV_VERSION "1.0"

#define VETH_XDP_FLAG BIT(0)
#define VETH_RING_SIZE 256
#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)

#define VETH_XDP_TX_BULK_SIZE 16
#define VETH_XDP_BATCH 16
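
/* VETH_XDP_HEADROOM reserves space in front of each frame so an attached XDP
 * program can grow headers (e.g. via bpf_xdp_adjust_head()); the two batch
 * constants bound how many frames are queued before an XDP_TX flush
 * (VETH_XDP_TX_BULK_SIZE) and how many frames are converted to skbs per round
 * (VETH_XDP_BATCH).
 */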
struct veth_stats {
    u64 rx_drops;
    /* xdp */
    u64 xdp_packets;
    u64 xdp_bytes;
    u64 xdp_redirect;
    u64 xdp_drops;
    u64 xdp_tx;
    u64 xdp_tx_err;
    u64 peer_tq_xdp_xmit;
    u64 peer_tq_xdp_xmit_err;
};
struct veth_rq_stats {
    struct veth_stats vs;
    struct u64_stats_sync syncp;
};
struct veth_rq {
    struct napi_struct xdp_napi;
    struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
    struct net_device *dev;
    struct bpf_prog __rcu *xdp_prog;
    struct xdp_mem_info xdp_mem;
    struct veth_rq_stats stats;
    bool rx_notify_masked;
    struct ptr_ring xdp_ring;
    struct xdp_rxq_info xdp_rxq;
    struct page_pool *page_pool;
};
struct veth_priv {
    struct net_device __rcu *peer;
    atomic64_t dropped;
    struct bpf_prog *_xdp_prog;
    struct veth_rq *rq;
    unsigned int requested_headroom;
};
struct veth_xdp_tx_bq {
    struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
    unsigned int count;
};
struct veth_q_stat_desc {
    char desc[ETH_GSTRING_LEN];
    size_t offset;
};
#define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
    { "xdp_packets",    VETH_RQ_STAT(xdp_packets) },
    { "xdp_bytes",      VETH_RQ_STAT(xdp_bytes) },
    { "drops",          VETH_RQ_STAT(rx_drops) },
    { "xdp_redirect",   VETH_RQ_STAT(xdp_redirect) },
    { "xdp_drops",      VETH_RQ_STAT(xdp_drops) },
    { "xdp_tx",         VETH_RQ_STAT(xdp_tx) },
    { "xdp_tx_errors",  VETH_RQ_STAT(xdp_tx_err) },
};

#define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc)
static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
    { "xdp_xmit",        VETH_RQ_STAT(peer_tq_xdp_xmit) },
    { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
};

#define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc)
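
/* veth has no real transmit ring: the "tx" queue statistics above are gathered
 * from the peer device, where xdp_xmit work done on behalf of this device is
 * accounted in per-rx-queue counters (hence the peer_tq_ prefix).
 */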
static struct {
    const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
    { "peer_ifindex" },
};
struct veth_xdp_buff {
    struct xdp_buff xdp;
    struct sk_buff *skb;
};
static int veth_get_link_ksettings(struct net_device *dev,
                                   struct ethtool_link_ksettings *cmd)
{
    cmd->base.speed   = SPEED_10000;
    cmd->base.duplex  = DUPLEX_FULL;
    cmd->base.port    = PORT_TP;
    cmd->base.autoneg = AUTONEG_DISABLE;
    return 0;
}
static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
    strscpy(info->driver, DRV_NAME, sizeof(info->driver));
    strscpy(info->version, DRV_VERSION, sizeof(info->version));
}
static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
        memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
        p += sizeof(ethtool_stats_keys);
        for (i = 0; i < dev->real_num_rx_queues; i++)
            for (j = 0; j < VETH_RQ_STATS_LEN; j++)
                ethtool_sprintf(&p, "rx_queue_%u_%.18s",
                                i, veth_rq_stats_desc[j].desc);
        for (i = 0; i < dev->real_num_tx_queues; i++)
            for (j = 0; j < VETH_TQ_STATS_LEN; j++)
                ethtool_sprintf(&p, "tx_queue_%u_%.18s",
                                i, veth_tq_stats_desc[j].desc);
        page_pool_ethtool_stats_get_strings(p);
static int veth_get_sset_count(struct net_device *dev, int sset)
        return ARRAY_SIZE(ethtool_stats_keys) +
               VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
               VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
               page_pool_ethtool_stats_get_count();
static void veth_get_page_pool_stats(struct net_device *dev, u64 *data)
{
#ifdef CONFIG_PAGE_POOL_STATS
    struct veth_priv *priv = netdev_priv(dev);
    struct page_pool_stats pp_stats = {};
    int i;

    for (i = 0; i < dev->real_num_rx_queues; i++) {
        if (!priv->rq[i].page_pool)
            continue;
        page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
    }
    page_pool_ethtool_stats_get(data, &pp_stats);
#endif /* CONFIG_PAGE_POOL_STATS */
}
static void veth_get_ethtool_stats(struct net_device *dev,
                                   struct ethtool_stats *stats, u64 *data)
    struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    struct net_device *peer = rtnl_dereference(priv->peer);
    int i, j, idx, pp_idx;

    data[0] = peer ? peer->ifindex : 0;

    for (i = 0; i < dev->real_num_rx_queues; i++) {
        const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
        const void *stats_base = (void *)&rq_stats->vs;

            start = u64_stats_fetch_begin(&rq_stats->syncp);
            for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
                offset = veth_rq_stats_desc[j].offset;
                data[idx + j] = *(u64 *)(stats_base + offset);
        } while (u64_stats_fetch_retry(&rq_stats->syncp, start));
        idx += VETH_RQ_STATS_LEN;

        goto page_pool_stats;

    rcv_priv = netdev_priv(peer);
    for (i = 0; i < peer->real_num_rx_queues; i++) {
        const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
        const void *base = (void *)&rq_stats->vs;
        unsigned int start, tx_idx = idx;

        tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
            start = u64_stats_fetch_begin(&rq_stats->syncp);
            for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
                offset = veth_tq_stats_desc[j].offset;
                data[tx_idx + j] += *(u64 *)(base + offset);
        } while (u64_stats_fetch_retry(&rq_stats->syncp, start));

    pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;

page_pool_stats:
    veth_get_page_pool_stats(dev, &data[pp_idx]);
static void veth_get_channels(struct net_device *dev,
                              struct ethtool_channels *channels)
{
    channels->tx_count = dev->real_num_tx_queues;
    channels->rx_count = dev->real_num_rx_queues;
    channels->max_tx = dev->num_tx_queues;
    channels->max_rx = dev->num_rx_queues;
}
static int veth_set_channels(struct net_device *dev,
                             struct ethtool_channels *ch);

static const struct ethtool_ops veth_ethtool_ops = {
    .get_drvinfo        = veth_get_drvinfo,
    .get_link           = ethtool_op_get_link,
    .get_strings        = veth_get_strings,
    .get_sset_count     = veth_get_sset_count,
    .get_ethtool_stats  = veth_get_ethtool_stats,
    .get_link_ksettings = veth_get_link_ksettings,
    .get_ts_info        = ethtool_op_get_ts_info,
    .get_channels       = veth_get_channels,
    .set_channels       = veth_set_channels,
};
/* general routines */
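
/* Entries on the per-queue xdp_ring are either sk_buff pointers or xdp_frame
 * pointers. xdp_frame pointers carry VETH_XDP_FLAG in their (otherwise unused)
 * low bit, so the consumer can tell the two apart with veth_is_xdp_frame() and
 * strip the tag again with veth_ptr_to_xdp().
 */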
static bool veth_is_xdp_frame(void *ptr)
{
    return (unsigned long)ptr & VETH_XDP_FLAG;
}

static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
{
    return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
{
    return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
    if (veth_is_xdp_frame(ptr))
        xdp_return_frame(veth_ptr_to_xdp(ptr));
    else
        kfree_skb(ptr);
}
static void __veth_xdp_flush(struct veth_rq *rq)
{
    /* Write ptr_ring before reading rx_notify_masked */
    smp_mb();
    if (!READ_ONCE(rq->rx_notify_masked) &&
        napi_schedule_prep(&rq->xdp_napi)) {
        WRITE_ONCE(rq->rx_notify_masked, true);
        __napi_schedule(&rq->xdp_napi);
    }
}
static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
    if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
        dev_kfree_skb_any(skb);
        return NET_RX_DROP;
    }

    return NET_RX_SUCCESS;
}
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
                            struct veth_rq *rq, bool xdp)
{
    return __dev_forward_skb(dev, skb) ?: xdp ?
        veth_xdp_rx(rq, skb) :
        __netif_rx(skb);
}
/* Return true if the specified skb has chances of GRO aggregation.
 * Don't strive for accuracy, but try to avoid GRO overhead in the most
 * common scenarios.
 * When XDP is enabled, all traffic is considered eligible, as the xmit
 * device has TSO off.
 * When TSO is enabled on the xmit device, we are likely interested only
 * in UDP aggregation, so explicitly check for that if the skb is suspected
 * to belong to locally generated UDP traffic (the sock_wfree destructor is
 * used by UDP, ICMP and XDP sockets).
 */
static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
                                         const struct net_device *rcv,
                                         const struct sk_buff *skb)
{
    return !(dev->features & NETIF_F_ALL_TSO) ||
        (skb->destructor == sock_wfree &&
         rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
    struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    struct veth_rq *rq = NULL;
    int ret = NETDEV_TX_OK;
    struct net_device *rcv;
    int length = skb->len;
    bool use_napi = false;

    rcv = rcu_dereference(priv->peer);
    if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {

    rcv_priv = netdev_priv(rcv);
    rxq = skb_get_queue_mapping(skb);
    if (rxq < rcv->real_num_rx_queues) {
        rq = &rcv_priv->rq[rxq];

        /* The napi pointer is available when an XDP program is
         * attached or when GRO is enabled.
         * Don't bother with napi/GRO if the skb can't be aggregated.
         */
        use_napi = rcu_access_pointer(rq->napi) &&
                   veth_skb_is_eligible_for_gro(dev, rcv, skb);

    skb_tx_timestamp(skb);
    if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
            dev_sw_netstats_tx_add(dev, 1, length);
            __veth_xdp_flush(rq);

        atomic64_inc(&priv->dropped);
static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);

    result->peer_tq_xdp_xmit_err = 0;
    result->xdp_packets = 0;
    result->xdp_tx_err = 0;
    result->xdp_bytes = 0;
    result->rx_drops = 0;
    for (i = 0; i < dev->num_rx_queues; i++) {
        u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
        struct veth_rq_stats *stats = &priv->rq[i].stats;

            start = u64_stats_fetch_begin(&stats->syncp);
            peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
            xdp_tx_err = stats->vs.xdp_tx_err;
            packets = stats->vs.xdp_packets;
            bytes = stats->vs.xdp_bytes;
            drops = stats->vs.rx_drops;
        } while (u64_stats_fetch_retry(&stats->syncp, start));
        result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
        result->xdp_tx_err += xdp_tx_err;
        result->xdp_packets += packets;
        result->xdp_bytes += bytes;
        result->rx_drops += drops;
static void veth_get_stats64(struct net_device *dev,
                             struct rtnl_link_stats64 *tot)
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer;
    struct veth_stats rx;

    tot->tx_dropped = atomic64_read(&priv->dropped);
    dev_fetch_sw_netstats(tot, dev->tstats);

    veth_stats_rx(&rx, dev);
    tot->tx_dropped += rx.xdp_tx_err;
    tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
    tot->rx_bytes += rx.xdp_bytes;
    tot->rx_packets += rx.xdp_packets;

    peer = rcu_dereference(priv->peer);
        struct rtnl_link_stats64 tot_peer = {};

        dev_fetch_sw_netstats(&tot_peer, peer->tstats);
        tot->rx_bytes += tot_peer.tx_bytes;
        tot->rx_packets += tot_peer.tx_packets;

        veth_stats_rx(&rx, peer);
        tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
        tot->rx_dropped += rx.xdp_tx_err;
        tot->tx_bytes += rx.xdp_bytes;
        tot->tx_packets += rx.xdp_packets;
/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}
static int veth_select_rxq(struct net_device *dev)
{
    return smp_processor_id() % dev->real_num_rx_queues;
}
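
/* The modulo in veth_select_rxq() above spreads XDP transmissions coming from
 * different CPUs across the peer's rx queues; real_num_rx_queues is never zero
 * for a registered device, so the division is safe.
 */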
static struct net_device *veth_peer_dev(struct net_device *dev)
{
    struct veth_priv *priv = netdev_priv(dev);

    /* Callers must be under RCU read side. */
    return rcu_dereference(priv->peer);
}
static int veth_xdp_xmit(struct net_device *dev, int n,
                         struct xdp_frame **frames,
                         u32 flags, bool ndo_xmit)
    struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
    int i, ret = -ENXIO, nxmit = 0;
    struct net_device *rcv;
    unsigned int max_len;

    if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))

    rcv = rcu_dereference(priv->peer);

    rcv_priv = netdev_priv(rcv);
    rq = &rcv_priv->rq[veth_select_rxq(rcv)];
    /* The napi pointer is set if NAPI is enabled, which ensures that
     * xdp_ring is initialized on receive side and the peer device is up.
     */
    if (!rcu_access_pointer(rq->napi))

    max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

    spin_lock(&rq->xdp_ring.producer_lock);
    for (i = 0; i < n; i++) {
        struct xdp_frame *frame = frames[i];
        void *ptr = veth_xdp_to_ptr(frame);

        if (unlikely(xdp_get_frame_len(frame) > max_len ||
                     __ptr_ring_produce(&rq->xdp_ring, ptr)))
    spin_unlock(&rq->xdp_ring.producer_lock);

    if (flags & XDP_XMIT_FLUSH)
        __veth_xdp_flush(rq);

        u64_stats_update_begin(&rq->stats.syncp);
        rq->stats.vs.peer_tq_xdp_xmit += nxmit;
        rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
        u64_stats_update_end(&rq->stats.syncp);
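        /* nxmit above counts the frames actually queued on the peer's ring;
         * the remainder n - nxmit is accounted as peer_tq_xdp_xmit_err, which
         * feeds the per-queue "xdp_xmit_errors" ethtool counter.
         */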
static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
                             struct xdp_frame **frames, u32 flags)
    err = veth_xdp_xmit(dev, n, frames, flags, true);
        struct veth_priv *priv = netdev_priv(dev);

        atomic64_add(n, &priv->dropped);
static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
    int sent, i, err = 0, drops;

    sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);

    for (i = sent; unlikely(i < bq->count); i++)
        xdp_return_frame(bq->q[i]);

    drops = bq->count - sent;
    trace_xdp_bulk_tx(rq->dev, sent, drops, err);

    u64_stats_update_begin(&rq->stats.syncp);
    rq->stats.vs.xdp_tx += sent;
    rq->stats.vs.xdp_tx_err += drops;
    u64_stats_update_end(&rq->stats.syncp);
static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
    struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
    struct net_device *rcv;
    struct veth_rq *rcv_rq;

    veth_xdp_flush_bq(rq, bq);
    rcv = rcu_dereference(priv->peer);

    rcv_priv = netdev_priv(rcv);
    rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
    /* xdp_ring is initialized on receive side? */
    if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))

    __veth_xdp_flush(rcv_rq);
static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
                       struct veth_xdp_tx_bq *bq)
    struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);

    if (unlikely(!frame))

    if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
        veth_xdp_flush_bq(rq, bq);

    bq->q[bq->count++] = frame;
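    /* Frames produced via XDP_TX are only queued in the per-NAPI bulk queue
     * here; they reach the peer either when the queue fills up (flush above)
     * or when veth_poll() calls veth_xdp_flush() at the end of the poll cycle.
     */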
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
                                          struct xdp_frame *frame,
                                          struct veth_xdp_tx_bq *bq,
                                          struct veth_stats *stats)
    struct xdp_frame orig_frame;
    struct bpf_prog *xdp_prog;

    xdp_prog = rcu_dereference(rq->xdp_prog);
    if (likely(xdp_prog)) {
        struct veth_xdp_buff vxbuf;
        struct xdp_buff *xdp = &vxbuf.xdp;

        xdp_convert_frame_to_buff(frame, xdp);
        xdp->rxq = &rq->xdp_rxq;

        act = bpf_prog_run_xdp(xdp_prog, xdp);

            if (xdp_update_frame_from_buff(xdp, frame))
            xdp->rxq->mem = frame->mem;
            if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
                trace_xdp_exception(rq->dev, xdp_prog, act);
            xdp->rxq->mem = frame->mem;
            if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
            stats->xdp_redirect++;
            bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
            trace_xdp_exception(rq->dev, xdp_prog, act);

    xdp_return_frame(frame);
/* frames array contains VETH_XDP_BATCH at most */
static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
                                  int n_xdpf, struct veth_xdp_tx_bq *bq,
                                  struct veth_stats *stats)
    void *skbs[VETH_XDP_BATCH];

    if (xdp_alloc_skb_bulk(skbs, n_xdpf,
                           GFP_ATOMIC | __GFP_ZERO) < 0) {
        for (i = 0; i < n_xdpf; i++)
            xdp_return_frame(frames[i]);
        stats->rx_drops += n_xdpf;

    for (i = 0; i < n_xdpf; i++) {
        struct sk_buff *skb = skbs[i];

        skb = __xdp_build_skb_from_frame(frames[i], skb,
            xdp_return_frame(frames[i]);
        napi_gro_receive(&rq->xdp_napi, skb);
static void veth_xdp_get(struct xdp_buff *xdp)
{
    struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
    int i;

    get_page(virt_to_page(xdp->data));
    if (likely(!xdp_buff_has_frags(xdp)))
        return;

    for (i = 0; i < sinfo->nr_frags; i++)
        __skb_frag_ref(&sinfo->frags[i]);
}
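
/* veth_xdp_get() above takes an extra reference on the head page and on every
 * fragment, so the data referenced by the xdp_buff can outlive the skb it was
 * built from.
 */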
static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
                                        struct xdp_buff *xdp,
                                        struct sk_buff **pskb)
    struct sk_buff *skb = *pskb;

    if (skb_shared(skb) || skb_head_is_locked(skb) ||
        skb_shinfo(skb)->nr_frags ||
        skb_headroom(skb) < XDP_PACKET_HEADROOM) {
        if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))

    /* SKB "head" area always has tailroom for skb_shared_info */
    frame_sz = skb_end_pointer(skb) - skb->head;
    frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
    xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
    xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
                     skb_headlen(skb), true);

    if (skb_is_nonlinear(skb)) {
        skb_shinfo(skb)->xdp_frags_size = skb->data_len;
        xdp_buff_set_frags_flag(xdp);
    } else {
        xdp_buff_clear_frags_flag(xdp);
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
                                        struct sk_buff *skb,
                                        struct veth_xdp_tx_bq *bq,
                                        struct veth_stats *stats)
    void *orig_data, *orig_data_end;
    struct bpf_prog *xdp_prog;
    struct veth_xdp_buff vxbuf;
    struct xdp_buff *xdp = &vxbuf.xdp;

    skb_prepare_for_gro(skb);

    xdp_prog = rcu_dereference(rq->xdp_prog);
    if (unlikely(!xdp_prog)) {

    __skb_push(skb, skb->data - skb_mac_header(skb));
    if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))

    orig_data = xdp->data;
    orig_data_end = xdp->data_end;

    act = bpf_prog_run_xdp(xdp_prog, xdp);

        xdp->rxq->mem = rq->xdp_mem;
        if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
            trace_xdp_exception(rq->dev, xdp_prog, act);
        xdp->rxq->mem = rq->xdp_mem;
        if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
        stats->xdp_redirect++;
        bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
        trace_xdp_exception(rq->dev, xdp_prog, act);

    /* check if bpf_xdp_adjust_head was used */
    off = orig_data - xdp->data;
    if (off > 0)
        __skb_push(skb, off);
    else if (off < 0)
        __skb_pull(skb, -off);

    skb_reset_mac_header(skb);

    /* check if bpf_xdp_adjust_tail was used */
    off = xdp->data_end - orig_data_end;
    if (off != 0)
        __skb_put(skb, off); /* positive on grow, negative on shrink */

    /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
     * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
     */
    if (xdp_buff_has_frags(xdp))
        skb->data_len = skb_shinfo(skb)->xdp_frags_size;

    skb->protocol = eth_type_trans(skb, rq->dev);

    metalen = xdp->data - xdp->data_meta;
    if (metalen)
        skb_metadata_set(skb, metalen);

    xdp_return_buff(xdp);
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
                        struct veth_xdp_tx_bq *bq,
                        struct veth_stats *stats)
    int i, done = 0, n_xdpf = 0;
    void *xdpf[VETH_XDP_BATCH];

    for (i = 0; i < budget; i++) {
        void *ptr = __ptr_ring_consume(&rq->xdp_ring);

        if (veth_is_xdp_frame(ptr)) {
            struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

            stats->xdp_bytes += xdp_get_frame_len(frame);
            frame = veth_xdp_rcv_one(rq, frame, bq, stats);
                xdpf[n_xdpf++] = frame;
                if (n_xdpf == VETH_XDP_BATCH) {
                    veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
            struct sk_buff *skb = ptr;

            stats->xdp_bytes += skb->len;
            skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
                if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
                    netif_receive_skb(skb);
                    napi_gro_receive(&rq->xdp_napi, skb);

        veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);

    u64_stats_update_begin(&rq->stats.syncp);
    rq->stats.vs.xdp_redirect += stats->xdp_redirect;
    rq->stats.vs.xdp_bytes += stats->xdp_bytes;
    rq->stats.vs.xdp_drops += stats->xdp_drops;
    rq->stats.vs.rx_drops += stats->rx_drops;
    rq->stats.vs.xdp_packets += done;
    u64_stats_update_end(&rq->stats.syncp);
static int veth_poll(struct napi_struct *napi, int budget)
{
    struct veth_rq *rq =
        container_of(napi, struct veth_rq, xdp_napi);
    struct veth_stats stats = {};
    struct veth_xdp_tx_bq bq;
    int done;

    bq.count = 0;

    xdp_set_return_frame_no_direct();
    done = veth_xdp_rcv(rq, budget, &bq, &stats);

    if (stats.xdp_redirect > 0)
        xdp_do_flush();

    if (done < budget && napi_complete_done(napi, done)) {
        /* Write rx_notify_masked before reading ptr_ring */
        smp_store_mb(rq->rx_notify_masked, false);
        if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
            if (napi_schedule_prep(&rq->xdp_napi)) {
                WRITE_ONCE(rq->rx_notify_masked, true);
                __napi_schedule(&rq->xdp_napi);
            }
        }
    }

    if (stats.xdp_tx > 0)
        veth_xdp_flush(rq, &bq);
    xdp_clear_return_frame_no_direct();

    return done;
}
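
/* In veth_poll() above, a producer may have queued new entries between
 * veth_xdp_rcv() draining the ring and rx_notify_masked being cleared; the
 * re-check of the ring plus the conditional reschedule closes that window.
 */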
static int veth_create_page_pool(struct veth_rq *rq)
{
    struct page_pool_params pp_params = {
        .pool_size = VETH_RING_SIZE,
        .dev = &rq->dev->dev,
    };

    rq->page_pool = page_pool_create(&pp_params);
    if (IS_ERR(rq->page_pool)) {
        int err = PTR_ERR(rq->page_pool);

        rq->page_pool = NULL;
        return err;
    }

    return 0;
}
static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = start; i < end; i++) {
        err = veth_create_page_pool(&priv->rq[i]);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        napi_enable(&rq->xdp_napi);
        rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);

    for (i--; i >= start; i--)
        ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

    for (i--; i >= start; i--) {
        page_pool_destroy(priv->rq[i].page_pool);
        priv->rq[i].page_pool = NULL;
static int __veth_napi_enable(struct net_device *dev)
{
    return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}
static void veth_napi_del_range(struct net_device *dev, int start, int end)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        rcu_assign_pointer(priv->rq[i].napi, NULL);
        napi_disable(&rq->xdp_napi);
        __netif_napi_del(&rq->xdp_napi);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        rq->rx_notify_masked = false;
        ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);

    for (i = start; i < end; i++) {
        page_pool_destroy(priv->rq[i].page_pool);
        priv->rq[i].page_pool = NULL;
static void veth_napi_del(struct net_device *dev)
{
    veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
}
static bool veth_gro_requested(const struct net_device *dev)
{
    return !!(dev->wanted_features & NETIF_F_GRO);
}
static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
                                 bool napi_already_on)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        if (!napi_already_on)
            netif_napi_add(dev, &rq->xdp_napi, veth_poll);
        err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);

        err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
                                         MEM_TYPE_PAGE_SHARED,

        /* Save original mem info as it can be overwritten */
        rq->xdp_mem = rq->xdp_rxq.mem;

    xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
    for (i--; i >= start; i--) {
        struct veth_rq *rq = &priv->rq[i];

        xdp_rxq_info_unreg(&rq->xdp_rxq);
        if (!napi_already_on)
            netif_napi_del(&rq->xdp_napi);
static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
                                   bool delete_napi)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        rq->xdp_rxq.mem = rq->xdp_mem;
        xdp_rxq_info_unreg(&rq->xdp_rxq);

        if (delete_napi)
            netif_napi_del(&rq->xdp_napi);
static int veth_enable_xdp(struct net_device *dev)
    bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
    struct veth_priv *priv = netdev_priv(dev);

    if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
        err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);

        if (!napi_already_on) {
            err = __veth_napi_enable(dev);
                veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);

    for (i = 0; i < dev->real_num_rx_queues; i++) {
        rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
        rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
static void veth_disable_xdp(struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = 0; i < dev->real_num_rx_queues; i++)
        rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);

    if (!netif_running(dev) || !veth_gro_requested(dev))

    veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
static int veth_napi_enable_range(struct net_device *dev, int start, int end)
    struct veth_priv *priv = netdev_priv(dev);

    for (i = start; i < end; i++) {
        struct veth_rq *rq = &priv->rq[i];

        netif_napi_add(dev, &rq->xdp_napi, veth_poll);

    err = __veth_napi_enable_range(dev, start, end);
        for (i = start; i < end; i++) {
            struct veth_rq *rq = &priv->rq[i];

            netif_napi_del(&rq->xdp_napi);
static int veth_napi_enable(struct net_device *dev)
{
    return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
}
static void veth_disable_range_safe(struct net_device *dev, int start, int end)
    struct veth_priv *priv = netdev_priv(dev);

    if (priv->_xdp_prog) {
        veth_napi_del_range(dev, start, end);
        veth_disable_xdp_range(dev, start, end, false);
    } else if (veth_gro_requested(dev)) {
        veth_napi_del_range(dev, start, end);
static int veth_enable_range_safe(struct net_device *dev, int start, int end)
    struct veth_priv *priv = netdev_priv(dev);

    if (priv->_xdp_prog) {
        /* these channels are freshly initialized, napi is not on there even
         * when GRO is requested
         */
        err = veth_enable_xdp_range(dev, start, end, false);

        err = __veth_napi_enable_range(dev, start, end);
        if (err) {
            /* on error always delete the newly added napis */
            veth_disable_xdp_range(dev, start, end, true);
    } else if (veth_gro_requested(dev)) {
        return veth_napi_enable_range(dev, start, end);
static void veth_set_xdp_features(struct net_device *dev)
{
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer;

    peer = rtnl_dereference(priv->peer);
    if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
        struct veth_priv *priv_peer = netdev_priv(peer);
        xdp_features_t val = NETDEV_XDP_ACT_BASIC |
                             NETDEV_XDP_ACT_REDIRECT |
                             NETDEV_XDP_ACT_RX_SG;

        if (priv_peer->_xdp_prog || veth_gro_requested(peer))
            val |= NETDEV_XDP_ACT_NDO_XMIT |
                   NETDEV_XDP_ACT_NDO_XMIT_SG;
        xdp_set_features_flag(dev, val);
    } else {
        xdp_clear_features_flag(dev);
    }
}
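
/* XDP features are only advertised while this device has at least as many rx
 * queues as the peer has tx queues; NDO_XMIT is additionally gated on the peer
 * running NAPI (an attached program or GRO requested), since veth_xdp_xmit()
 * needs the peer's xdp_ring to be initialized.
 */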
static int veth_set_channels(struct net_device *dev,
                             struct ethtool_channels *ch)
    struct veth_priv *priv = netdev_priv(dev);
    unsigned int old_rx_count, new_rx_count;
    struct veth_priv *peer_priv;
    struct net_device *peer;

    /* sanity check. Upper bounds are already enforced by the caller */
    if (!ch->rx_count || !ch->tx_count)

    /* avoid breaking XDP, if that is enabled */
    peer = rtnl_dereference(priv->peer);
    peer_priv = peer ? netdev_priv(peer) : NULL;
    if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)

    if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)

    old_rx_count = dev->real_num_rx_queues;
    new_rx_count = ch->rx_count;
    if (netif_running(dev)) {
        /* turn device off */
        netif_carrier_off(dev);
            netif_carrier_off(peer);

    /* try to allocate new resources, as needed */
    err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);

    err = netif_set_real_num_rx_queues(dev, ch->rx_count);

    err = netif_set_real_num_tx_queues(dev, ch->tx_count);
        int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);

        /* this error condition could happen only if rx and tx change
         * in opposite directions (e.g. tx nr rises, rx nr decreases)
         * and we can't do anything to fully restore the original
         * configuration.
         */
        pr_warn("Can't restore rx queues config %d -> %d %d",
                new_rx_count, old_rx_count, err2);

    if (netif_running(dev)) {
        /* note that we need to swap the arguments WRT the enable part
         * to identify the range we have to disable
         */
        veth_disable_range_safe(dev, new_rx_count, old_rx_count);
        netif_carrier_on(dev);
            netif_carrier_on(peer);

    /* update XDP supported features */
    veth_set_xdp_features(dev);
        veth_set_xdp_features(peer);

    new_rx_count = old_rx_count;
    old_rx_count = ch->rx_count;
static int veth_open(struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer = rtnl_dereference(priv->peer);

    if (priv->_xdp_prog) {
        err = veth_enable_xdp(dev);
    } else if (veth_gro_requested(dev)) {
        err = veth_napi_enable(dev);

    if (peer->flags & IFF_UP) {
        netif_carrier_on(dev);
        netif_carrier_on(peer);

    veth_set_xdp_features(dev);
static int veth_close(struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer = rtnl_dereference(priv->peer);

    netif_carrier_off(dev);
        netif_carrier_off(peer);

    if (priv->_xdp_prog)
        veth_disable_xdp(dev);
    else if (veth_gro_requested(dev))
        veth_napi_del(dev);
static int is_valid_veth_mtu(int mtu)
{
    return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}
static int veth_alloc_queues(struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);

    priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
                        GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);

    for (i = 0; i < dev->num_rx_queues; i++) {
        priv->rq[i].dev = dev;
        u64_stats_init(&priv->rq[i].stats.syncp);
static void veth_free_queues(struct net_device *dev)
{
    struct veth_priv *priv = netdev_priv(dev);

    kvfree(priv->rq);
}
static int veth_dev_init(struct net_device *dev)
{
    netdev_lockdep_set_classes(dev);
    return veth_alloc_queues(dev);
}
static void veth_dev_free(struct net_device *dev)
{
    veth_free_queues(dev);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
    /* veth only receives frames when its peer sends one.
     * Since it has nothing to do with disabling irqs, we are guaranteed
     * never to have pending data when we poll for it, so
     * there is nothing to do here.
     *
     * We need this though so netpoll recognizes us as an interface that
     * supports polling, which enables bridge devices in virt setups to
     * still use netconsole.
     */
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
static int veth_get_iflink(const struct net_device *dev)
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer;

    peer = rcu_dereference(priv->peer);
    iflink = peer ? READ_ONCE(peer->ifindex) : 0;
static netdev_features_t veth_fix_features(struct net_device *dev,
                                           netdev_features_t features)
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer;

    peer = rtnl_dereference(priv->peer);
        struct veth_priv *peer_priv = netdev_priv(peer);

        if (peer_priv->_xdp_prog)
            features &= ~NETIF_F_GSO_SOFTWARE;
static int veth_set_features(struct net_device *dev,
                             netdev_features_t features)
    netdev_features_t changed = features ^ dev->features;
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer;

    if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)

    peer = rtnl_dereference(priv->peer);
    if (features & NETIF_F_GRO) {
        err = veth_napi_enable(dev);

            xdp_features_set_redirect_target(peer, true);
            xdp_features_clear_redirect_target(peer);
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
    struct veth_priv *peer_priv, *priv = netdev_priv(dev);
    struct net_device *peer;

    peer = rcu_dereference(priv->peer);
    if (unlikely(!peer))

    peer_priv = netdev_priv(peer);
    priv->requested_headroom = new_hr;
    new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
    dev->needed_headroom = new_hr;
    peer->needed_headroom = new_hr;
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                        struct netlink_ext_ack *extack)
    struct veth_priv *priv = netdev_priv(dev);
    struct bpf_prog *old_prog;
    struct net_device *peer;
    unsigned int max_mtu;

    old_prog = priv->_xdp_prog;
    priv->_xdp_prog = prog;
    peer = rtnl_dereference(priv->peer);

            NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");

        max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
                  peer->hard_header_len;
        /* Allow increasing the max_mtu if the program supports
         * XDP fragments.
         */
        if (prog->aux->xdp_has_frags)
            max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;

        if (peer->mtu > max_mtu) {
            NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");

        if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
            NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");

        if (dev->flags & IFF_UP) {
            err = veth_enable_xdp(dev);
                NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");

            peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
            peer->max_mtu = max_mtu;

            xdp_features_set_redirect_target(peer, true);

        if (peer && !veth_gro_requested(dev))
            xdp_features_clear_redirect_target(peer);

        if (dev->flags & IFF_UP)
            veth_disable_xdp(dev);

            peer->hw_features |= NETIF_F_GSO_SOFTWARE;
            peer->max_mtu = ETH_MAX_MTU;

        bpf_prog_put(old_prog);

    if ((!!old_prog ^ !!prog) && peer)
        netdev_update_features(peer);

    priv->_xdp_prog = old_prog;
static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
    switch (xdp->command) {
    case XDP_SETUP_PROG:
        return veth_xdp_set(dev, xdp->prog, xdp->extack);
static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
    struct veth_xdp_buff *_ctx = (void *)ctx;

    *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
                            enum xdp_rss_hash_type *rss_type)
    struct veth_xdp_buff *_ctx = (void *)ctx;
    struct sk_buff *skb = _ctx->skb;

    *hash = skb_get_hash(skb);
    *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;
static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
                                u16 *vlan_tci)
    const struct veth_xdp_buff *_ctx = (void *)ctx;
    const struct sk_buff *skb = _ctx->skb;

    err = __vlan_hwaccel_get_tag(skb, vlan_tci);

    *vlan_proto = skb->vlan_proto;
= {
1659 .ndo_init
= veth_dev_init
,
1660 .ndo_open
= veth_open
,
1661 .ndo_stop
= veth_close
,
1662 .ndo_start_xmit
= veth_xmit
,
1663 .ndo_get_stats64
= veth_get_stats64
,
1664 .ndo_set_rx_mode
= veth_set_multicast_list
,
1665 .ndo_set_mac_address
= eth_mac_addr
,
1666 #ifdef CONFIG_NET_POLL_CONTROLLER
1667 .ndo_poll_controller
= veth_poll_controller
,
1669 .ndo_get_iflink
= veth_get_iflink
,
1670 .ndo_fix_features
= veth_fix_features
,
1671 .ndo_set_features
= veth_set_features
,
1672 .ndo_features_check
= passthru_features_check
,
1673 .ndo_set_rx_headroom
= veth_set_rx_headroom
,
1674 .ndo_bpf
= veth_xdp
,
1675 .ndo_xdp_xmit
= veth_ndo_xdp_xmit
,
1676 .ndo_get_peer_dev
= veth_peer_dev
,
static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
    .xmo_rx_timestamp = veth_xdp_rx_timestamp,
    .xmo_rx_hash      = veth_xdp_rx_hash,
    .xmo_rx_vlan_tag  = veth_xdp_rx_vlan_tag,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
                       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
                       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
                       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
                       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)
static void veth_setup(struct net_device *dev)
    dev->priv_flags &= ~IFF_TX_SKB_SHARING;
    dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
    dev->priv_flags |= IFF_NO_QUEUE;
    dev->priv_flags |= IFF_PHONY_HEADROOM;
    dev->priv_flags |= IFF_DISABLE_NETPOLL;

    dev->netdev_ops = &veth_netdev_ops;
    dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
    dev->ethtool_ops = &veth_ethtool_ops;
    dev->features |= VETH_FEATURES;
    dev->vlan_features = dev->features &
                         ~(NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX |
                           NETIF_F_HW_VLAN_CTAG_RX |
                           NETIF_F_HW_VLAN_STAG_RX);
    dev->needs_free_netdev = true;
    dev->priv_destructor = veth_dev_free;
    dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
    dev->max_mtu = ETH_MAX_MTU;

    dev->hw_features = VETH_FEATURES;
    dev->hw_enc_features = VETH_FEATURES;
    dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
    netif_set_tso_max_size(dev, GSO_MAX_SIZE);
static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
                         struct netlink_ext_ack *extack)
    if (tb[IFLA_ADDRESS]) {
        if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
        if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
            return -EADDRNOTAVAIL;

    if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
static struct rtnl_link_ops veth_link_ops;

static void veth_disable_gro(struct net_device *dev)
{
    dev->features &= ~NETIF_F_GRO;
    dev->wanted_features &= ~NETIF_F_GRO;
    netdev_update_features(dev);
}
static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
    if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
        err = netif_set_real_num_tx_queues(dev, 1);

    if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
        err = netif_set_real_num_rx_queues(dev, 1);
static int veth_newlink(struct net *peer_net, struct net_device *dev,
                        struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
    struct net_device *peer;
    struct veth_priv *priv;
    char ifname[IFNAMSIZ];
    struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
    unsigned char name_assign_type;
    struct ifinfomsg *ifmp;

    /*
     * create and register peer first
     */
    if (data && data[VETH_INFO_PEER]) {
        struct nlattr *nla_peer = data[VETH_INFO_PEER];

        ifmp = nla_data(nla_peer);
        rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);

    if (ifmp && tbp[IFLA_IFNAME]) {
        nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
        name_assign_type = NET_NAME_USER;
        snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
        name_assign_type = NET_NAME_ENUM;

    peer = rtnl_create_link(peer_net, ifname, name_assign_type,
                            &veth_link_ops, tbp, extack);
        return PTR_ERR(peer);

    if (!ifmp || !tbp[IFLA_ADDRESS])
        eth_hw_addr_random(peer);

    if (ifmp && (dev->ifindex != 0))
        peer->ifindex = ifmp->ifi_index;

    netif_inherit_tso_max(peer, dev);

    err = register_netdevice(peer);
        goto err_register_peer;

    /* keep GRO disabled by default to be consistent with the established
     * veth behavior
     */
    veth_disable_gro(peer);
    netif_carrier_off(peer);

    err = rtnl_configure_link(peer, ifmp, 0, NULL);
        goto err_configure_peer;

    /*
     * note, that since we've registered new device the dev's name
     * should be re-allocated
     */
    if (tb[IFLA_ADDRESS] == NULL)
        eth_hw_addr_random(dev);

    if (tb[IFLA_IFNAME])
        nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
        snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

    err = register_netdevice(dev);
        goto err_register_dev;

    netif_carrier_off(dev);

    /*
     * tie the devices together
     */
    priv = netdev_priv(dev);
    rcu_assign_pointer(priv->peer, peer);
    err = veth_init_queues(dev, tb);

    priv = netdev_priv(peer);
    rcu_assign_pointer(priv->peer, dev);
    err = veth_init_queues(peer, tb);

    veth_disable_gro(dev);
    /* update XDP supported features */
    veth_set_xdp_features(dev);
    veth_set_xdp_features(peer);

    unregister_netdevice(dev);
    unregister_netdevice(peer);
static void veth_dellink(struct net_device *dev, struct list_head *head)
    struct veth_priv *priv;
    struct net_device *peer;

    priv = netdev_priv(dev);
    peer = rtnl_dereference(priv->peer);

    /* Note: dellink() is called from default_device_exit_batch(),
     * before a rcu_synchronize() point. The devices are guaranteed
     * not to be freed before one RCU grace period.
     */
    RCU_INIT_POINTER(priv->peer, NULL);
    unregister_netdevice_queue(dev, head);

        priv = netdev_priv(peer);
        RCU_INIT_POINTER(priv->peer, NULL);
        unregister_netdevice_queue(peer, head);
static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
    [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};
static struct net *veth_get_link_net(const struct net_device *dev)
{
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *peer = rtnl_dereference(priv->peer);

    return peer ? dev_net(peer) : dev_net(dev);
}
static unsigned int veth_get_num_queues(void)
    /* enforce the same queue limit as rtnl_create_link */
    int queues = num_possible_cpus();
static struct rtnl_link_ops veth_link_ops = {
    .priv_size         = sizeof(struct veth_priv),
    .setup             = veth_setup,
    .validate          = veth_validate,
    .newlink           = veth_newlink,
    .dellink           = veth_dellink,
    .policy            = veth_policy,
    .peer_type         = VETH_INFO_PEER,
    .maxtype           = VETH_INFO_MAX,
    .get_link_net      = veth_get_link_net,
    .get_num_tx_queues = veth_get_num_queues,
    .get_num_rx_queues = veth_get_num_queues,
};
static __init int veth_init(void)
{
    return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
    rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);
MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);