1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
5 #include <linux/kernel.h>
6 #include <linux/module.h>
7 #include <linux/skbuff.h>
8 #include <linux/types.h>
10 #include <net/lwtunnel.h>
12 #include <net/ip6_route.h>
13 #include <net/ipv6_stubs.h>
16 struct bpf_prog
*prog
;
21 struct bpf_lwt_prog in
;
22 struct bpf_lwt_prog out
;
23 struct bpf_lwt_prog xmit
;
27 #define MAX_PROG_NAME 256
29 static inline struct bpf_lwt
*bpf_lwt_lwtunnel(struct lwtunnel_state
*lwt
)
31 return (struct bpf_lwt
*)lwt
->data
;
34 #define NO_REDIRECT false
35 #define CAN_REDIRECT true
37 static int run_lwt_bpf(struct sk_buff
*skb
, struct bpf_lwt_prog
*lwt
,
38 struct dst_entry
*dst
, bool can_redirect
)
42 /* Preempt disable is needed to protect per-cpu redirect_info between
43 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
44 * access to maps strictly require a rcu_read_lock() for protection,
45 * mixing with BH RCU lock doesn't work.
48 bpf_compute_data_pointers(skb
);
49 ret
= bpf_prog_run_save_cb(lwt
->prog
, skb
);
57 if (unlikely(!can_redirect
)) {
58 pr_warn_once("Illegal redirect return code in prog %s\n",
59 lwt
->name
? : "<unknown>");
62 skb_reset_mac_header(skb
);
63 ret
= skb_do_redirect(skb
);
75 pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret
);
86 static int bpf_lwt_input_reroute(struct sk_buff
*skb
)
90 if (skb
->protocol
== htons(ETH_P_IP
)) {
91 struct net_device
*dev
= skb_dst(skb
)->dev
;
92 struct iphdr
*iph
= ip_hdr(skb
);
96 err
= ip_route_input_noref(skb
, iph
->daddr
, iph
->saddr
,
99 } else if (skb
->protocol
== htons(ETH_P_IPV6
)) {
101 err
= ipv6_stub
->ipv6_route_input(skb
);
108 return dst_input(skb
);
115 static int bpf_input(struct sk_buff
*skb
)
117 struct dst_entry
*dst
= skb_dst(skb
);
121 bpf
= bpf_lwt_lwtunnel(dst
->lwtstate
);
123 ret
= run_lwt_bpf(skb
, &bpf
->in
, dst
, NO_REDIRECT
);
126 if (ret
== BPF_LWT_REROUTE
)
127 return bpf_lwt_input_reroute(skb
);
130 if (unlikely(!dst
->lwtstate
->orig_input
)) {
135 return dst
->lwtstate
->orig_input(skb
);
138 static int bpf_output(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
140 struct dst_entry
*dst
= skb_dst(skb
);
144 bpf
= bpf_lwt_lwtunnel(dst
->lwtstate
);
146 ret
= run_lwt_bpf(skb
, &bpf
->out
, dst
, NO_REDIRECT
);
151 if (unlikely(!dst
->lwtstate
->orig_output
)) {
152 pr_warn_once("orig_output not set on dst for prog %s\n",
158 return dst
->lwtstate
->orig_output(net
, sk
, skb
);
161 static int xmit_check_hhlen(struct sk_buff
*skb
)
163 int hh_len
= skb_dst(skb
)->dev
->hard_header_len
;
165 if (skb_headroom(skb
) < hh_len
) {
166 int nhead
= HH_DATA_ALIGN(hh_len
- skb_headroom(skb
));
168 if (pskb_expand_head(skb
, nhead
, 0, GFP_ATOMIC
))
175 static int bpf_lwt_xmit_reroute(struct sk_buff
*skb
)
177 struct net_device
*l3mdev
= l3mdev_master_dev_rcu(skb_dst(skb
)->dev
);
178 int oif
= l3mdev
? l3mdev
->ifindex
: 0;
179 struct dst_entry
*dst
= NULL
;
180 int err
= -EAFNOSUPPORT
;
185 if (skb
->protocol
== htons(ETH_P_IP
))
187 else if (skb
->protocol
== htons(ETH_P_IPV6
))
192 sk
= sk_to_full_sk(skb
->sk
);
194 if (sk
->sk_bound_dev_if
)
195 oif
= sk
->sk_bound_dev_if
;
198 net
= dev_net(skb_dst(skb
)->dev
);
202 struct iphdr
*iph
= ip_hdr(skb
);
203 struct flowi4 fl4
= {};
206 fl4
.flowi4_oif
= oif
;
207 fl4
.flowi4_mark
= skb
->mark
;
208 fl4
.flowi4_uid
= sock_net_uid(net
, sk
);
209 fl4
.flowi4_tos
= RT_TOS(iph
->tos
);
210 fl4
.flowi4_flags
= FLOWI_FLAG_ANYSRC
;
211 fl4
.flowi4_proto
= iph
->protocol
;
212 fl4
.daddr
= iph
->daddr
;
213 fl4
.saddr
= iph
->saddr
;
215 rt
= ip_route_output_key(net
, &fl4
);
222 struct ipv6hdr
*iph6
= ipv6_hdr(skb
);
223 struct flowi6 fl6
= {};
225 fl6
.flowi6_oif
= oif
;
226 fl6
.flowi6_mark
= skb
->mark
;
227 fl6
.flowi6_uid
= sock_net_uid(net
, sk
);
228 fl6
.flowlabel
= ip6_flowinfo(iph6
);
229 fl6
.flowi6_proto
= iph6
->nexthdr
;
230 fl6
.daddr
= iph6
->daddr
;
231 fl6
.saddr
= iph6
->saddr
;
233 dst
= ipv6_stub
->ipv6_dst_lookup_flow(net
, skb
->sk
, &fl6
, NULL
);
239 if (unlikely(dst
->error
)) {
245 /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
246 * was done for the previous dst, so we are doing it here again, in
247 * case the new dst needs much more space. The call below is a noop
248 * if there is enough header space in skb.
250 err
= skb_cow_head(skb
, LL_RESERVED_SPACE(dst
->dev
));
255 skb_dst_set(skb
, dst
);
257 err
= dst_output(dev_net(skb_dst(skb
)->dev
), skb
->sk
, skb
);
261 /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
262 return LWTUNNEL_XMIT_DONE
;
269 static int bpf_xmit(struct sk_buff
*skb
)
271 struct dst_entry
*dst
= skb_dst(skb
);
274 bpf
= bpf_lwt_lwtunnel(dst
->lwtstate
);
275 if (bpf
->xmit
.prog
) {
276 __be16 proto
= skb
->protocol
;
279 ret
= run_lwt_bpf(skb
, &bpf
->xmit
, dst
, CAN_REDIRECT
);
282 /* If the header changed, e.g. via bpf_lwt_push_encap,
283 * BPF_LWT_REROUTE below should have been used if the
284 * protocol was also changed.
286 if (skb
->protocol
!= proto
) {
290 /* If the header was expanded, headroom might be too
291 * small for L2 header to come, expand as needed.
293 ret
= xmit_check_hhlen(skb
);
297 return LWTUNNEL_XMIT_CONTINUE
;
299 return LWTUNNEL_XMIT_DONE
;
300 case BPF_LWT_REROUTE
:
301 return bpf_lwt_xmit_reroute(skb
);
307 return LWTUNNEL_XMIT_CONTINUE
;
310 static void bpf_lwt_prog_destroy(struct bpf_lwt_prog
*prog
)
313 bpf_prog_put(prog
->prog
);
318 static void bpf_destroy_state(struct lwtunnel_state
*lwt
)
320 struct bpf_lwt
*bpf
= bpf_lwt_lwtunnel(lwt
);
322 bpf_lwt_prog_destroy(&bpf
->in
);
323 bpf_lwt_prog_destroy(&bpf
->out
);
324 bpf_lwt_prog_destroy(&bpf
->xmit
);
327 static const struct nla_policy bpf_prog_policy
[LWT_BPF_PROG_MAX
+ 1] = {
328 [LWT_BPF_PROG_FD
] = { .type
= NLA_U32
, },
329 [LWT_BPF_PROG_NAME
] = { .type
= NLA_NUL_STRING
,
330 .len
= MAX_PROG_NAME
},
333 static int bpf_parse_prog(struct nlattr
*attr
, struct bpf_lwt_prog
*prog
,
334 enum bpf_prog_type type
)
336 struct nlattr
*tb
[LWT_BPF_PROG_MAX
+ 1];
341 ret
= nla_parse_nested_deprecated(tb
, LWT_BPF_PROG_MAX
, attr
,
342 bpf_prog_policy
, NULL
);
346 if (!tb
[LWT_BPF_PROG_FD
] || !tb
[LWT_BPF_PROG_NAME
])
349 prog
->name
= nla_memdup(tb
[LWT_BPF_PROG_NAME
], GFP_ATOMIC
);
353 fd
= nla_get_u32(tb
[LWT_BPF_PROG_FD
]);
354 p
= bpf_prog_get_type(fd
, type
);
363 static const struct nla_policy bpf_nl_policy
[LWT_BPF_MAX
+ 1] = {
364 [LWT_BPF_IN
] = { .type
= NLA_NESTED
, },
365 [LWT_BPF_OUT
] = { .type
= NLA_NESTED
, },
366 [LWT_BPF_XMIT
] = { .type
= NLA_NESTED
, },
367 [LWT_BPF_XMIT_HEADROOM
] = { .type
= NLA_U32
},
370 static int bpf_build_state(struct net
*net
, struct nlattr
*nla
,
371 unsigned int family
, const void *cfg
,
372 struct lwtunnel_state
**ts
,
373 struct netlink_ext_ack
*extack
)
375 struct nlattr
*tb
[LWT_BPF_MAX
+ 1];
376 struct lwtunnel_state
*newts
;
380 if (family
!= AF_INET
&& family
!= AF_INET6
)
381 return -EAFNOSUPPORT
;
383 ret
= nla_parse_nested_deprecated(tb
, LWT_BPF_MAX
, nla
, bpf_nl_policy
,
388 if (!tb
[LWT_BPF_IN
] && !tb
[LWT_BPF_OUT
] && !tb
[LWT_BPF_XMIT
])
391 newts
= lwtunnel_state_alloc(sizeof(*bpf
));
395 newts
->type
= LWTUNNEL_ENCAP_BPF
;
396 bpf
= bpf_lwt_lwtunnel(newts
);
398 if (tb
[LWT_BPF_IN
]) {
399 newts
->flags
|= LWTUNNEL_STATE_INPUT_REDIRECT
;
400 ret
= bpf_parse_prog(tb
[LWT_BPF_IN
], &bpf
->in
,
401 BPF_PROG_TYPE_LWT_IN
);
406 if (tb
[LWT_BPF_OUT
]) {
407 newts
->flags
|= LWTUNNEL_STATE_OUTPUT_REDIRECT
;
408 ret
= bpf_parse_prog(tb
[LWT_BPF_OUT
], &bpf
->out
,
409 BPF_PROG_TYPE_LWT_OUT
);
414 if (tb
[LWT_BPF_XMIT
]) {
415 newts
->flags
|= LWTUNNEL_STATE_XMIT_REDIRECT
;
416 ret
= bpf_parse_prog(tb
[LWT_BPF_XMIT
], &bpf
->xmit
,
417 BPF_PROG_TYPE_LWT_XMIT
);
422 if (tb
[LWT_BPF_XMIT_HEADROOM
]) {
423 u32 headroom
= nla_get_u32(tb
[LWT_BPF_XMIT_HEADROOM
]);
425 if (headroom
> LWT_BPF_MAX_HEADROOM
) {
430 newts
->headroom
= headroom
;
433 bpf
->family
= family
;
439 bpf_destroy_state(newts
);
444 static int bpf_fill_lwt_prog(struct sk_buff
*skb
, int attr
,
445 struct bpf_lwt_prog
*prog
)
452 nest
= nla_nest_start_noflag(skb
, attr
);
457 nla_put_string(skb
, LWT_BPF_PROG_NAME
, prog
->name
))
460 return nla_nest_end(skb
, nest
);
463 static int bpf_fill_encap_info(struct sk_buff
*skb
, struct lwtunnel_state
*lwt
)
465 struct bpf_lwt
*bpf
= bpf_lwt_lwtunnel(lwt
);
467 if (bpf_fill_lwt_prog(skb
, LWT_BPF_IN
, &bpf
->in
) < 0 ||
468 bpf_fill_lwt_prog(skb
, LWT_BPF_OUT
, &bpf
->out
) < 0 ||
469 bpf_fill_lwt_prog(skb
, LWT_BPF_XMIT
, &bpf
->xmit
) < 0)
475 static int bpf_encap_nlsize(struct lwtunnel_state
*lwtstate
)
477 int nest_len
= nla_total_size(sizeof(struct nlattr
)) +
478 nla_total_size(MAX_PROG_NAME
) + /* LWT_BPF_PROG_NAME */
481 return nest_len
+ /* LWT_BPF_IN */
482 nest_len
+ /* LWT_BPF_OUT */
483 nest_len
+ /* LWT_BPF_XMIT */
487 static int bpf_lwt_prog_cmp(struct bpf_lwt_prog
*a
, struct bpf_lwt_prog
*b
)
490 * The LWT state is currently rebuilt for delete requests which
491 * results in a new bpf_prog instance. Comparing names for now.
493 if (!a
->name
&& !b
->name
)
496 if (!a
->name
|| !b
->name
)
499 return strcmp(a
->name
, b
->name
);
502 static int bpf_encap_cmp(struct lwtunnel_state
*a
, struct lwtunnel_state
*b
)
504 struct bpf_lwt
*a_bpf
= bpf_lwt_lwtunnel(a
);
505 struct bpf_lwt
*b_bpf
= bpf_lwt_lwtunnel(b
);
507 return bpf_lwt_prog_cmp(&a_bpf
->in
, &b_bpf
->in
) ||
508 bpf_lwt_prog_cmp(&a_bpf
->out
, &b_bpf
->out
) ||
509 bpf_lwt_prog_cmp(&a_bpf
->xmit
, &b_bpf
->xmit
);
512 static const struct lwtunnel_encap_ops bpf_encap_ops
= {
513 .build_state
= bpf_build_state
,
514 .destroy_state
= bpf_destroy_state
,
516 .output
= bpf_output
,
518 .fill_encap
= bpf_fill_encap_info
,
519 .get_encap_size
= bpf_encap_nlsize
,
520 .cmp_encap
= bpf_encap_cmp
,
521 .owner
= THIS_MODULE
,
524 static int handle_gso_type(struct sk_buff
*skb
, unsigned int gso_type
,
527 struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
529 gso_type
|= SKB_GSO_DODGY
;
530 shinfo
->gso_type
|= gso_type
;
531 skb_decrease_gso_size(shinfo
, encap_len
);
532 shinfo
->gso_segs
= 0;
536 static int handle_gso_encap(struct sk_buff
*skb
, bool ipv4
, int encap_len
)
542 /* SCTP and UDP_L4 gso need more nuanced handling than what
543 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
544 * So at the moment only TCP GSO packets are let through.
546 if (!(skb_shinfo(skb
)->gso_type
& (SKB_GSO_TCPV4
| SKB_GSO_TCPV6
)))
550 protocol
= ip_hdr(skb
)->protocol
;
551 next_hdr_offset
= sizeof(struct iphdr
);
552 next_hdr
= skb_network_header(skb
) + next_hdr_offset
;
554 protocol
= ipv6_hdr(skb
)->nexthdr
;
555 next_hdr_offset
= sizeof(struct ipv6hdr
);
556 next_hdr
= skb_network_header(skb
) + next_hdr_offset
;
561 next_hdr_offset
+= sizeof(struct gre_base_hdr
);
562 if (next_hdr_offset
> encap_len
)
565 if (((struct gre_base_hdr
*)next_hdr
)->flags
& GRE_CSUM
)
566 return handle_gso_type(skb
, SKB_GSO_GRE_CSUM
,
568 return handle_gso_type(skb
, SKB_GSO_GRE
, encap_len
);
571 next_hdr_offset
+= sizeof(struct udphdr
);
572 if (next_hdr_offset
> encap_len
)
575 if (((struct udphdr
*)next_hdr
)->check
)
576 return handle_gso_type(skb
, SKB_GSO_UDP_TUNNEL_CSUM
,
578 return handle_gso_type(skb
, SKB_GSO_UDP_TUNNEL
, encap_len
);
583 return handle_gso_type(skb
, SKB_GSO_IPXIP4
, encap_len
);
585 return handle_gso_type(skb
, SKB_GSO_IPXIP6
, encap_len
);
588 return -EPROTONOSUPPORT
;
592 int bpf_lwt_push_ip_encap(struct sk_buff
*skb
, void *hdr
, u32 len
, bool ingress
)
598 if (unlikely(len
< sizeof(struct iphdr
) || len
> LWT_BPF_MAX_HEADROOM
))
601 /* validate protocol and length */
602 iph
= (struct iphdr
*)hdr
;
603 if (iph
->version
== 4) {
605 if (unlikely(len
< iph
->ihl
* 4))
607 } else if (iph
->version
== 6) {
609 if (unlikely(len
< sizeof(struct ipv6hdr
)))
616 err
= skb_cow_head(skb
, len
+ skb
->mac_len
);
618 err
= skb_cow_head(skb
,
619 len
+ LL_RESERVED_SPACE(skb_dst(skb
)->dev
));
623 /* push the encap headers and fix pointers */
624 skb_reset_inner_headers(skb
);
625 skb_reset_inner_mac_header(skb
); /* mac header is not yet set */
626 skb_set_inner_protocol(skb
, skb
->protocol
);
627 skb
->encapsulation
= 1;
630 skb_postpush_rcsum(skb
, iph
, len
);
631 skb_reset_network_header(skb
);
632 memcpy(skb_network_header(skb
), hdr
, len
);
633 bpf_compute_data_pointers(skb
);
637 skb
->protocol
= htons(ETH_P_IP
);
641 iph
->check
= ip_fast_csum((unsigned char *)iph
,
644 skb
->protocol
= htons(ETH_P_IPV6
);
648 return handle_gso_encap(skb
, ipv4
, len
);
653 static int __init
bpf_lwt_init(void)
655 return lwtunnel_encap_add_ops(&bpf_encap_ops
, LWTUNNEL_ENCAP_BPF
);
658 subsys_initcall(bpf_lwt_init
)