// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 */
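
/* lwt_bpf: lightweight-tunnel (LWT) encapsulation that runs BPF programs
 * from the dst input, output and xmit hooks of a route. Programs are
 * attached per route via netlink (LWTUNNEL_ENCAP_BPF) and may drop,
 * redirect, re-encapsulate or reroute packets.
 */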

#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
#include <net/inet_dscp.h>

struct bpf_lwt_prog {
        struct bpf_prog *prog;
        char *name;
};

struct bpf_lwt {
        struct bpf_lwt_prog in;
        struct bpf_lwt_prog out;
        struct bpf_lwt_prog xmit;
        int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
        return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true
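
/* run_lwt_bpf() runs one attached program on @skb and maps its return code:
 * BPF_OK and BPF_LWT_REROUTE are passed through, BPF_REDIRECT is only
 * honoured when @can_redirect is true (the xmit hook), BPF_DROP frees the
 * skb and returns -EPERM, anything else frees the skb and returns -EINVAL.
 */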

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
                       struct dst_entry *dst, bool can_redirect)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int ret;

        /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
         * BPF prog and skb_do_redirect().
         */
        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        bpf_compute_data_pointers(skb);
        ret = bpf_prog_run_save_cb(lwt->prog, skb);

        switch (ret) {
        case BPF_OK:
        case BPF_LWT_REROUTE:
                break;

        case BPF_REDIRECT:
                if (unlikely(!can_redirect)) {
                        pr_warn_once("Illegal redirect return code in prog %s\n",
                                     lwt->name ? : "<unknown>");
                        ret = BPF_OK;
                } else {
                        skb_reset_mac_header(skb);
                        skb_do_redirect(skb);
                        ret = BPF_REDIRECT;
                }
                break;

        case BPF_DROP:
                kfree_skb(skb);
                ret = -EPERM;
                break;

        default:
                pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
                kfree_skb(skb);
                ret = -EINVAL;
                break;
        }

        bpf_net_ctx_clear(bpf_net_ctx);
        local_bh_enable();

        return ret;
}
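
/* After an input program returns BPF_LWT_REROUTE (typically because it pushed
 * a new IP header with bpf_lwt_push_encap()), the original route no longer
 * matches the packet, so drop the dst and redo the input route lookup for the
 * new outer header.
 */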

static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
        int err = -EINVAL;

        if (skb->protocol == htons(ETH_P_IP)) {
                struct net_device *dev = skb_dst(skb)->dev;
                struct iphdr *iph = ip_hdr(skb);

                dev_hold(dev);
                skb_dst_drop(skb);
                err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                           ip4h_dscp(iph), dev);
                dev_put(dev);
        } else if (skb->protocol == htons(ETH_P_IPV6)) {
                skb_dst_drop(skb);
                err = ipv6_stub->ipv6_route_input(skb);
        } else {
                err = -EAFNOSUPPORT;
        }

        if (err)
                goto err;
        return dst_input(skb);

err:
        kfree_skb(skb);
        return err;
}

static int bpf_input(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct bpf_lwt *bpf;
        int ret;

        bpf = bpf_lwt_lwtunnel(dst->lwtstate);
        if (bpf->in.prog) {
                ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
                if (ret < 0)
                        return ret;
                if (ret == BPF_LWT_REROUTE)
                        return bpf_lwt_input_reroute(skb);
        }

        if (unlikely(!dst->lwtstate->orig_input)) {
                kfree_skb(skb);
                return -EINVAL;
        }

        return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct bpf_lwt *bpf;
        int ret;

        bpf = bpf_lwt_lwtunnel(dst->lwtstate);
        if (bpf->out.prog) {
                ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
                if (ret < 0)
                        return ret;
        }

        if (unlikely(!dst->lwtstate->orig_output)) {
                pr_warn_once("orig_output not set on dst for prog %s\n",
                             bpf->out.name);
                kfree_skb(skb);
                return -EINVAL;
        }

        return dst->lwtstate->orig_output(net, sk, skb);
}

static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
        if (skb_headroom(skb) < hh_len) {
                int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

                if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
                        return -ENOMEM;
        }

        return 0;
}
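
/* After an xmit program returned BPF_LWT_REROUTE, the packet may carry a new
 * outer header, so redo a full L3 route lookup (honouring any socket device
 * binding and L3 master device), attach the new dst and transmit directly.
 */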

static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
        struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
        int oif = l3mdev ? l3mdev->ifindex : 0;
        struct dst_entry *dst = NULL;
        int err = -EAFNOSUPPORT;
        struct sock *sk;
        struct net *net;
        bool ipv4;

        if (skb->protocol == htons(ETH_P_IP))
                ipv4 = true;
        else if (skb->protocol == htons(ETH_P_IPV6))
                ipv4 = false;
        else
                goto err;

        sk = sk_to_full_sk(skb->sk);
        if (sk) {
                if (sk->sk_bound_dev_if)
                        oif = sk->sk_bound_dev_if;
                net = sock_net(sk);
        } else {
                net = dev_net(skb_dst(skb)->dev);
        }

        if (ipv4) {
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {};
                struct rtable *rt;

                fl4.flowi4_oif = oif;
                fl4.flowi4_mark = skb->mark;
                fl4.flowi4_uid = sock_net_uid(net, sk);
                fl4.flowi4_tos = iph->tos & INET_DSCP_MASK;
                fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
                fl4.flowi4_proto = iph->protocol;
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;

                rt = ip_route_output_key(net, &fl4);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        goto err;
                }
                dst = &rt->dst;
        } else {
                struct ipv6hdr *iph6 = ipv6_hdr(skb);
                struct flowi6 fl6 = {};

                fl6.flowi6_oif = oif;
                fl6.flowi6_mark = skb->mark;
                fl6.flowi6_uid = sock_net_uid(net, sk);
                fl6.flowlabel = ip6_flowinfo(iph6);
                fl6.flowi6_proto = iph6->nexthdr;
                fl6.daddr = iph6->daddr;
                fl6.saddr = iph6->saddr;

                dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
                if (IS_ERR(dst)) {
                        err = PTR_ERR(dst);
                        goto err;
                }
        }
        if (unlikely(dst->error)) {
                err = dst->error;
                dst_release(dst);
                goto err;
        }

        /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
         * was done for the previous dst, so we are doing it here again, in
         * case the new dst needs much more space. The call below is a noop
         * if there is enough header space in skb.
         */
        err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
        if (unlikely(err))
                goto err;

        skb_dst_drop(skb);
        skb_dst_set(skb, dst);

        err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
        if (unlikely(err))
                return net_xmit_errno(err);

        /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
        return LWTUNNEL_XMIT_DONE;

err:
        kfree_skb(skb);
        return err;
}
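
/* xmit hook: the program may redirect (handled inside run_lwt_bpf()), ask for
 * a reroute, or return BPF_OK, in which case transmission continues on the
 * original dst once the headroom for the L2 header has been re-checked.
 */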

static int bpf_xmit(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct bpf_lwt *bpf;

        bpf = bpf_lwt_lwtunnel(dst->lwtstate);
        if (bpf->xmit.prog) {
                int hh_len = dst->dev->hard_header_len;
                __be16 proto = skb->protocol;
                int ret;

                ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
                switch (ret) {
                case BPF_OK:
                        /* If the header changed, e.g. via bpf_lwt_push_encap,
                         * BPF_LWT_REROUTE below should have been used if the
                         * protocol was also changed.
                         */
                        if (skb->protocol != proto) {
                                kfree_skb(skb);
                                return -EINVAL;
                        }
                        /* If the header was expanded, headroom might be too
                         * small for L2 header to come, expand as needed.
                         */
                        ret = xmit_check_hhlen(skb, hh_len);
                        if (unlikely(ret))
                                return ret;

                        return LWTUNNEL_XMIT_CONTINUE;
                case BPF_REDIRECT:
                        return LWTUNNEL_XMIT_DONE;
                case BPF_LWT_REROUTE:
                        return bpf_lwt_xmit_reroute(skb);
                default:
                        return ret;
                }
        }

        return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
        if (prog->prog)
                bpf_prog_put(prog->prog);

        kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
        struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

        bpf_lwt_prog_destroy(&bpf->in);
        bpf_lwt_prog_destroy(&bpf->out);
        bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
        [LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
        [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
                                .len = MAX_PROG_NAME },
};
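
/* Parse one nested LWT_BPF_{IN,OUT,XMIT} attribute: duplicate the
 * user-supplied program name and take a reference on the program file
 * descriptor, verifying it is of the expected LWT program type.
 */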

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
                          enum bpf_prog_type type)
{
        struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
        struct bpf_prog *p;
        int ret;
        u32 fd;

        ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
                                          bpf_prog_policy, NULL);
        if (ret < 0)
                return ret;

        if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
                return -EINVAL;

        prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
        if (!prog->name)
                return -ENOMEM;

        fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
        p = bpf_prog_get_type(fd, type);
        if (IS_ERR(p))
                return PTR_ERR(p);

        prog->prog = p;

        return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
        [LWT_BPF_IN]            = { .type = NLA_NESTED, },
        [LWT_BPF_OUT]           = { .type = NLA_NESTED, },
        [LWT_BPF_XMIT]          = { .type = NLA_NESTED, },
        [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
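
/* These attributes map onto the iproute2 "encap bpf" route option; roughly
 * (exact syntax depends on the iproute2 version, shown only as illustration):
 *
 *   ip route add 192.0.2.0/24 encap bpf xmit obj lwt_prog.o section lwt_xmit \
 *           headroom 28 dev eth0
 *
 * where lwt_prog.o and lwt_xmit are placeholder object and section names.
 */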

static int bpf_build_state(struct net *net, struct nlattr *nla,
                           unsigned int family, const void *cfg,
                           struct lwtunnel_state **ts,
                           struct netlink_ext_ack *extack)
{
        struct nlattr *tb[LWT_BPF_MAX + 1];
        struct lwtunnel_state *newts;
        struct bpf_lwt *bpf;
        int ret;

        if (family != AF_INET && family != AF_INET6)
                return -EAFNOSUPPORT;

        ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
                                          extack);
        if (ret < 0)
                return ret;

        if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
                return -EINVAL;

        newts = lwtunnel_state_alloc(sizeof(*bpf));
        if (!newts)
                return -ENOMEM;

        newts->type = LWTUNNEL_ENCAP_BPF;
        bpf = bpf_lwt_lwtunnel(newts);

        if (tb[LWT_BPF_IN]) {
                newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
                ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
                                     BPF_PROG_TYPE_LWT_IN);
                if (ret < 0)
                        goto errout;
        }

        if (tb[LWT_BPF_OUT]) {
                newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
                ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
                                     BPF_PROG_TYPE_LWT_OUT);
                if (ret < 0)
                        goto errout;
        }

        if (tb[LWT_BPF_XMIT]) {
                newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
                ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
                                     BPF_PROG_TYPE_LWT_XMIT);
                if (ret < 0)
                        goto errout;
        }

        if (tb[LWT_BPF_XMIT_HEADROOM]) {
                u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

                if (headroom > LWT_BPF_MAX_HEADROOM) {
                        ret = -ERANGE;
                        goto errout;
                }

                newts->headroom = headroom;
        }

        bpf->family = family;
        *ts = newts;

        return 0;

errout:
        bpf_destroy_state(newts);
        kfree(newts);
        return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
                             struct bpf_lwt_prog *prog)
{
        struct nlattr *nest;

        if (!prog->prog)
                return 0;

        nest = nla_nest_start_noflag(skb, attr);
        if (!nest)
                return -EMSGSIZE;

        if (prog->name &&
            nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
                return -EMSGSIZE;

        return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
        struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

        if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
            bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
            bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
                return -EMSGSIZE;

        return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
        int nest_len = nla_total_size(sizeof(struct nlattr)) +
                       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
                       0;

        return nest_len + /* LWT_BPF_IN */
               nest_len + /* LWT_BPF_OUT */
               nest_len + /* LWT_BPF_XMIT */
               0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
        /* FIXME:
         * The LWT state is currently rebuilt for delete requests which
         * results in a new bpf_prog instance. Comparing names for now.
         */
        if (!a->name && !b->name)
                return 0;

        if (!a->name || !b->name)
                return 1;

        return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
        struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
        struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

        return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
               bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
               bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
        .build_state    = bpf_build_state,
        .destroy_state  = bpf_destroy_state,
        .input          = bpf_input,
        .output         = bpf_output,
        .xmit           = bpf_xmit,
        .fill_encap     = bpf_fill_encap_info,
        .get_encap_size = bpf_encap_nlsize,
        .cmp_encap      = bpf_encap_cmp,
        .owner          = THIS_MODULE,
};
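
/* When an IP header is pushed in front of a GSO skb, the GSO metadata has to
 * be adjusted so segmentation still produces correctly sized segments: mark
 * the skb as a tunnel of the matching type, shrink gso_size by the pushed
 * header length and force the segment count to be recomputed.
 */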

static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
                           int encap_len)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        gso_type |= SKB_GSO_DODGY;
        shinfo->gso_type |= gso_type;
        skb_decrease_gso_size(shinfo, encap_len);
        shinfo->gso_segs = 0;
        return 0;
}

static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
        int next_hdr_offset;
        void *next_hdr;
        __u8 protocol;

        /* SCTP and UDP_L4 gso need more nuanced handling than what
         * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
         * So at the moment only TCP GSO packets are let through.
         */
        if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
                return -ENOTSUPP;

        if (ipv4) {
                protocol = ip_hdr(skb)->protocol;
                next_hdr_offset = sizeof(struct iphdr);
                next_hdr = skb_network_header(skb) + next_hdr_offset;
        } else {
                protocol = ipv6_hdr(skb)->nexthdr;
                next_hdr_offset = sizeof(struct ipv6hdr);
                next_hdr = skb_network_header(skb) + next_hdr_offset;
        }

        switch (protocol) {
        case IPPROTO_GRE:
                next_hdr_offset += sizeof(struct gre_base_hdr);
                if (next_hdr_offset > encap_len)
                        return -EINVAL;

                if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
                        return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
                                               encap_len);
                return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

        case IPPROTO_UDP:
                next_hdr_offset += sizeof(struct udphdr);
                if (next_hdr_offset > encap_len)
                        return -EINVAL;

                if (((struct udphdr *)next_hdr)->check)
                        return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
                                               encap_len);
                return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

        case IPPROTO_IPIP:
        case IPPROTO_IPV6:
                if (ipv4)
                        return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
                else
                        return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

        default:
                return -EPROTONOSUPPORT;
        }
}
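
/* bpf_lwt_push_ip_encap() is the backend of the bpf_lwt_push_encap() helper
 * for BPF_LWT_ENCAP_IP: it validates and prepends the IPv4/IPv6 header built
 * by the program and fixes up skb metadata. A minimal caller sketch (separate
 * BPF object, illustrative names, header construction elided):
 *
 *   SEC("lwt_xmit")
 *   int do_ip_encap(struct __sk_buff *skb)
 *   {
 *           struct iphdr iph;
 *
 *           build_outer_ipv4_hdr(&iph);    hypothetical helper filling iph
 *           if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &iph, sizeof(iph)))
 *                   return BPF_DROP;
 *           return BPF_LWT_REROUTE;
 *   }
 */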

int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
        struct iphdr *iph;
        bool ipv4;
        int err;

        if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
                return -EINVAL;

        /* validate protocol and length */
        iph = (struct iphdr *)hdr;
        if (iph->version == 4) {
                ipv4 = true;
                if (unlikely(len < iph->ihl * 4))
                        return -EINVAL;
        } else if (iph->version == 6) {
                ipv4 = false;
                if (unlikely(len < sizeof(struct ipv6hdr)))
                        return -EINVAL;
        } else {
                return -EINVAL;
        }

        if (ingress)
                err = skb_cow_head(skb, len + skb->mac_len);
        else
                err = skb_cow_head(skb,
                                   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
        if (unlikely(err))
                return err;

        /* push the encap headers and fix pointers */
        skb_reset_inner_headers(skb);
        skb_reset_inner_mac_header(skb); /* mac header is not yet set */
        skb_set_inner_protocol(skb, skb->protocol);
        skb->encapsulation = 1;
        skb_push(skb, len);
        if (ingress)
                skb_postpush_rcsum(skb, iph, len);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), hdr, len);
        bpf_compute_data_pointers(skb);
        skb_clear_hash(skb);

        if (ipv4) {
                skb->protocol = htons(ETH_P_IP);
                iph = ip_hdr(skb);

                if (!iph->check)
                        iph->check = ip_fast_csum((unsigned char *)iph,
                                                  iph->ihl);
        } else {
                skb->protocol = htons(ETH_P_IPV6);
        }

        if (skb_is_gso(skb))
                return handle_gso_encap(skb, ipv4, len);

        return 0;
}

static int __init bpf_lwt_init(void)
{
        return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)