// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 */
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
#include <net/inet_dscp.h>
struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};
#define MAX_PROG_NAME 256
static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}
#define NO_REDIRECT false
#define CAN_REDIRECT true
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
	int ret;

	/* Disabling BH is needed to protect per-CPU bpf_redirect_info between
	 * BPF prog and skb_do_redirect().
	 */
	local_bh_disable();
	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
	case BPF_LWT_REROUTE:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			skb_reset_mac_header(skb);
			skb_do_redirect(skb);
			ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	bpf_net_ctx_clear(bpf_net_ctx);
	local_bh_enable();

	return ret;
}
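
/* For orientation: the only contract a program run by run_lwt_bpf() has to
 * honour is the BPF_OK/BPF_DROP/BPF_REDIRECT/BPF_LWT_REROUTE return codes
 * handled above. A minimal sketch of such a program on the BPF side (not part
 * of this file; section name follows the usual libbpf convention):
 *
 *	SEC("lwt_in")
 *	int pass_all(struct __sk_buff *skb)
 *	{
 *		return BPF_OK;	// accept and hand back to orig_input
 *	}
 */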
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	int err = -EINVAL;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct net_device *dev = skb_dst(skb)->dev;
		const struct iphdr *iph = ip_hdr(skb);

		dev_hold(dev);
		skb_dst_drop(skb);
		reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					      ip4h_dscp(iph), dev);
		err = reason ? -EINVAL : 0;
		dev_put(dev);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		skb_dst_drop(skb);
		err = ipv6_stub->ipv6_route_input(skb);
	} else {
		err = -EAFNOSUPPORT;
		goto err;
	}

	if (err)
		goto err;
	return dst_input(skb);

err:
	kfree_skb(skb);
	return err;
}
static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
		if (ret == BPF_LWT_REROUTE)
			return bpf_lwt_input_reroute(skb);
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}
static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}
static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}
static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
	int oif = l3mdev ? l3mdev->ifindex : 0;
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto err;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			goto err;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err;
		}
	}
	if (unlikely(dst->error)) {
		err = dst->error;
		dst_release(dst);
		goto err;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto err;

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return net_xmit_errno(err);

	/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);
	return err;
}
static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int hh_len = dst->dev->hard_header_len;
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header changed, e.g. via bpf_lwt_push_encap,
			 * BPF_LWT_REROUTE below should have been used if the
			 * protocol was also changed.
			 */
			if (skb->protocol != proto) {
				kfree_skb(skb);
				return -EINVAL;
			}
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb, hh_len);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
			return bpf_lwt_xmit_reroute(skb);
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}
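
/* The CAN_REDIRECT case above is what lets an lwt_xmit program hand the
 * packet to another device: the program calls bpf_redirect() to record the
 * target ifindex and then returns BPF_REDIRECT so run_lwt_bpf() invokes
 * skb_do_redirect(). A hedged sketch of the BPF side (IFINDEX is a
 * placeholder, not a value from this file):
 *
 *	SEC("lwt_xmit")
 *	int xmit_redirect(struct __sk_buff *skb)
 *	{
 *		bpf_redirect(IFINDEX, 0);	// remember the target device
 *		return BPF_REDIRECT;		// ask the kernel to redirect
 *	}
 */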
static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}
static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}
static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};
static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
					  bpf_prog_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}
static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};
static int bpf_build_state(struct net *net, struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
					  extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}
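
/* Userspace normally reaches bpf_build_state() via iproute2, which fills in
 * the LWT_BPF_* attributes parsed above. Roughly (exact option spelling
 * depends on the iproute2 version; shown only for illustration, object file
 * and section names are placeholders):
 *
 *	ip route add 10.1.0.0/24 dev eth0 \
 *		encap bpf xmit obj lwt_prog.o section lwt_xmit headroom 20
 */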
static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start_noflag(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}
static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}
static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}
static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}
static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}
static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};
static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
			   int encap_len)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	gso_type |= SKB_GSO_DODGY;
	shinfo->gso_type |= gso_type;
	skb_decrease_gso_size(shinfo, encap_len);
	shinfo->gso_segs = 0;
	return 0;
}
static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
	int next_hdr_offset;
	void *next_hdr;
	__u8 protocol;

	/* SCTP and UDP_L4 gso need more nuanced handling than what
	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
	 * So at the moment only TCP GSO packets are let through.
	 */
	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -ENOTSUPP;

	if (ipv4) {
		protocol = ip_hdr(skb)->protocol;
		next_hdr_offset = sizeof(struct iphdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	} else {
		protocol = ipv6_hdr(skb)->nexthdr;
		next_hdr_offset = sizeof(struct ipv6hdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	}

	switch (protocol) {
	case IPPROTO_GRE:
		next_hdr_offset += sizeof(struct gre_base_hdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

	case IPPROTO_UDP:
		next_hdr_offset += sizeof(struct udphdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct udphdr *)next_hdr)->check)
			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

	case IPPROTO_IPIP:
		return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);

	case IPPROTO_IPV6:
		return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

	default:
		return -EPROTONOSUPPORT;
	}
}
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
	struct iphdr *iph;
	bool ipv4;
	int err;

	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
		return -EINVAL;

	/* validate protocol and length */
	iph = (struct iphdr *)hdr;
	if (iph->version == 4) {
		ipv4 = true;
		if (unlikely(len < iph->ihl * 4))
			return -EINVAL;
	} else if (iph->version == 6) {
		ipv4 = false;
		if (unlikely(len < sizeof(struct ipv6hdr)))
			return -EINVAL;
	} else {
		return -EINVAL;
	}

	if (ingress)
		err = skb_cow_head(skb, len + skb->mac_len);
	else
		err = skb_cow_head(skb,
				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
	if (unlikely(err))
		return err;

	/* push the encap headers and fix pointers */
	skb_reset_inner_headers(skb);
	skb_reset_inner_mac_header(skb);  /* mac header is not yet set */
	skb_set_inner_protocol(skb, skb->protocol);
	skb->encapsulation = 1;
	skb_push(skb, len);
	if (ingress)
		skb_postpush_rcsum(skb, iph, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	bpf_compute_data_pointers(skb);
	skb_clear_hash(skb);

	if (ipv4) {
		skb->protocol = htons(ETH_P_IP);
		iph = ip_hdr(skb);

		if (!iph->check)
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
	} else {
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (skb_is_gso(skb))
		return handle_gso_encap(skb, ipv4, len);

	return 0;
}
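
/* bpf_lwt_push_ip_encap() backs the BPF_LWT_ENCAP_IP mode of the
 * bpf_lwt_push_encap() helper. A hedged sketch of a caller on the BPF side
 * (addresses are placeholders; a real program also fills in tot_len,
 * checksum, etc.):
 *
 *	SEC("lwt_xmit")
 *	int encap_ipip(struct __sk_buff *skb)
 *	{
 *		struct iphdr hdr = {
 *			.version  = 4,
 *			.ihl      = 5,
 *			.ttl      = 64,
 *			.protocol = IPPROTO_IPIP,
 *			.saddr    = bpf_htonl(0x0a000001),	// 10.0.0.1
 *			.daddr    = bpf_htonl(0x0a000002),	// 10.0.0.2
 *		};
 *
 *		if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
 *			return BPF_DROP;
 *		return BPF_LWT_REROUTE;	// ask for a fresh route lookup
 *	}
 */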
static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)