// SPDX-License-Identifier: GPL-2.0-only
/* Unstable Conntrack Helpers for XDP and TC-BPF hook
 *
 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
 * allowed to break compatibility for these functions since the interface they
 * are exposed through to BPF programs is explicitly unstable.
 */
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
/* bpf_ct_opts - Options for CT lookup helpers
 *
 * Members:
 * @netns_id   - Specify the network namespace for lookup
 *		 Values:
 *		   BPF_F_CURRENT_NETNS (-1)
 *		     Use namespace associated with ctx (xdp_md, __sk_buff)
 *		   [0, S32_MAX]
 *		     Network Namespace ID
 * @error      - Out parameter, set for any errors encountered
 *		 Values:
 *		   -EINVAL - Passed NULL for bpf_tuple pointer
 *		   -EINVAL - opts->reserved is not 0
 *		   -EINVAL - netns_id is less than -1
 *		   -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
 *		   -EINVAL - opts->ct_zone_id set when
 *			     opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
 *		   -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
 *		   -ENONET - No network namespace found for netns_id
 *		   -ENOENT - Conntrack lookup could not find entry for tuple
 *		   -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
 *				   or sizeof(tuple->ipv6)
 * @l4proto    - Layer 4 protocol
 *		 Values:
 *		   IPPROTO_TCP, IPPROTO_UDP
 * @dir:       - connection tracking tuple direction.
 * @ct_zone_id - connection tracking zone id.
 * @ct_zone_dir - connection tracking zone direction.
 * @reserved   - Reserved member, will be reused for more options in future
 */
/* Expected size of a fully-populated struct bpf_ct_opts; the legacy 12-byte
 * layout (without zone fields) is also accepted by the helpers below.
 */
enum {
	NF_BPF_CT_OPTS_SZ = 16,
};
67 static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple
*bpf_tuple
,
68 u32 tuple_len
, u8 protonum
, u8 dir
,
69 struct nf_conntrack_tuple
*tuple
)
71 union nf_inet_addr
*src
= dir
? &tuple
->dst
.u3
: &tuple
->src
.u3
;
72 union nf_inet_addr
*dst
= dir
? &tuple
->src
.u3
: &tuple
->dst
.u3
;
73 union nf_conntrack_man_proto
*sport
= dir
? (void *)&tuple
->dst
.u
75 union nf_conntrack_man_proto
*dport
= dir
? &tuple
->src
.u
76 : (void *)&tuple
->dst
.u
;
78 if (unlikely(protonum
!= IPPROTO_TCP
&& protonum
!= IPPROTO_UDP
))
81 memset(tuple
, 0, sizeof(*tuple
));
84 case sizeof(bpf_tuple
->ipv4
):
85 tuple
->src
.l3num
= AF_INET
;
86 src
->ip
= bpf_tuple
->ipv4
.saddr
;
87 sport
->tcp
.port
= bpf_tuple
->ipv4
.sport
;
88 dst
->ip
= bpf_tuple
->ipv4
.daddr
;
89 dport
->tcp
.port
= bpf_tuple
->ipv4
.dport
;
91 case sizeof(bpf_tuple
->ipv6
):
92 tuple
->src
.l3num
= AF_INET6
;
93 memcpy(src
->ip6
, bpf_tuple
->ipv6
.saddr
, sizeof(bpf_tuple
->ipv6
.saddr
));
94 sport
->tcp
.port
= bpf_tuple
->ipv6
.sport
;
95 memcpy(dst
->ip6
, bpf_tuple
->ipv6
.daddr
, sizeof(bpf_tuple
->ipv6
.daddr
));
96 dport
->tcp
.port
= bpf_tuple
->ipv6
.dport
;
101 tuple
->dst
.protonum
= protonum
;
102 tuple
->dst
.dir
= dir
;
107 static struct nf_conn
*
108 __bpf_nf_ct_alloc_entry(struct net
*net
, struct bpf_sock_tuple
*bpf_tuple
,
109 u32 tuple_len
, struct bpf_ct_opts
*opts
, u32 opts_len
,
112 struct nf_conntrack_tuple otuple
, rtuple
;
113 struct nf_conntrack_zone ct_zone
;
117 if (!opts
|| !bpf_tuple
)
118 return ERR_PTR(-EINVAL
);
119 if (!(opts_len
== NF_BPF_CT_OPTS_SZ
|| opts_len
== 12))
120 return ERR_PTR(-EINVAL
);
121 if (opts_len
== NF_BPF_CT_OPTS_SZ
) {
122 if (opts
->reserved
[0] || opts
->reserved
[1] || opts
->reserved
[2])
123 return ERR_PTR(-EINVAL
);
125 if (opts
->ct_zone_id
)
126 return ERR_PTR(-EINVAL
);
129 if (unlikely(opts
->netns_id
< BPF_F_CURRENT_NETNS
))
130 return ERR_PTR(-EINVAL
);
132 err
= bpf_nf_ct_tuple_parse(bpf_tuple
, tuple_len
, opts
->l4proto
,
133 IP_CT_DIR_ORIGINAL
, &otuple
);
137 err
= bpf_nf_ct_tuple_parse(bpf_tuple
, tuple_len
, opts
->l4proto
,
138 IP_CT_DIR_REPLY
, &rtuple
);
142 if (opts
->netns_id
>= 0) {
143 net
= get_net_ns_by_id(net
, opts
->netns_id
);
145 return ERR_PTR(-ENONET
);
148 if (opts_len
== NF_BPF_CT_OPTS_SZ
) {
149 if (opts
->ct_zone_dir
== 0)
150 opts
->ct_zone_dir
= NF_CT_DEFAULT_ZONE_DIR
;
151 nf_ct_zone_init(&ct_zone
,
152 opts
->ct_zone_id
, opts
->ct_zone_dir
, 0);
154 ct_zone
= nf_ct_zone_dflt
;
157 ct
= nf_conntrack_alloc(net
, &ct_zone
, &otuple
, &rtuple
,
162 memset(&ct
->proto
, 0, sizeof(ct
->proto
));
163 __nf_ct_set_timeout(ct
, timeout
* HZ
);
166 if (opts
->netns_id
>= 0)
172 static struct nf_conn
*__bpf_nf_ct_lookup(struct net
*net
,
173 struct bpf_sock_tuple
*bpf_tuple
,
174 u32 tuple_len
, struct bpf_ct_opts
*opts
,
177 struct nf_conntrack_tuple_hash
*hash
;
178 struct nf_conntrack_tuple tuple
;
179 struct nf_conntrack_zone ct_zone
;
183 if (!opts
|| !bpf_tuple
)
184 return ERR_PTR(-EINVAL
);
185 if (!(opts_len
== NF_BPF_CT_OPTS_SZ
|| opts_len
== 12))
186 return ERR_PTR(-EINVAL
);
187 if (opts_len
== NF_BPF_CT_OPTS_SZ
) {
188 if (opts
->reserved
[0] || opts
->reserved
[1] || opts
->reserved
[2])
189 return ERR_PTR(-EINVAL
);
191 if (opts
->ct_zone_id
)
192 return ERR_PTR(-EINVAL
);
194 if (unlikely(opts
->l4proto
!= IPPROTO_TCP
&& opts
->l4proto
!= IPPROTO_UDP
))
195 return ERR_PTR(-EPROTO
);
196 if (unlikely(opts
->netns_id
< BPF_F_CURRENT_NETNS
))
197 return ERR_PTR(-EINVAL
);
199 err
= bpf_nf_ct_tuple_parse(bpf_tuple
, tuple_len
, opts
->l4proto
,
200 IP_CT_DIR_ORIGINAL
, &tuple
);
204 if (opts
->netns_id
>= 0) {
205 net
= get_net_ns_by_id(net
, opts
->netns_id
);
207 return ERR_PTR(-ENONET
);
210 if (opts_len
== NF_BPF_CT_OPTS_SZ
) {
211 if (opts
->ct_zone_dir
== 0)
212 opts
->ct_zone_dir
= NF_CT_DEFAULT_ZONE_DIR
;
213 nf_ct_zone_init(&ct_zone
,
214 opts
->ct_zone_id
, opts
->ct_zone_dir
, 0);
216 ct_zone
= nf_ct_zone_dflt
;
219 hash
= nf_conntrack_find_get(net
, &ct_zone
, &tuple
);
220 if (opts
->netns_id
>= 0)
223 return ERR_PTR(-ENOENT
);
225 ct
= nf_ct_tuplehash_to_ctrack(hash
);
226 opts
->dir
= NF_CT_DIRECTION(hash
);
231 BTF_ID_LIST(btf_nf_conn_ids
)
232 BTF_ID(struct, nf_conn
)
233 BTF_ID(struct, nf_conn___init
)
235 /* Check writes into `struct nf_conn` */
236 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log
*log
,
237 const struct bpf_reg_state
*reg
,
240 const struct btf_type
*ncit
, *nct
, *t
;
243 ncit
= btf_type_by_id(reg
->btf
, btf_nf_conn_ids
[1]);
244 nct
= btf_type_by_id(reg
->btf
, btf_nf_conn_ids
[0]);
245 t
= btf_type_by_id(reg
->btf
, reg
->btf_id
);
246 if (t
!= nct
&& t
!= ncit
) {
247 bpf_log(log
, "only read is supported\n");
251 /* `struct nf_conn` and `struct nf_conn___init` have the same layout
252 * so we are safe to simply merge offset checks here
255 #if defined(CONFIG_NF_CONNTRACK_MARK)
256 case offsetof(struct nf_conn
, mark
):
257 end
= offsetofend(struct nf_conn
, mark
);
261 bpf_log(log
, "no write support to nf_conn at off %d\n", off
);
265 if (off
+ size
> end
) {
267 "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
275 __bpf_kfunc_start_defs();
277 /* bpf_xdp_ct_alloc - Allocate a new CT entry
280 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
282 * @bpf_tuple - Pointer to memory representing the tuple to look up
284 * @tuple__sz - Length of the tuple structure
285 * Must be one of sizeof(bpf_tuple->ipv4) or
286 * sizeof(bpf_tuple->ipv6)
287 * @opts - Additional options for allocation (documented above)
289 * @opts__sz - Length of the bpf_ct_opts structure
290 * Must be NF_BPF_CT_OPTS_SZ (16) or 12
292 __bpf_kfunc
struct nf_conn___init
*
293 bpf_xdp_ct_alloc(struct xdp_md
*xdp_ctx
, struct bpf_sock_tuple
*bpf_tuple
,
294 u32 tuple__sz
, struct bpf_ct_opts
*opts
, u32 opts__sz
)
296 struct xdp_buff
*ctx
= (struct xdp_buff
*)xdp_ctx
;
297 struct nf_conn
*nfct
;
299 nfct
= __bpf_nf_ct_alloc_entry(dev_net(ctx
->rxq
->dev
), bpf_tuple
, tuple__sz
,
303 opts
->error
= PTR_ERR(nfct
);
307 return (struct nf_conn___init
*)nfct
;
310 /* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
314 * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
316 * @bpf_tuple - Pointer to memory representing the tuple to look up
318 * @tuple__sz - Length of the tuple structure
319 * Must be one of sizeof(bpf_tuple->ipv4) or
320 * sizeof(bpf_tuple->ipv6)
321 * @opts - Additional options for lookup (documented above)
323 * @opts__sz - Length of the bpf_ct_opts structure
324 * Must be NF_BPF_CT_OPTS_SZ (16) or 12
326 __bpf_kfunc
struct nf_conn
*
327 bpf_xdp_ct_lookup(struct xdp_md
*xdp_ctx
, struct bpf_sock_tuple
*bpf_tuple
,
328 u32 tuple__sz
, struct bpf_ct_opts
*opts
, u32 opts__sz
)
330 struct xdp_buff
*ctx
= (struct xdp_buff
*)xdp_ctx
;
331 struct net
*caller_net
;
332 struct nf_conn
*nfct
;
334 caller_net
= dev_net(ctx
->rxq
->dev
);
335 nfct
= __bpf_nf_ct_lookup(caller_net
, bpf_tuple
, tuple__sz
, opts
, opts__sz
);
338 opts
->error
= PTR_ERR(nfct
);
344 /* bpf_skb_ct_alloc - Allocate a new CT entry
347 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
349 * @bpf_tuple - Pointer to memory representing the tuple to look up
351 * @tuple__sz - Length of the tuple structure
352 * Must be one of sizeof(bpf_tuple->ipv4) or
353 * sizeof(bpf_tuple->ipv6)
354 * @opts - Additional options for allocation (documented above)
356 * @opts__sz - Length of the bpf_ct_opts structure
357 * Must be NF_BPF_CT_OPTS_SZ (16) or 12
359 __bpf_kfunc
struct nf_conn___init
*
360 bpf_skb_ct_alloc(struct __sk_buff
*skb_ctx
, struct bpf_sock_tuple
*bpf_tuple
,
361 u32 tuple__sz
, struct bpf_ct_opts
*opts
, u32 opts__sz
)
363 struct sk_buff
*skb
= (struct sk_buff
*)skb_ctx
;
364 struct nf_conn
*nfct
;
367 net
= skb
->dev
? dev_net(skb
->dev
) : sock_net(skb
->sk
);
368 nfct
= __bpf_nf_ct_alloc_entry(net
, bpf_tuple
, tuple__sz
, opts
, opts__sz
, 10);
371 opts
->error
= PTR_ERR(nfct
);
375 return (struct nf_conn___init
*)nfct
;
378 /* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
382 * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
384 * @bpf_tuple - Pointer to memory representing the tuple to look up
386 * @tuple__sz - Length of the tuple structure
387 * Must be one of sizeof(bpf_tuple->ipv4) or
388 * sizeof(bpf_tuple->ipv6)
389 * @opts - Additional options for lookup (documented above)
391 * @opts__sz - Length of the bpf_ct_opts structure
392 * Must be NF_BPF_CT_OPTS_SZ (16) or 12
394 __bpf_kfunc
struct nf_conn
*
395 bpf_skb_ct_lookup(struct __sk_buff
*skb_ctx
, struct bpf_sock_tuple
*bpf_tuple
,
396 u32 tuple__sz
, struct bpf_ct_opts
*opts
, u32 opts__sz
)
398 struct sk_buff
*skb
= (struct sk_buff
*)skb_ctx
;
399 struct net
*caller_net
;
400 struct nf_conn
*nfct
;
402 caller_net
= skb
->dev
? dev_net(skb
->dev
) : sock_net(skb
->sk
);
403 nfct
= __bpf_nf_ct_lookup(caller_net
, bpf_tuple
, tuple__sz
, opts
, opts__sz
);
406 opts
->error
= PTR_ERR(nfct
);
412 /* bpf_ct_insert_entry - Add the provided entry into a CT map
414 * This must be invoked for referenced PTR_TO_BTF_ID.
416 * @nfct - Pointer to referenced nf_conn___init object, obtained
417 * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
419 __bpf_kfunc
struct nf_conn
*bpf_ct_insert_entry(struct nf_conn___init
*nfct_i
)
421 struct nf_conn
*nfct
= (struct nf_conn
*)nfct_i
;
424 if (!nf_ct_is_confirmed(nfct
))
425 nfct
->timeout
+= nfct_time_stamp
;
426 nfct
->status
|= IPS_CONFIRMED
;
427 err
= nf_conntrack_hash_check_insert(nfct
);
429 nf_conntrack_free(nfct
);
435 /* bpf_ct_release - Release acquired nf_conn object
437 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
438 * the program if any references remain in the program in all of the explored
442 * @nf_conn - Pointer to referenced nf_conn object, obtained using
443 * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
445 __bpf_kfunc
void bpf_ct_release(struct nf_conn
*nfct
)
450 /* bpf_ct_set_timeout - Set timeout of allocated nf_conn
452 * Sets the default timeout of newly allocated nf_conn before insertion.
453 * This helper must be invoked for refcounted pointer to nf_conn___init.
456 * @nfct - Pointer to referenced nf_conn object, obtained using
457 * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
458 * @timeout - Timeout in msecs.
460 __bpf_kfunc
void bpf_ct_set_timeout(struct nf_conn___init
*nfct
, u32 timeout
)
462 __nf_ct_set_timeout((struct nf_conn
*)nfct
, msecs_to_jiffies(timeout
));
465 /* bpf_ct_change_timeout - Change timeout of inserted nf_conn
467 * Change timeout associated of the inserted or looked up nf_conn.
468 * This helper must be invoked for refcounted pointer to nf_conn.
471 * @nfct - Pointer to referenced nf_conn object, obtained using
472 * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
473 * @timeout - New timeout in msecs.
475 __bpf_kfunc
int bpf_ct_change_timeout(struct nf_conn
*nfct
, u32 timeout
)
477 return __nf_ct_change_timeout(nfct
, msecs_to_jiffies(timeout
));
480 /* bpf_ct_set_status - Set status field of allocated nf_conn
482 * Set the status field of the newly allocated nf_conn before insertion.
483 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
486 * @nfct - Pointer to referenced nf_conn object, obtained using
487 * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
488 * @status - New status value.
490 __bpf_kfunc
int bpf_ct_set_status(const struct nf_conn___init
*nfct
, u32 status
)
492 return nf_ct_change_status_common((struct nf_conn
*)nfct
, status
);
495 /* bpf_ct_change_status - Change status of inserted nf_conn
497 * Change the status field of the provided connection tracking entry.
498 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
501 * @nfct - Pointer to referenced nf_conn object, obtained using
502 * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
503 * @status - New status value.
505 __bpf_kfunc
int bpf_ct_change_status(struct nf_conn
*nfct
, u32 status
)
507 return nf_ct_change_status_common(nfct
, status
);
510 __bpf_kfunc_end_defs();
512 BTF_KFUNCS_START(nf_ct_kfunc_set
)
513 BTF_ID_FLAGS(func
, bpf_xdp_ct_alloc
, KF_ACQUIRE
| KF_RET_NULL
)
514 BTF_ID_FLAGS(func
, bpf_xdp_ct_lookup
, KF_ACQUIRE
| KF_RET_NULL
)
515 BTF_ID_FLAGS(func
, bpf_skb_ct_alloc
, KF_ACQUIRE
| KF_RET_NULL
)
516 BTF_ID_FLAGS(func
, bpf_skb_ct_lookup
, KF_ACQUIRE
| KF_RET_NULL
)
517 BTF_ID_FLAGS(func
, bpf_ct_insert_entry
, KF_ACQUIRE
| KF_RET_NULL
| KF_RELEASE
)
518 BTF_ID_FLAGS(func
, bpf_ct_release
, KF_RELEASE
)
519 BTF_ID_FLAGS(func
, bpf_ct_set_timeout
, KF_TRUSTED_ARGS
)
520 BTF_ID_FLAGS(func
, bpf_ct_change_timeout
, KF_TRUSTED_ARGS
)
521 BTF_ID_FLAGS(func
, bpf_ct_set_status
, KF_TRUSTED_ARGS
)
522 BTF_ID_FLAGS(func
, bpf_ct_change_status
, KF_TRUSTED_ARGS
)
523 BTF_KFUNCS_END(nf_ct_kfunc_set
)
525 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set
= {
526 .owner
= THIS_MODULE
,
527 .set
= &nf_ct_kfunc_set
,
530 int register_nf_conntrack_bpf(void)
534 ret
= register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP
, &nf_conntrack_kfunc_set
);
535 ret
= ret
?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS
, &nf_conntrack_kfunc_set
);
537 mutex_lock(&nf_conn_btf_access_lock
);
538 nfct_btf_struct_access
= _nf_conntrack_btf_struct_access
;
539 mutex_unlock(&nf_conn_btf_access_lock
);
545 void cleanup_nf_conntrack_bpf(void)
547 mutex_lock(&nf_conn_btf_access_lock
);
548 nfct_btf_struct_access
= NULL
;
549 mutex_unlock(&nf_conn_btf_access_lock
);