// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h> /* for ipv4 options. */
#include <net/inet_dscp.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>

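/* Expression private data: the flowtable that accepted flows are added to. */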
struct nft_flow_offload {
	struct nft_flowtable	*flowtable;
};

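/* A dst with xfrm state must go through the xfrm output path; everything
 * else can be transmitted via its neighbour entry.
 */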
static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
{
	if (dst_xfrm(dst))
		return FLOW_OFFLOAD_XMIT_XFRM;

	return FLOW_OFFLOAD_XMIT_NEIGH;
}

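/* Default (neighbour) route setup for one direction: record the reply-side
 * input ifindex and this side's dst cache and transmit type.
 */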
static void nft_default_forward_path(struct nf_flow_route *route,
				     struct dst_entry *dst_cache,
				     enum ip_conntrack_dir dir)
{
	route->tuple[!dir].in.ifindex	= dst_cache->dev->ifindex;
	route->tuple[dir].dst		= dst_cache;
	route->tuple[dir].xmit_type	= nft_xmit_type(dst_cache);
}

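/* The direct transmit path requires a real, non-loopback Ethernet device
 * with a valid unicast MAC address.
 */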
static bool nft_is_valid_ether_device(const struct net_device *dev)
{
	if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
	    dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
		return false;

	return true;
}

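/* Resolve the next hop's hardware address and let the stack describe the
 * forward path (bridge, VLAN, PPPoE, DSA) below the dst's device.
 */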
static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
				     const struct dst_entry *dst_cache,
				     const struct nf_conn *ct,
				     enum ip_conntrack_dir dir, u8 *ha,
				     struct net_device_path_stack *stack)
{
	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
	struct net_device *dev = dst_cache->dev;
	struct neighbour *n;
	u8 nud_state;

	if (!nft_is_valid_ether_device(dev))
		goto out;

	n = dst_neigh_lookup(dst_cache, daddr);
	if (!n)
		return -1;

	read_lock_bh(&n->lock);
	nud_state = n->nud_state;
	ether_addr_copy(ha, n->ha);
	read_unlock_bh(&n->lock);
	neigh_release(n);

	if (!(nud_state & NUD_VALID))
		return -1;

out:
	return dev_fill_forward_path(dev, ha, stack);
}

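/* Forwarding state collected while walking the device path stack. */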
struct nft_forward_info {
	const struct net_device *indev;
	const struct net_device *outdev;
	const struct net_device *hw_outdev;
	struct id {
		__u16	id;
		__be16	proto;
	} encap[NF_FLOW_TABLE_ENCAP_MAX];
	u8 num_encaps;
	u8 ingress_vlans;
	u8 h_source[ETH_ALEN];
	u8 h_dest[ETH_ALEN];
	enum flow_offload_xmit_type xmit_type;
};

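/* Condense the device path stack into nft_forward_info: real ingress and
 * egress devices, up to NF_FLOW_TABLE_ENCAP_MAX encapsulation tags, and
 * whether the direct transmit path can be used.
 */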
static void nft_dev_path_info(const struct net_device_path_stack *stack,
			      struct nft_forward_info *info,
			      unsigned char *ha, struct nf_flowtable *flowtable)
{
	const struct net_device_path *path;
	int i;

	memcpy(info->h_dest, ha, ETH_ALEN);

	for (i = 0; i < stack->num_paths; i++) {
		path = &stack->path[i];
		switch (path->type) {
		case DEV_PATH_ETHERNET:
		case DEV_PATH_DSA:
		case DEV_PATH_VLAN:
		case DEV_PATH_PPPOE:
			info->indev = path->dev;
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			if (path->type == DEV_PATH_ETHERNET)
				break;
			if (path->type == DEV_PATH_DSA) {
				i = stack->num_paths;
				break;
			}

			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
				info->indev = NULL;
				break;
			}
			if (!info->outdev)
				info->outdev = path->dev;
			info->encap[info->num_encaps].id = path->encap.id;
			info->encap[info->num_encaps].proto = path->encap.proto;
			info->num_encaps++;
			if (path->type == DEV_PATH_PPPOE)
				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
			break;
		case DEV_PATH_BRIDGE:
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			switch (path->bridge.vlan_mode) {
			case DEV_PATH_BR_VLAN_UNTAG_HW:
				info->ingress_vlans |= BIT(info->num_encaps - 1);
				break;
			case DEV_PATH_BR_VLAN_TAG:
				info->encap[info->num_encaps].id = path->bridge.vlan_id;
				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
				info->num_encaps++;
				break;
			case DEV_PATH_BR_VLAN_UNTAG:
				info->num_encaps--;
				break;
			case DEV_PATH_BR_VLAN_KEEP:
				break;
			}
			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
			break;
		default:
			info->indev = NULL;
			break;
		}
	}
	if (!info->outdev)
		info->outdev = info->indev;

	info->hw_outdev = info->indev;

	if (nf_flowtable_hw_offload(flowtable) &&
	    nft_is_valid_ether_device(info->indev))
		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
}

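/* The resolved input device must be one of the flowtable's hook devices. */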
static bool nft_flowtable_find_dev(const struct net_device *dev,
				   struct nft_flowtable *ft)
{
	struct nft_hook *hook;
	bool found = false;

	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
		if (hook->ops.dev != dev)
			continue;

		found = true;
		break;
	}

	return found;
}

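/* Switch one direction of the route over to the direct transmit path,
 * provided the complete forward path could be resolved.
 */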
static void nft_dev_forward_path(struct nf_flow_route *route,
				 const struct nf_conn *ct,
				 enum ip_conntrack_dir dir,
				 struct nft_flowtable *ft)
{
	const struct dst_entry *dst = route->tuple[dir].dst;
	struct net_device_path_stack stack;
	struct nft_forward_info info = {};
	unsigned char ha[ETH_ALEN];
	int i;

	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
		nft_dev_path_info(&stack, &info, ha, &ft->data);

	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
		return;

	route->tuple[!dir].in.ifindex = info.indev->ifindex;
	for (i = 0; i < info.num_encaps; i++) {
		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
	}
	route->tuple[!dir].in.num_encaps = info.num_encaps;
	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;

	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
		route->tuple[dir].out.ifindex = info.outdev->ifindex;
		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
		route->tuple[dir].xmit_type = info.xmit_type;
	}
}

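/* Build the route for both flow directions: keep the current dst for the
 * packet's direction, look up the dst for the reply direction, and try to
 * upgrade both neighbour transmit paths to direct transmission.
 */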
static int nft_flow_route(const struct nft_pktinfo *pkt,
			  const struct nf_conn *ct,
			  struct nf_flow_route *route,
			  enum ip_conntrack_dir dir,
			  struct nft_flowtable *ft)
{
	struct dst_entry *this_dst = skb_dst(pkt->skb);
	struct dst_entry *other_dst = NULL;
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	switch (nft_pf(pkt)) {
	case NFPROTO_IPV4:
		fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
		fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
		fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
		fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
		fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
		fl.u.ip4.flowi4_mark = pkt->skb->mark;
		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		break;
	case NFPROTO_IPV6:
		fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
		fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
		fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
		fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
		fl.u.ip6.flowi6_mark = pkt->skb->mark;
		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
		break;
	}

	if (!dst_hold_safe(this_dst))
		return -ENOENT;

	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
	if (!other_dst) {
		dst_release(this_dst);
		return -ENOENT;
	}

	nft_default_forward_path(route, this_dst, dir);
	nft_default_forward_path(route, other_dst, !dir);

	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH &&
	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
		nft_dev_forward_path(route, ct, dir, ft);
		nft_dev_forward_path(route, ct, !dir, ft);
	}

	return 0;
}

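/* The flowtable fast path handles neither IPsec policies nor IPv4 options. */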
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
	if (skb_sec_path(skb))
		return true;

	if (family == NFPROTO_IPV4) {
		const struct ip_options *opt;

		opt = &(IPCB(skb)->opt);

		if (unlikely(opt->optlen))
			return true;
	}

	return false;
}

static void flow_offload_ct_tcp(struct nf_conn *ct)
{
	/* conntrack will not see all packets, disable tcp window validation. */
	spin_lock_bh(&ct->lock);
	ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	spin_unlock_bh(&ct->lock);
}

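/* Rule evaluation: offload the flow if its conntrack state is suitable.
 * On any failure the verdict is NFT_BREAK, so ruleset evaluation simply
 * continues without the offload.
 */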
static void nft_flow_offload_eval(const struct nft_expr *expr,
				  struct nft_regs *regs,
				  const struct nft_pktinfo *pkt)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	struct nf_flowtable *flowtable = &priv->flowtable->data;
	struct tcphdr _tcph, *tcph = NULL;
	struct nf_flow_route route = {};
	enum ip_conntrack_info ctinfo;
	struct flow_offload *flow;
	enum ip_conntrack_dir dir;
	struct nf_conn *ct;
	int ret;

	if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
		goto out;

	ct = nf_ct_get(pkt->skb, &ctinfo);
	if (!ct)
		goto out;

	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
	case IPPROTO_TCP:
		tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
					  sizeof(_tcph), &_tcph);
		if (unlikely(!tcph || tcph->fin || tcph->rst ||
			     !nf_conntrack_tcp_established(ct)))
			goto out;
		break;
	case IPPROTO_UDP:
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE: {
		struct nf_conntrack_tuple *tuple;

		if (ct->status & IPS_NAT_MASK)
			goto out;
		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		/* No support for GRE v1 */
		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
			goto out;
		break;
	}
#endif
	default:
		goto out;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
		goto out;

	if (!nf_ct_is_confirmed(ct))
		goto out;

	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
		goto err_flow_route;

	flow = flow_offload_alloc(ct);
	if (!flow)
		goto err_flow_alloc;

	flow_offload_route_init(flow, &route);
	if (tcph)
		flow_offload_ct_tcp(ct);

	__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
	ret = flow_offload_add(flowtable, flow);
	if (ret < 0)
		goto err_flow_add;

	return;

err_flow_add:
	flow_offload_free(flow);
err_flow_alloc:
	dst_release(route.tuple[dir].dst);
	dst_release(route.tuple[!dir].dst);
err_flow_route:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
out:
	regs->verdict.code = NFT_BREAK;
}

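/* This expression is only valid in forward-hook chains of the ipv4, ipv6
 * and inet families.
 */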
static int nft_flow_offload_validate(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	unsigned int hook_mask = (1 << NF_INET_FORWARD);

	if (ctx->family != NFPROTO_IPV4 &&
	    ctx->family != NFPROTO_IPV6 &&
	    ctx->family != NFPROTO_INET)
		return -EOPNOTSUPP;

	return nft_chain_validate_hooks(ctx->chain, hook_mask);
}

static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
	[NFTA_FLOW_TABLE_NAME]	= { .type = NLA_STRING,
				    .len = NFT_NAME_MAXLEN - 1 },
};

static int nft_flow_offload_init(const struct nft_ctx *ctx,
				 const struct nft_expr *expr,
				 const struct nlattr * const tb[])
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	u8 genmask = nft_genmask_next(ctx->net);
	struct nft_flowtable *flowtable;

	if (!tb[NFTA_FLOW_TABLE_NAME])
		return -EINVAL;

	flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
					 tb[NFTA_FLOW_TABLE_NAME], genmask);
	if (IS_ERR(flowtable))
		return PTR_ERR(flowtable);

	if (!nft_use_inc(&flowtable->use))
		return -EMFILE;

	priv->flowtable = flowtable;

	return nf_ct_netns_get(ctx->net, ctx->family);
}

static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
					const struct nft_expr *expr,
					enum nft_trans_phase phase)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
}

static void nft_flow_offload_activate(const struct nft_ctx *ctx,
				      const struct nft_expr *expr)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nft_use_inc_restore(&priv->flowtable->use);
}

static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	nf_ct_netns_put(ctx->net, ctx->family);
}

static int nft_flow_offload_dump(struct sk_buff *skb,
				 const struct nft_expr *expr, bool reset)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -1;
}

static struct nft_expr_type nft_flow_offload_type;
static const struct nft_expr_ops nft_flow_offload_ops = {
	.type		= &nft_flow_offload_type,
	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
	.eval		= nft_flow_offload_eval,
	.init		= nft_flow_offload_init,
	.activate	= nft_flow_offload_activate,
	.deactivate	= nft_flow_offload_deactivate,
	.destroy	= nft_flow_offload_destroy,
	.validate	= nft_flow_offload_validate,
	.dump		= nft_flow_offload_dump,
	.reduce		= NFT_REDUCE_READONLY,
};

static struct nft_expr_type nft_flow_offload_type __read_mostly = {
	.name		= "flow_offload",
	.ops		= &nft_flow_offload_ops,
	.policy		= nft_flow_offload_policy,
	.maxattr	= NFTA_FLOW_MAX,
	.owner		= THIS_MODULE,
};

static int flow_offload_netdev_event(struct notifier_block *this,
				     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	nf_flow_table_cleanup(dev);

	return NOTIFY_DONE;
}

static struct notifier_block flow_offload_netdev_notifier = {
	.notifier_call	= flow_offload_netdev_event,
};

static int __init nft_flow_offload_module_init(void)
{
	int err;

	err = register_netdevice_notifier(&flow_offload_netdev_notifier);
	if (err)
		goto err;

	err = nft_register_expr(&nft_flow_offload_type);
	if (err < 0)
		goto register_expr;

	return 0;

register_expr:
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
err:
	return err;
}

static void __exit nft_flow_offload_module_exit(void)
{
	nft_unregister_expr(&nft_flow_offload_type);
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
}

module_init(nft_flow_offload_module_init);
module_exit(nft_flow_offload_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
MODULE_DESCRIPTION("nftables hardware flow offload module");