// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Handle firewalling
 * Linux ethernet bridge
 *
 * Authors:
 * Lennert Buytenhek <buytenh@gnu.org>
 * Bart De Schuymer <bdschuym@pandora.be>
 *
 * Lennert dedicates this file to Kerstin Wurdinger.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
#include <linux/rculist.h>
#include <linux/inetdevice.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/dst_metadata.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
#include <net/inet_dscp.h>

#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif
static unsigned int brnf_net_id __read_mostly;

struct brnf_net {
	bool enabled;

#ifdef CONFIG_SYSCTL
	struct ctl_table_header *ctl_hdr;
#endif

	/* default value is 1 */
	int call_iptables;
	int call_ip6tables;
	int call_arptables;

	/* default value is 0 */
	int filter_vlan_tagged;
	int filter_pppoe_tagged;
	int pass_vlan_indev;
};

#define IS_IP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))

#define IS_IPV6(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))

#define IS_ARP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
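
/* The IS_IP/IS_IPV6/IS_ARP macros above only match untagged frames;
 * VLAN- and PPPoE-encapsulated traffic is classified by the
 * is_vlan_*() / is_pppoe_*() helpers below.
 */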
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
	if (skb_vlan_tag_present(skb))
		return skb->protocol;
	else if (skb->protocol == htons(ETH_P_8021Q))
		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
	else
		return 0;
}
static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
}
static inline bool is_vlan_ipv6(const struct sk_buff *skb,
				const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_IPV6) &&
	       brnet->filter_vlan_tagged;
}
static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
}
static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
			    sizeof(struct pppoe_hdr)));
}
static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return skb->protocol == htons(ETH_P_PPP_SES) &&
	       pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
}
static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
				 const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return skb->protocol == htons(ETH_P_PPP_SES) &&
	       pppoe_proto(skb) == htons(PPP_IPV6) &&
	       brnet->filter_pppoe_tagged;
}
/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)

struct brnf_frag_data {
	local_lock_t bh_lock;
	char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
	u8 encap_size;
	u8 size;
	u16 vlan_tci;
	__be16 vlan_proto;
};

static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
static void nf_bridge_info_free(struct sk_buff *skb)
{
	skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
}
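
/* Map a bridge port device to its bridge master device; returns NULL
 * if dev is not a bridge port.
 */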
static inline struct net_device *bridge_parent(const struct net_device *dev)
{
	struct net_bridge_port *port;

	port = br_port_get_rcu(dev);
	return port ? port->br->dev : NULL;
}
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}
unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case __cpu_to_be16(ETH_P_8021Q):
		return VLAN_HLEN;
	case __cpu_to_be16(ETH_P_PPP_SES):
		return PPPOE_SES_HLEN;
	default:
		return 0;
	}
}
static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull(skb, len);
	skb->network_header += len;
}
static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull_rcsum(skb, len);
	skb->network_header += len;
}
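
/* Note: the _rcsum variant above also folds the pulled bytes out of
 * skb->csum via skb_pull_rcsum() (relevant for CHECKSUM_COMPLETE),
 * while the plain variant uses skb_pull() without touching skb->csum.
 */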
/* When handing a packet over to the IP layer
 * check whether we have a skb that is in the
 * expected format
 */
static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
{
	const struct iphdr *iph;
	u32 len;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/* Basic sanity checks */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	iph = ip_hdr(skb);
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;

	len = skb_ip_totlen(skb);
	if (skb->len < len) {
		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	if (pskb_trim_rcsum(skb, len)) {
		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	/* We should really parse IP options here but until
	 * somebody who actually uses IP options complains to
	 * us we'll just silently ignore the options because
	 * we're lazy!
	 */
	return 0;

csum_error:
	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
	return -1;
}
void nf_bridge_update_protocol(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	switch (nf_bridge->orig_proto) {
	case BRNF_PROTO_8021Q:
		skb->protocol = htons(ETH_P_8021Q);
		break;
	case BRNF_PROTO_PPPOE:
		skb->protocol = htons(ETH_P_PPP_SES);
		break;
	case BRNF_PROTO_UNCHANGED:
		break;
	}
}
/* Obtain the correct destination MAC address, while preserving the original
 * source MAC address. If we already know this address, we just copy it. If we
 * don't, we use the neighbour framework to find out. In both cases, we make
 * sure that br_handle_frame_finish() is called afterwards.
 */
int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct neighbour *neigh;
	struct dst_entry *dst;

	skb->dev = bridge_parent(skb->dev);
	if (!skb->dev)
		goto free_skb;
	dst = skb_dst(skb);
	neigh = dst_neigh_lookup_skb(dst, skb);
	if (neigh) {
		struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
		int ret;

		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
		    READ_ONCE(neigh->hh.hh_len)) {
			struct net_device *br_indev;

			br_indev = nf_bridge_get_physindev(skb, net);
			if (!br_indev) {
				neigh_release(neigh);
				goto free_skb;
			}

			neigh_hh_bridge(&neigh->hh, skb);
			skb->dev = br_indev;

			ret = br_handle_frame_finish(net, sk, skb);
		} else {
			/* the neighbour function below overwrites the complete
			 * MAC header, so we save the Ethernet source address and
			 * protocol number.
			 */
			skb_copy_from_linear_data_offset(skb,
							 -(ETH_HLEN - ETH_ALEN),
							 nf_bridge->neigh_header,
							 ETH_HLEN - ETH_ALEN);
			/* tell br_dev_xmit to continue with forwarding */
			nf_bridge->bridged_dnat = 1;
			/* FIXME Need to refragment */
			ret = READ_ONCE(neigh->output)(neigh, skb);
		}
		neigh_release(neigh);
		return ret;
	}
free_skb:
	kfree_skb(skb);
	return 0;
}
static bool
br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
			     const struct nf_bridge_info *nf_bridge)
{
	return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}
/* This requires some explaining. If DNAT has taken place,
 * we will need to fix up the destination Ethernet address.
 * This is also true when SNAT takes place (for the reply direction).
 *
 * There are two cases to consider:
 * 1. The packet was DNAT'ed to a device in the same bridge
 *    port group as it was received on. We can still bridge
 *    the packet.
 * 2. The packet was DNAT'ed to a different device, either
 *    a non-bridged device or another bridge port group.
 *    The packet will need to be routed.
 *
 * The correct way of distinguishing between these two cases is to
 * call ip_route_input() and to look at skb->dst->dev, which is
 * changed to the destination device if ip_route_input() succeeds.
 *
 * Let's first consider the case that ip_route_input() succeeds:
 *
 * If the output device equals the logical bridge device the packet
 * came in on, we can consider this bridging. The corresponding MAC
 * address will be obtained in br_nf_pre_routing_finish_bridge.
 * Otherwise, the packet is considered to be routed and we just
 * change the destination MAC address so that the packet will
 * later be passed up to the IP stack to be routed. For a redirected
 * packet, ip_route_input() will give back the localhost as output device,
 * which differs from the bridge device.
 *
 * Let's now consider the case that ip_route_input() fails:
 *
 * This can be because the destination address is martian, in which case
 * the packet will be dropped.
 * If IP forwarding is disabled, ip_route_input() will fail, while
 * ip_route_output_key() can return success. The source
 * address for ip_route_output_key() is set to zero, so ip_route_output_key()
 * thinks we're handling a locally generated packet and won't care
 * if IP forwarding is enabled. If the output device equals the logical bridge
 * device, we proceed as if ip_route_input() succeeded. If it differs from the
 * logical bridge port or if ip_route_output_key() fails we drop the packet.
 */
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *dev = skb->dev, *br_indev;
	const struct iphdr *iph = ip_hdr(skb);
	enum skb_drop_reason reason;
	struct rtable *rt;

	br_indev = nf_bridge_get_physindev(skb, net);
	if (!br_indev) {
		kfree_skb(skb);
		return 0;
	}

	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

	if (nf_bridge->pkt_otherhost) {
		skb->pkt_type = PACKET_OTHERHOST;
		nf_bridge->pkt_otherhost = false;
	}
	nf_bridge->in_prerouting = 0;
	if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
		reason = ip_route_input(skb, iph->daddr, iph->saddr,
					ip4h_dscp(iph), dev);
		if (reason) {
			struct in_device *in_dev = __in_dev_get_rcu(dev);

			/* If err equals -EHOSTUNREACH the error is due to a
			 * martian destination or due to the fact that
			 * forwarding is disabled. For most martian packets,
			 * ip_route_output_key() will fail. It won't fail for 2 types of
			 * martian destinations: loopback destinations and destination
			 * 0.0.0.0. In both cases the packet will be dropped because the
			 * destination is the loopback device and not the bridge. */
			if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev ||
			    IN_DEV_FORWARD(in_dev))
				goto free_skb;

			rt = ip_route_output(net, iph->daddr, 0,
					     ip4h_dscp(iph), 0,
					     RT_SCOPE_UNIVERSE);
			if (!IS_ERR(rt)) {
				/* - Bridged-and-DNAT'ed traffic doesn't
				 *   require ip_forwarding. */
				if (rt->dst.dev == dev) {
					skb_dst_drop(skb);
					skb_dst_set(skb, &rt->dst);
					goto bridged_dnat;
				}
				ip_rt_put(rt);
			}
free_skb:
			kfree_skb(skb);
			return 0;
		} else {
			if (skb_dst(skb)->dev == dev) {
bridged_dnat:
				skb->dev = br_indev;
				nf_bridge_update_protocol(skb);
				nf_bridge_push_encap_header(skb);
				br_nf_hook_thresh(NF_BR_PRE_ROUTING,
						  net, sk, skb, skb->dev,
						  NULL,
						  br_nf_pre_routing_finish_bridge);
				return 0;
			}
			ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
			skb->pkt_type = PACKET_HOST;
		}
	} else {
		rt = bridge_parent_rtable(br_indev);
		if (!rt) {
			kfree_skb(skb);
			return 0;
		}
		skb_dst_drop(skb);
		skb_dst_set_noref(skb, &rt->dst);
	}

	skb->dev = br_indev;
	nf_bridge_update_protocol(skb);
	nf_bridge_push_encap_header(skb);
	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
			  br_handle_frame_finish);

	return 0;
}
static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
					       const struct net_device *dev,
					       const struct net *net)
{
	struct net_device *vlan, *br;
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	br = bridge_parent(dev);

	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
		return br;

	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
					skb_vlan_tag_get(skb) & VLAN_VID_MASK);

	return vlan ? vlan : br;
}
/* Some common code for IPv4/IPv6 */
struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	nf_bridge->in_prerouting = 1;
	nf_bridge->physinif = skb->dev->ifindex;
	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);

	if (skb->protocol == htons(ETH_P_8021Q))
		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
	else if (skb->protocol == htons(ETH_P_PPP_SES))
		nf_bridge->orig_proto = BRNF_PROTO_PPPOE;

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);
	return skb->dev;
}
/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
 * Replicate the checks that IPv4 does on packet reception.
 * Set skb->dev to the bridge device (i.e. parent of the
 * receiving device) to make netfilter happy, the REDIRECT
 * target in particular. Save the original destination IP
 * address to be able to detect DNAT afterwards. */
static unsigned int br_nf_pre_routing(void *priv,
				      struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge;
	struct net_bridge_port *p;
	struct net_bridge *br;
	__u32 len = nf_bridge_encap_header_len(skb);
	struct brnf_net *brnet;

	if (unlikely(!pskb_may_pull(skb, len)))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

	p = br_port_get_rcu(state->in);
	if (p == NULL)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
	br = p->br;

	brnet = net_generic(state->net, brnf_net_id);
	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
	    is_pppoe_ipv6(skb, state->net)) {
		if (!brnet->call_ip6tables &&
		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
			return NF_ACCEPT;
		if (!ipv6_mod_enabled()) {
			pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0);
		}

		nf_bridge_pull_encap_header_rcsum(skb);
		return br_nf_pre_routing_ipv6(priv, skb, state);
	}

	if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
		return NF_ACCEPT;

	if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
	    !is_pppoe_ip(skb, state->net))
		return NF_ACCEPT;

	nf_bridge_pull_encap_header_rcsum(skb);

	if (br_validate_ipv4(state->net, skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);

	if (!nf_bridge_alloc(skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
	if (!setup_pre_routing(skb, state->net))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	nf_bridge = nf_bridge_info_get(skb);
	nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;

	skb->protocol = htons(ETH_P_IP);
	skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4;

	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
		skb->dev, NULL,
		br_nf_pre_routing_finish);

	return NF_STOLEN;
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
 * the same nf_conn entry, which will happen for multicast (broadcast)
 * frames on bridges.
 *
 * ethX (or Y) receives multicast or broadcast packet containing
 * an IP packet, not yet in conntrack table.
 *
 * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
 *    -> skb->_nfct now references a unconfirmed entry
 * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
 *    interface.
 * 3. skb gets passed up the stack.
 * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
 *    and schedules a work queue to send them out on the lower devices.
 *
 *    The clone skb->_nfct is not a copy, it is the same entry as the
 *    original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
 * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
 *
 * The Macvlan broadcast worker and normal confirm path will race.
 *
 * This race will not happen if step 2 already confirmed a clone. In that
 * case later steps perform skb_clone() with skb->_nfct already confirmed (in
 * hash table). This works fine.
 *
 * But such confirmation won't happen when eb/ip/nftables rules dropped the
 * packets before they reached the nf_confirm step in postrouting.
 *
 * Work around this problem by explicit confirmation of the entry at
 * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
 * entry.
 */
static unsigned int br_nf_local_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
	struct nf_conntrack *nfct = skb_nfct(skb);
	const struct nf_ct_hook *ct_hook;
	struct nf_conn *ct;
	int ret;

	if (promisc) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	if (!nfct || skb->pkt_type == PACKET_HOST)
		return NF_ACCEPT;

	ct = container_of(nfct, struct nf_conn, ct_general);
	if (likely(nf_ct_is_confirmed(ct)))
		return NF_ACCEPT;

	if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	WARN_ON_ONCE(skb_shared(skb));

	/* We can't call nf_confirm here, it would create a dependency
	 * on nf_conntrack module.
	 */
	ct_hook = rcu_dereference(nf_ct_hook);
	if (!ct_hook) {
		skb->_nfct = 0ul;
		nf_conntrack_put(nfct);
		return NF_ACCEPT;
	}

	nf_bridge_pull_encap_header(skb);
	ret = ct_hook->confirm(skb);
	switch (ret & NF_VERDICT_MASK) {
	case NF_STOLEN:
		return NF_STOLEN;
	default:
		nf_bridge_push_encap_header(skb);
		break;
	}

	ct = container_of(nfct, struct nf_conn, ct_general);
	WARN_ON_ONCE(!nf_ct_is_confirmed(ct));

	return ret;
}
#endif
/* PF_BRIDGE/FORWARD *************************************************/
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *in;

	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {

		if (skb->protocol == htons(ETH_P_IP))
			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

		if (skb->protocol == htons(ETH_P_IPV6))
			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;

		in = nf_bridge_get_physindev(skb, net);
		if (!in) {
			kfree_skb(skb);
			return 0;
		}
		if (nf_bridge->pkt_otherhost) {
			skb->pkt_type = PACKET_OTHERHOST;
			nf_bridge->pkt_otherhost = false;
		}
		nf_bridge_update_protocol(skb);
	} else {
		in = *((struct net_device **)(skb->cb));
	}
	nf_bridge_push_encap_header(skb);

	br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
			  br_forward_finish);
	return 0;
}
static unsigned int br_nf_forward_ip(struct sk_buff *skb,
				     const struct nf_hook_state *state,
				     u8 pf)
{
	struct nf_bridge_info *nf_bridge;
	struct net_device *parent;

	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_ACCEPT;

	/* Need exclusive nf_bridge_info since we might have multiple
	 * different physoutdevs. */
	if (!nf_bridge_unshare(skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	parent = bridge_parent(state->out);
	if (!parent)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	nf_bridge_pull_encap_header(skb);

	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	if (pf == NFPROTO_IPV4) {
		if (br_validate_ipv4(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IP);
	} else if (pf == NFPROTO_IPV6) {
		if (br_validate_ipv6(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IPV6);
	}

	nf_bridge->physoutdev = skb->dev;

	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
		brnf_get_logical_dev(skb, state->in, state->net),
		parent, br_nf_forward_finish);

	return NF_STOLEN;
}
static unsigned int br_nf_forward_arp(struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	struct net_bridge_port *p;
	struct net_bridge *br;
	struct net_device **d = (struct net_device **)(skb->cb);
	struct brnf_net *brnet;

	p = br_port_get_rcu(state->out);
	if (p == NULL)
		return NF_ACCEPT;
	br = p->br;

	brnet = net_generic(state->net, brnf_net_id);
	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
		return NF_ACCEPT;

	if (is_vlan_arp(skb, state->net))
		nf_bridge_pull_encap_header(skb);

	if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

	if (arp_hdr(skb)->ar_pln != 4) {
		if (is_vlan_arp(skb, state->net))
			nf_bridge_push_encap_header(skb);
		return NF_ACCEPT;
	}
	*d = state->in;
	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
		state->in, state->out, br_nf_forward_finish);

	return NF_STOLEN;
}
/* This is the 'purely bridged' case. For IP, we pass the packet to
 * netfilter with indev and outdev set to the bridge device,
 * but we are still able to filter on the 'real' indev/outdev
 * because of the physdev module. For ARP, indev and outdev are the
 * bridge ports.
 */
static unsigned int br_nf_forward(void *priv,
				  struct sk_buff *skb,
				  const struct nf_hook_state *state)
{
	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
	    is_pppoe_ip(skb, state->net))
		return br_nf_forward_ip(skb, state, NFPROTO_IPV4);
	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
	    is_pppoe_ipv6(skb, state->net))
		return br_nf_forward_ip(skb, state, NFPROTO_IPV6);
	if (IS_ARP(skb) || is_vlan_arp(skb, state->net))
		return br_nf_forward_arp(skb, state);

	return NF_ACCEPT;
}
static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct brnf_frag_data *data;
	int err;

	data = this_cpu_ptr(&brnf_frag_data_storage);
	err = skb_cow_head(skb, data->size);
	if (err) {
		kfree_skb(skb);
		return 0;
	}

	if (data->vlan_proto)
		__vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);

	skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
	__skb_push(skb, data->encap_size);

	nf_bridge_info_free(skb);
	return br_dev_queue_push_xmit(net, sk, skb);
}
static int
br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		  int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	unsigned int mtu = ip_skb_dst_mtu(sk, skb);
	struct iphdr *iph = ip_hdr(skb);

	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_do_fragment(net, sk, skb, output);
}
static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
		return PPPOE_SES_HLEN;
	return 0;
}
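
/* Transmit path for packets that went through the IPv4/IPv6 POST_ROUTING
 * hook: if the (possibly DNAT'ed or defragmented) packet no longer fits
 * the outgoing MTU and is not GSO, it is re-fragmented via
 * br_nf_ip_fragment() or the IPv6 fragment op before being handed to
 * br_dev_queue_push_xmit().
 */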
static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	unsigned int mtu, mtu_reserved;
	int ret;

	mtu_reserved = nf_bridge_mtu_reduction(skb);
	mtu = skb->dev->mtu;

	if (nf_bridge->pkt_otherhost) {
		skb->pkt_type = PACKET_OTHERHOST;
		nf_bridge->pkt_otherhost = false;
	}

	if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
		mtu = nf_bridge->frag_max_size;

	nf_bridge_update_protocol(skb);
	nf_bridge_push_encap_header(skb);

	if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
		nf_bridge_info_free(skb);
		return br_dev_queue_push_xmit(net, sk, skb);
	}

	/* Fragmentation on metadata/template dst is not supported */
	if (unlikely(!skb_valid_dst(skb)))
		goto drop;

	/* This is wrong! We should preserve the original fragment
	 * boundaries by preserving frag_list rather than refragmenting.
	 */
	if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) &&
	    skb->protocol == htons(ETH_P_IP)) {
		struct brnf_frag_data *data;

		if (br_validate_ipv4(net, skb))
			goto drop;

		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;

		local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
		data = this_cpu_ptr(&brnf_frag_data_storage);

		if (skb_vlan_tag_present(skb)) {
			data->vlan_tci = skb->vlan_tci;
			data->vlan_proto = skb->vlan_proto;
		} else {
			data->vlan_proto = 0;
		}

		data->encap_size = nf_bridge_encap_header_len(skb);
		data->size = ETH_HLEN + data->encap_size;

		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
						 data->size);

		ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
		local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
		return ret;
	}
	if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
	    skb->protocol == htons(ETH_P_IPV6)) {
		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
		struct brnf_frag_data *data;

		if (br_validate_ipv6(net, skb))
			goto drop;

		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;

		local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
		data = this_cpu_ptr(&brnf_frag_data_storage);
		data->encap_size = nf_bridge_encap_header_len(skb);
		data->size = ETH_HLEN + data->encap_size;

		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
						 data->size);

		if (v6ops) {
			ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
			local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
			return ret;
		}
		local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);

		kfree_skb(skb);
		return -EMSGSIZE;
	}
	nf_bridge_info_free(skb);
	return br_dev_queue_push_xmit(net, sk, skb);
 drop:
	kfree_skb(skb);
	return 0;
}
/* PF_BRIDGE/POST_ROUTING ********************************************/
static unsigned int br_nf_post_routing(void *priv,
				       struct sk_buff *skb,
				       const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *realoutdev = bridge_parent(skb->dev);
	u_int8_t pf;

	/* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
	 * on a bridge, but was delivered locally and is now being routed:
	 *
	 * POST_ROUTING was already invoked from the ip stack.
	 */
	if (!nf_bridge || !nf_bridge->physoutdev)
		return NF_ACCEPT;

	if (!realoutdev)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
	    is_pppoe_ip(skb, state->net))
		pf = NFPROTO_IPV4;
	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
		 is_pppoe_ipv6(skb, state->net))
		pf = NFPROTO_IPV6;
	else
		return NF_ACCEPT;

	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	nf_bridge_pull_encap_header(skb);
	if (pf == NFPROTO_IPV4)
		skb->protocol = htons(ETH_P_IP);
	else
		skb->protocol = htons(ETH_P_IPV6);

	NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb,
		NULL, realoutdev,
		br_nf_dev_queue_xmit);

	return NF_STOLEN;
}
/* IP/SABOTAGE *****************************************************/
/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
 * for the second time. */
static unsigned int ip_sabotage_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge) {
		if (nf_bridge->sabotage_in_done)
			return NF_ACCEPT;

		if (!nf_bridge->in_prerouting &&
		    !netif_is_l3_master(skb->dev) &&
		    !netif_is_l3_slave(skb->dev)) {
			nf_bridge->sabotage_in_done = 1;
			state->okfn(state->net, state->sk, skb);
			return NF_STOLEN;
		}
	}

	return NF_ACCEPT;
}
/* This is called when br_netfilter has called into iptables/netfilter,
 * and DNAT has taken place on a bridge-forwarded packet.
 *
 * neigh->output has created a new MAC header, with local br0 MAC
 * as saddr.
 *
 * This restores the original MAC saddr of the bridged packet
 * before invoking bridge forward logic to transmit the packet.
 */
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *br_indev;

	br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
	if (!br_indev) {
		kfree_skb(skb);
		return;
	}

	skb_pull(skb, ETH_HLEN);
	nf_bridge->bridged_dnat = 0;

	BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));

	skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
				       nf_bridge->neigh_header,
				       ETH_HLEN - ETH_ALEN);
	skb->dev = br_indev;

	nf_bridge->physoutdev = NULL;
	br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
}
static int br_nf_dev_xmit(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge && nf_bridge->bridged_dnat) {
		br_nf_pre_routing_finish_bridge_slow(skb);
		return 1;
	}
	return 0;
}

static const struct nf_br_ops br_ops = {
	.br_dev_xmit_hook = br_nf_dev_xmit,
};
/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
 * br_dev_queue_push_xmit is called afterwards */
static const struct nf_hook_ops br_nf_ops[] = {
	{
		.hook = br_nf_pre_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_PRE_ROUTING,
		.priority = NF_BR_PRI_BRNF,
	},
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	{
		.hook = br_nf_local_in,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_LOCAL_IN,
		.priority = NF_BR_PRI_LAST,
	},
#endif
	{
		.hook = br_nf_forward,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_FORWARD,
		.priority = NF_BR_PRI_BRNF,
	},
	{
		.hook = br_nf_post_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_POST_ROUTING,
		.priority = NF_BR_PRI_LAST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV4,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP_PRI_FIRST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV6,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP6_PRI_FIRST,
	},
};
static int brnf_device_event(struct notifier_block *unused, unsigned long event,
			     void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct brnf_net *brnet;
	struct net *net;
	int ret;

	if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev))
		return NOTIFY_DONE;

	ASSERT_RTNL();

	net = dev_net(dev);
	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled)
		return NOTIFY_OK;

	ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (ret)
		return NOTIFY_BAD;

	brnet->enabled = true;
	return NOTIFY_OK;
}

static struct notifier_block brnf_notifier __read_mostly = {
	.notifier_call = brnf_device_event,
};
/* recursively invokes nf_hook_slow (again), skipping already-called
 * hooks (< NF_BR_PRI_BRNF).
 *
 * Called with rcu read lock held.
 */
int br_nf_hook_thresh(unsigned int hook, struct net *net,
		      struct sock *sk, struct sk_buff *skb,
		      struct net_device *indev,
		      struct net_device *outdev,
		      int (*okfn)(struct net *, struct sock *,
				  struct sk_buff *))
{
	const struct nf_hook_entries *e;
	struct nf_hook_state state;
	struct nf_hook_ops **ops;
	unsigned int i;
	int ret;

	e = rcu_dereference(net->nf.hooks_bridge[hook]);
	if (!e)
		return okfn(net, sk, skb);

	ops = nf_hook_entries_get_hook_ops(e);
	for (i = 0; i < e->num_hook_entries; i++) {
		/* These hooks have already been called */
		if (ops[i]->priority < NF_BR_PRI_BRNF)
			continue;

		/* These hooks have not been called yet, run them. */
		if (ops[i]->priority > NF_BR_PRI_BRNF)
			break;

		/* take a closer look at NF_BR_PRI_BRNF. */
		if (ops[i]->hook == br_nf_pre_routing) {
			/* This hook diverted the skb to this function,
			 * hooks after this have not been run yet.
			 */
			i++;
			break;
		}
	}

	nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
			   sk, net, okfn);

	ret = nf_hook_slow(skb, &state, e, i);
	if (ret == 1)
		ret = okfn(net, sk, skb);

	return ret;
}
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);

	if (write && *(int *)(ctl->data))
		*(int *)(ctl->data) = 1;
	return ret;
}

static struct ctl_table brnf_table[] = {
	{
		.procname	= "bridge-nf-call-arptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-iptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-ip6tables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-vlan-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-pppoe-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-pass-vlan-input-dev",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
};
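
/* Example usage (a minimal sketch; these knobs appear under
 * /proc/sys/net/bridge once the module is loaded):
 *
 *   sysctl -w net.bridge.bridge-nf-call-iptables=1
 *   sysctl -w net.bridge.bridge-nf-filter-vlan-tagged=1
 *
 * Any non-zero value written through brnf_sysctl_call_tables() is
 * normalized to 1.
 */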
static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
{
	brnf->call_iptables = 1;
	brnf->call_ip6tables = 1;
	brnf->call_arptables = 1;
	brnf->filter_vlan_tagged = 0;
	brnf->filter_pppoe_tagged = 0;
	brnf->pass_vlan_indev = 0;
}
static int br_netfilter_sysctl_init_net(struct net *net)
{
	struct ctl_table *table = brnf_table;
	struct brnf_net *brnet;

	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;
	}

	brnet = net_generic(net, brnf_net_id);
	table[0].data = &brnet->call_arptables;
	table[1].data = &brnet->call_iptables;
	table[2].data = &brnet->call_ip6tables;
	table[3].data = &brnet->filter_vlan_tagged;
	table[4].data = &brnet->filter_pppoe_tagged;
	table[5].data = &brnet->pass_vlan_indev;

	br_netfilter_sysctl_default(brnet);

	brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
						ARRAY_SIZE(brnf_table));
	if (!brnet->ctl_hdr) {
		if (!net_eq(net, &init_net))
			kfree(table);

		return -ENOMEM;
	}

	return 0;
}

static void br_netfilter_sysctl_exit_net(struct net *net,
					 struct brnf_net *brnet)
{
	const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;

	unregister_net_sysctl_table(brnet->ctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(table);
}
static int __net_init brnf_init_net(struct net *net)
{
	return br_netfilter_sysctl_init_net(net);
}
#endif

static void __net_exit brnf_exit_net(struct net *net)
{
	struct brnf_net *brnet;

	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled) {
		nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
		brnet->enabled = false;
	}

#ifdef CONFIG_SYSCTL
	br_netfilter_sysctl_exit_net(net, brnet);
#endif
}

static struct pernet_operations brnf_net_ops __read_mostly = {
#ifdef CONFIG_SYSCTL
	.init = brnf_init_net,
#endif
	.exit = brnf_exit_net,
	.id   = &brnf_net_id,
	.size = sizeof(struct brnf_net),
};
static int __init br_netfilter_init(void)
{
	int ret;

	ret = register_pernet_subsys(&brnf_net_ops);
	if (ret < 0)
		return ret;

	ret = register_netdevice_notifier(&brnf_notifier);
	if (ret < 0) {
		unregister_pernet_subsys(&brnf_net_ops);
		return ret;
	}

	RCU_INIT_POINTER(nf_br_ops, &br_ops);
	printk(KERN_NOTICE "Bridge firewalling registered\n");
	return 0;
}

static void __exit br_netfilter_fini(void)
{
	RCU_INIT_POINTER(nf_br_ops, NULL);
	unregister_netdevice_notifier(&brnf_notifier);
	unregister_pernet_subsys(&brnf_net_ops);
}

module_init(br_netfilter_init);
module_exit(br_netfilter_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");