// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */
#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#include <net/inet_common.h>
#include <linux/bpf.h>
#include <net/compat.h>
/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

If dev_has_header(dev) == false we are unable to restore the ll header,
   because it is invisible to us.

On transmit:

dev->header_ops != NULL
   mac_header -> ll header
   data       -> ll header

dev->header_ops == NULL (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */
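/*
 * Illustrative example (not from this file): on an Ethernet device,
 * dev_has_header(dev) is true, so a SOCK_RAW packet socket sees the frame
 * starting at the destination MAC address (mac_header), while a SOCK_DGRAM
 * packet socket is handed only the payload starting at the network header;
 * the link-layer header is then reported out of band via sockaddr_ll.
 */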
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);
#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
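/*
 * Worked example (illustrative; assumes, for the arithmetic only, that
 * sizeof(struct tpacket_block_desc) is 48): BLK_HDR_LEN rounds the block
 * descriptor up to a multiple of V3_ALIGNMENT (8), so BLK_PLUS_PRIV(13)
 * would be ALIGN(48, 8) + ALIGN(13, 8) = 48 + 16 = 64 bytes reserved at
 * the start of each block before the first frame.
 */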
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);
struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
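/*
 * Example (illustrative): with knum_blocks == 4, GET_NEXT_PRB_BLK_NUM walks
 * the ring as 0 -> 1 -> 2 -> 3 -> 0, i.e. the active block number wraps back
 * to the first block after the last one.
 */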
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	dev = rcu_dereference(po->cached_dev);

	return dev;
}
static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}
static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

	skb->sender_cpu = cpu + 1;
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}
/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
	}
}
static int __packet_get_status(const struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec64_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec64 ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	/*
	 * versions 1 through 3 overflow the timestamps in y2106, since they
	 * all store the seconds in a 32-bit unsigned integer.
	 * If we create a version 4, that should have a 64-bit timestamp,
	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
	 * nanoseconds.
	 */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));

	return ts_status;
}
static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits, div;
	struct ethtool_link_ksettings ecmd;
	int err;

	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	if (err)
		return DEFAULT_PRB_RETIRE_TOV;

	/* If the link speed is so slow you don't really
	 * need to worry about perf anyway
	 */
	if (ecmd.base.speed < SPEED_1000 ||
	    ecmd.base.speed == SPEED_UNKNOWN)
		return DEFAULT_PRB_RETIRE_TOV;

	div = ecmd.base.speed / 1000;
	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
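	/*
	 * Illustrative check of the math above: mbits is the block size in
	 * megabits and div is the link speed in units of 1000 Mbit/s, so a
	 * 1 MB block (8 megabits) on a 1 Gbit/s link (div == 1) works out to
	 * roughly 8 ms to fill one block at line rate.
	 */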
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}
static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
	rwlock_init(&p1->blk_fill_in_prog_lock);

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
641 static void prb_retire_rx_blk_timer_expired(struct timer_list
*t
)
643 struct packet_sock
*po
=
644 from_timer(po
, t
, rx_ring
.prb_bdqc
.retire_blk_timer
);
645 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
647 struct tpacket_block_desc
*pbd
;
649 spin_lock(&po
->sk
.sk_receive_queue
.lock
);
651 frozen
= prb_queue_frozen(pkc
);
652 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
654 if (unlikely(pkc
->delete_blk_timer
))
657 /* We only need to plug the race when the block is partially filled.
659 * lock(); increment BLOCK_NUM_PKTS; unlock()
660 * copy_bits() is in progress ...
661 * timer fires on other cpu:
662 * we can't retire the current block because copy_bits
666 if (BLOCK_NUM_PKTS(pbd
)) {
667 /* Waiting for skb_copy_bits to finish... */
668 write_lock(&pkc
->blk_fill_in_prog_lock
);
669 write_unlock(&pkc
->blk_fill_in_prog_lock
);
672 if (pkc
->last_kactive_blk_num
== pkc
->kactive_blk_num
) {
674 if (!BLOCK_NUM_PKTS(pbd
)) {
675 /* An empty block. Just refresh the timer. */
678 prb_retire_current_block(pkc
, po
, TP_STATUS_BLK_TMO
);
679 if (!prb_dispatch_next_block(pkc
, po
))
684 /* Case 1. Queue was frozen because user-space was
687 if (prb_curr_blk_in_use(pbd
)) {
689 * Ok, user-space is still behind.
690 * So just refresh the timer.
694 /* Case 2. queue was frozen,user-space caught up,
695 * now the link went idle && the timer fired.
696 * We don't have a block to close.So we open this
697 * block and restart the timer.
698 * opening a block thaws the queue,restarts timer
699 * Thawing/timer-refresh is a side effect.
701 prb_open_block(pkc
, pbd
);
708 _prb_refresh_rx_retire_blk_timer(pkc
);
711 spin_unlock(&po
->sk
.sk_receive_queue
.lock
);
714 static void prb_flush_block(struct tpacket_kbdq_core
*pkc1
,
715 struct tpacket_block_desc
*pbd1
, __u32 status
)
717 /* Flush everything minus the block header */
719 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
724 /* Skip the block header(we know header WILL fit in 4K) */
727 end
= (u8
*)PAGE_ALIGN((unsigned long)pkc1
->pkblk_end
);
728 for (; start
< end
; start
+= PAGE_SIZE
)
729 flush_dcache_page(pgv_to_page(start
));
734 /* Now update the block status. */
736 BLOCK_STATUS(pbd1
) = status
;
738 /* Flush the block header */
740 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 flush_dcache_page(pgv_to_page(start
));
752 * 2) Increment active_blk_num
754 * Note:We DONT refresh the timer on purpose.
755 * Because almost always the next block will be opened.
757 static void prb_close_block(struct tpacket_kbdq_core
*pkc1
,
758 struct tpacket_block_desc
*pbd1
,
759 struct packet_sock
*po
, unsigned int stat
)
761 __u32 status
= TP_STATUS_USER
| stat
;
763 struct tpacket3_hdr
*last_pkt
;
764 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
765 struct sock
*sk
= &po
->sk
;
767 if (atomic_read(&po
->tp_drops
))
768 status
|= TP_STATUS_LOSING
;
770 last_pkt
= (struct tpacket3_hdr
*)pkc1
->prev
;
771 last_pkt
->tp_next_offset
= 0;
773 /* Get the ts of the last pkt */
774 if (BLOCK_NUM_PKTS(pbd1
)) {
775 h1
->ts_last_pkt
.ts_sec
= last_pkt
->tp_sec
;
776 h1
->ts_last_pkt
.ts_nsec
= last_pkt
->tp_nsec
;
778 /* Ok, we tmo'd - so get the current time.
780 * It shouldn't really happen as we don't close empty
781 * blocks. See prb_retire_rx_blk_timer_expired().
783 struct timespec64 ts
;
784 ktime_get_real_ts64(&ts
);
785 h1
->ts_last_pkt
.ts_sec
= ts
.tv_sec
;
786 h1
->ts_last_pkt
.ts_nsec
= ts
.tv_nsec
;
791 /* Flush the block */
792 prb_flush_block(pkc1
, pbd1
, status
);
794 sk
->sk_data_ready(sk
);
796 pkc1
->kactive_blk_num
= GET_NEXT_PRB_BLK_NUM(pkc1
);
799 static void prb_thaw_queue(struct tpacket_kbdq_core
*pkc
)
801 pkc
->reset_pending_on_curr_blk
= 0;
/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 */
811 static void prb_open_block(struct tpacket_kbdq_core
*pkc1
,
812 struct tpacket_block_desc
*pbd1
)
814 struct timespec64 ts
;
815 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
819 /* We could have just memset this but we will lose the
820 * flexibility of making the priv area sticky
823 BLOCK_SNUM(pbd1
) = pkc1
->knxt_seq_num
++;
824 BLOCK_NUM_PKTS(pbd1
) = 0;
825 BLOCK_LEN(pbd1
) = BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
827 ktime_get_real_ts64(&ts
);
829 h1
->ts_first_pkt
.ts_sec
= ts
.tv_sec
;
830 h1
->ts_first_pkt
.ts_nsec
= ts
.tv_nsec
;
832 pkc1
->pkblk_start
= (char *)pbd1
;
833 pkc1
->nxt_offset
= pkc1
->pkblk_start
+ BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
835 BLOCK_O2FP(pbd1
) = (__u32
)BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
836 BLOCK_O2PRIV(pbd1
) = BLK_HDR_LEN
;
838 pbd1
->version
= pkc1
->version
;
839 pkc1
->prev
= pkc1
->nxt_offset
;
840 pkc1
->pkblk_end
= pkc1
->pkblk_start
+ pkc1
->kblk_size
;
842 prb_thaw_queue(pkc1
);
843 _prb_refresh_rx_retire_blk_timer(pkc1
);
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
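/*
 * Illustrative sketch of the userspace side (not part of this file): a
 * TPACKET_V3 reader typically walks the mmap()ed ring one block at a time,
 * roughly as follows (walk_block() and pfd are placeholders):
 *
 *	struct tpacket_block_desc *pbd = ...;	// block i of the rx ring
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to close the block
 *	walk_block(pbd);			// frames are chained via tp_next_offset
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand the block back
 *
 * If the reader falls behind, the kernel freezes the queue as described
 * above and drops packets until a block is handed back.
 */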
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
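/*
 * Example (illustrative): TOTAL_PKT_LEN_INCL_ALIGN(66) == 72, so each frame
 * stored in a block starts on an 8-byte (V3_ALIGNMENT) boundary.
 */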
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
886 static void *prb_dispatch_next_block(struct tpacket_kbdq_core
*pkc
,
887 struct packet_sock
*po
)
889 struct tpacket_block_desc
*pbd
;
893 /* 1. Get current block num */
894 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
896 /* 2. If this block is currently in_use then freeze the queue */
897 if (TP_STATUS_USER
& BLOCK_STATUS(pbd
)) {
898 prb_freeze_queue(pkc
, po
);
904 * open this block and return the offset where the first packet
905 * needs to get stored.
907 prb_open_block(pkc
, pbd
);
908 return (void *)pkc
->nxt_offset
;
911 static void prb_retire_current_block(struct tpacket_kbdq_core
*pkc
,
912 struct packet_sock
*po
, unsigned int status
)
914 struct tpacket_block_desc
*pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
916 /* retire/close the current block */
917 if (likely(TP_STATUS_KERNEL
== BLOCK_STATUS(pbd
))) {
919 * Plug the case where copy_bits() is in progress on
920 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
921 * have space to copy the pkt in the current block and
922 * called prb_retire_current_block()
924 * We don't need to worry about the TMO case because
925 * the timer-handler already handled this case.
927 if (!(status
& TP_STATUS_BLK_TMO
)) {
928 /* Waiting for skb_copy_bits to finish... */
929 write_lock(&pkc
->blk_fill_in_prog_lock
);
930 write_unlock(&pkc
->blk_fill_in_prog_lock
);
932 prb_close_block(pkc
, pbd
, po
, status
);
937 static int prb_curr_blk_in_use(struct tpacket_block_desc
*pbd
)
939 return TP_STATUS_USER
& BLOCK_STATUS(pbd
);
942 static int prb_queue_frozen(struct tpacket_kbdq_core
*pkc
)
944 return pkc
->reset_pending_on_curr_blk
;
947 static void prb_clear_blk_fill_status(struct packet_ring_buffer
*rb
)
948 __releases(&pkc
->blk_fill_in_prog_lock
)
950 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
952 read_unlock(&pkc
->blk_fill_in_prog_lock
);
955 static void prb_fill_rxhash(struct tpacket_kbdq_core
*pkc
,
956 struct tpacket3_hdr
*ppd
)
958 ppd
->hv1
.tp_rxhash
= skb_get_hash(pkc
->skb
);
961 static void prb_clear_rxhash(struct tpacket_kbdq_core
*pkc
,
962 struct tpacket3_hdr
*ppd
)
964 ppd
->hv1
.tp_rxhash
= 0;
967 static void prb_fill_vlan_info(struct tpacket_kbdq_core
*pkc
,
968 struct tpacket3_hdr
*ppd
)
970 if (skb_vlan_tag_present(pkc
->skb
)) {
971 ppd
->hv1
.tp_vlan_tci
= skb_vlan_tag_get(pkc
->skb
);
972 ppd
->hv1
.tp_vlan_tpid
= ntohs(pkc
->skb
->vlan_proto
);
973 ppd
->tp_status
= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
975 ppd
->hv1
.tp_vlan_tci
= 0;
976 ppd
->hv1
.tp_vlan_tpid
= 0;
977 ppd
->tp_status
= TP_STATUS_AVAILABLE
;
981 static void prb_run_all_ft_ops(struct tpacket_kbdq_core
*pkc
,
982 struct tpacket3_hdr
*ppd
)
984 ppd
->hv1
.tp_padding
= 0;
985 prb_fill_vlan_info(pkc
, ppd
);
987 if (pkc
->feature_req_word
& TP_FT_REQ_FILL_RXHASH
)
988 prb_fill_rxhash(pkc
, ppd
);
990 prb_clear_rxhash(pkc
, ppd
);
993 static void prb_fill_curr_block(char *curr
,
994 struct tpacket_kbdq_core
*pkc
,
995 struct tpacket_block_desc
*pbd
,
997 __acquires(&pkc
->blk_fill_in_prog_lock
)
999 struct tpacket3_hdr
*ppd
;
1001 ppd
= (struct tpacket3_hdr
*)curr
;
1002 ppd
->tp_next_offset
= TOTAL_PKT_LEN_INCL_ALIGN(len
);
1004 pkc
->nxt_offset
+= TOTAL_PKT_LEN_INCL_ALIGN(len
);
1005 BLOCK_LEN(pbd
) += TOTAL_PKT_LEN_INCL_ALIGN(len
);
1006 BLOCK_NUM_PKTS(pbd
) += 1;
1007 read_lock(&pkc
->blk_fill_in_prog_lock
);
1008 prb_run_all_ft_ops(pkc
, ppd
);
1011 /* Assumes caller has the sk->rx_queue.lock */
1012 static void *__packet_lookup_frame_in_block(struct packet_sock
*po
,
1013 struct sk_buff
*skb
,
1017 struct tpacket_kbdq_core
*pkc
;
1018 struct tpacket_block_desc
*pbd
;
1021 pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
1022 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1024 /* Queue is frozen when user space is lagging behind */
1025 if (prb_queue_frozen(pkc
)) {
1027 * Check if that last block which caused the queue to freeze,
1028 * is still in_use by user-space.
1030 if (prb_curr_blk_in_use(pbd
)) {
1031 /* Can't record this packet */
1035 * Ok, the block was released by user-space.
1036 * Now let's open that block.
1037 * opening a block also thaws the queue.
1038 * Thawing is a side effect.
1040 prb_open_block(pkc
, pbd
);
1045 curr
= pkc
->nxt_offset
;
1047 end
= (char *)pbd
+ pkc
->kblk_size
;
1049 /* first try the current block */
1050 if (curr
+TOTAL_PKT_LEN_INCL_ALIGN(len
) < end
) {
1051 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1052 return (void *)curr
;
1055 /* Ok, close the current block */
1056 prb_retire_current_block(pkc
, po
, 0);
1058 /* Now, try to dispatch the next block */
1059 curr
= (char *)prb_dispatch_next_block(pkc
, po
);
1061 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1062 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1063 return (void *)curr
;
	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
1073 static void *packet_current_rx_frame(struct packet_sock
*po
,
1074 struct sk_buff
*skb
,
1075 int status
, unsigned int len
)
1078 switch (po
->tp_version
) {
1081 curr
= packet_lookup_frame(po
, &po
->rx_ring
,
1082 po
->rx_ring
.head
, status
);
1085 return __packet_lookup_frame_in_block(po
, skb
, len
);
1087 WARN(1, "TPACKET version not supported\n");
1093 static void *prb_lookup_block(const struct packet_sock
*po
,
1094 const struct packet_ring_buffer
*rb
,
1098 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
1099 struct tpacket_block_desc
*pbd
= GET_PBLOCK_DESC(pkc
, idx
);
1101 if (status
!= BLOCK_STATUS(pbd
))
1106 static int prb_previous_blk_num(struct packet_ring_buffer
*rb
)
1109 if (rb
->prb_bdqc
.kactive_blk_num
)
1110 prev
= rb
->prb_bdqc
.kactive_blk_num
-1;
1112 prev
= rb
->prb_bdqc
.knum_blocks
-1;
1116 /* Assumes caller has held the rx_queue.lock */
1117 static void *__prb_previous_block(struct packet_sock
*po
,
1118 struct packet_ring_buffer
*rb
,
1121 unsigned int previous
= prb_previous_blk_num(rb
);
1122 return prb_lookup_block(po
, rb
, previous
, status
);
1125 static void *packet_previous_rx_frame(struct packet_sock
*po
,
1126 struct packet_ring_buffer
*rb
,
1129 if (po
->tp_version
<= TPACKET_V2
)
1130 return packet_previous_frame(po
, rb
, status
);
1132 return __prb_previous_block(po
, rb
, status
);
1135 static void packet_increment_rx_head(struct packet_sock
*po
,
1136 struct packet_ring_buffer
*rb
)
1138 switch (po
->tp_version
) {
1141 return packet_increment_head(rb
);
1144 WARN(1, "TPACKET version not supported.\n");
1150 static void *packet_previous_frame(struct packet_sock
*po
,
1151 struct packet_ring_buffer
*rb
,
1154 unsigned int previous
= rb
->head
? rb
->head
- 1 : rb
->frame_max
;
1155 return packet_lookup_frame(po
, rb
, previous
, status
);
1158 static void packet_increment_head(struct packet_ring_buffer
*buff
)
1160 buff
->head
= buff
->head
!= buff
->frame_max
? buff
->head
+1 : 0;
1163 static void packet_inc_pending(struct packet_ring_buffer
*rb
)
1165 this_cpu_inc(*rb
->pending_refcnt
);
1168 static void packet_dec_pending(struct packet_ring_buffer
*rb
)
1170 this_cpu_dec(*rb
->pending_refcnt
);
1173 static unsigned int packet_read_pending(const struct packet_ring_buffer
*rb
)
1175 unsigned int refcnt
= 0;
1178 /* We don't use pending refcount in rx_ring. */
1179 if (rb
->pending_refcnt
== NULL
)
1182 for_each_possible_cpu(cpu
)
1183 refcnt
+= *per_cpu_ptr(rb
->pending_refcnt
, cpu
);
1188 static int packet_alloc_pending(struct packet_sock
*po
)
1190 po
->rx_ring
.pending_refcnt
= NULL
;
1192 po
->tx_ring
.pending_refcnt
= alloc_percpu(unsigned int);
1193 if (unlikely(po
->tx_ring
.pending_refcnt
== NULL
))
1199 static void packet_free_pending(struct packet_sock
*po
)
1201 free_percpu(po
->tx_ring
.pending_refcnt
);
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2
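/*
 * Rough meaning of the room levels (illustrative): ROOM_NORMAL means the
 * ring or receive buffer is comfortably below its high-water mark, ROOM_LOW
 * means it is within the last 1/2^ROOM_POW_OFF (i.e. the last quarter) of
 * its capacity, and ROOM_NONE means there is no room for this packet at all.
 * Fanout rollover uses these levels to decide when to spill traffic to
 * another socket in the group.
 */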
1209 static bool __tpacket_has_room(const struct packet_sock
*po
, int pow_off
)
1213 len
= READ_ONCE(po
->rx_ring
.frame_max
) + 1;
1214 idx
= READ_ONCE(po
->rx_ring
.head
);
1216 idx
+= len
>> pow_off
;
1219 return packet_lookup_frame(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1222 static bool __tpacket_v3_has_room(const struct packet_sock
*po
, int pow_off
)
1226 len
= READ_ONCE(po
->rx_ring
.prb_bdqc
.knum_blocks
);
1227 idx
= READ_ONCE(po
->rx_ring
.prb_bdqc
.kactive_blk_num
);
1229 idx
+= len
>> pow_off
;
1232 return prb_lookup_block(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1235 static int __packet_rcv_has_room(const struct packet_sock
*po
,
1236 const struct sk_buff
*skb
)
1238 const struct sock
*sk
= &po
->sk
;
1239 int ret
= ROOM_NONE
;
1241 if (po
->prot_hook
.func
!= tpacket_rcv
) {
1242 int rcvbuf
= READ_ONCE(sk
->sk_rcvbuf
);
1243 int avail
= rcvbuf
- atomic_read(&sk
->sk_rmem_alloc
)
1244 - (skb
? skb
->truesize
: 0);
1246 if (avail
> (rcvbuf
>> ROOM_POW_OFF
))
1254 if (po
->tp_version
== TPACKET_V3
) {
1255 if (__tpacket_v3_has_room(po
, ROOM_POW_OFF
))
1257 else if (__tpacket_v3_has_room(po
, 0))
1260 if (__tpacket_has_room(po
, ROOM_POW_OFF
))
1262 else if (__tpacket_has_room(po
, 0))
1269 static int packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1273 ret
= __packet_rcv_has_room(po
, skb
);
1274 pressure
= ret
!= ROOM_NORMAL
;
1276 if (READ_ONCE(po
->pressure
) != pressure
)
1277 WRITE_ONCE(po
->pressure
, pressure
);
1282 static void packet_rcv_try_clear_pressure(struct packet_sock
*po
)
1284 if (READ_ONCE(po
->pressure
) &&
1285 __packet_rcv_has_room(po
, NULL
) == ROOM_NORMAL
)
1286 WRITE_ONCE(po
->pressure
, 0);
1289 static void packet_sock_destruct(struct sock
*sk
)
1291 skb_queue_purge(&sk
->sk_error_queue
);
1293 WARN_ON(atomic_read(&sk
->sk_rmem_alloc
));
1294 WARN_ON(refcount_read(&sk
->sk_wmem_alloc
));
1296 if (!sock_flag(sk
, SOCK_DEAD
)) {
1297 pr_err("Attempt to release alive packet socket: %p\n", sk
);
1301 sk_refcnt_debug_dec(sk
);
1304 static bool fanout_flow_is_huge(struct packet_sock
*po
, struct sk_buff
*skb
)
1306 u32
*history
= po
->rollover
->history
;
1310 rxhash
= skb_get_hash(skb
);
1311 for (i
= 0; i
< ROLLOVER_HLEN
; i
++)
1312 if (READ_ONCE(history
[i
]) == rxhash
)
1315 victim
= prandom_u32() % ROLLOVER_HLEN
;
1317 /* Avoid dirtying the cache line if possible */
1318 if (READ_ONCE(history
[victim
]) != rxhash
)
1319 WRITE_ONCE(history
[victim
], rxhash
);
1321 return count
> (ROLLOVER_HLEN
>> 1);
1324 static unsigned int fanout_demux_hash(struct packet_fanout
*f
,
1325 struct sk_buff
*skb
,
1328 return reciprocal_scale(__skb_get_hash_symmetric(skb
), num
);
1331 static unsigned int fanout_demux_lb(struct packet_fanout
*f
,
1332 struct sk_buff
*skb
,
1335 unsigned int val
= atomic_inc_return(&f
->rr_cur
);
1340 static unsigned int fanout_demux_cpu(struct packet_fanout
*f
,
1341 struct sk_buff
*skb
,
1344 return smp_processor_id() % num
;
1347 static unsigned int fanout_demux_rnd(struct packet_fanout
*f
,
1348 struct sk_buff
*skb
,
1351 return prandom_u32_max(num
);
1354 static unsigned int fanout_demux_rollover(struct packet_fanout
*f
,
1355 struct sk_buff
*skb
,
1356 unsigned int idx
, bool try_self
,
1359 struct packet_sock
*po
, *po_next
, *po_skip
= NULL
;
1360 unsigned int i
, j
, room
= ROOM_NONE
;
1362 po
= pkt_sk(f
->arr
[idx
]);
1365 room
= packet_rcv_has_room(po
, skb
);
1366 if (room
== ROOM_NORMAL
||
1367 (room
== ROOM_LOW
&& !fanout_flow_is_huge(po
, skb
)))
1372 i
= j
= min_t(int, po
->rollover
->sock
, num
- 1);
1374 po_next
= pkt_sk(f
->arr
[i
]);
1375 if (po_next
!= po_skip
&& !READ_ONCE(po_next
->pressure
) &&
1376 packet_rcv_has_room(po_next
, skb
) == ROOM_NORMAL
) {
1378 po
->rollover
->sock
= i
;
1379 atomic_long_inc(&po
->rollover
->num
);
1380 if (room
== ROOM_LOW
)
1381 atomic_long_inc(&po
->rollover
->num_huge
);
1389 atomic_long_inc(&po
->rollover
->num_failed
);
1393 static unsigned int fanout_demux_qm(struct packet_fanout
*f
,
1394 struct sk_buff
*skb
,
1397 return skb_get_queue_mapping(skb
) % num
;
1400 static unsigned int fanout_demux_bpf(struct packet_fanout
*f
,
1401 struct sk_buff
*skb
,
1404 struct bpf_prog
*prog
;
1405 unsigned int ret
= 0;
1408 prog
= rcu_dereference(f
->bpf_prog
);
1410 ret
= bpf_prog_run_clear_cb(prog
, skb
) % num
;
1416 static bool fanout_has_flag(struct packet_fanout
*f
, u16 flag
)
1418 return f
->flags
& (flag
>> 8);
1421 static int packet_rcv_fanout(struct sk_buff
*skb
, struct net_device
*dev
,
1422 struct packet_type
*pt
, struct net_device
*orig_dev
)
1424 struct packet_fanout
*f
= pt
->af_packet_priv
;
1425 unsigned int num
= READ_ONCE(f
->num_members
);
1426 struct net
*net
= read_pnet(&f
->net
);
1427 struct packet_sock
*po
;
1430 if (!net_eq(dev_net(dev
), net
) || !num
) {
1435 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_DEFRAG
)) {
1436 skb
= ip_check_defrag(net
, skb
, IP_DEFRAG_AF_PACKET
);
1441 case PACKET_FANOUT_HASH
:
1443 idx
= fanout_demux_hash(f
, skb
, num
);
1445 case PACKET_FANOUT_LB
:
1446 idx
= fanout_demux_lb(f
, skb
, num
);
1448 case PACKET_FANOUT_CPU
:
1449 idx
= fanout_demux_cpu(f
, skb
, num
);
1451 case PACKET_FANOUT_RND
:
1452 idx
= fanout_demux_rnd(f
, skb
, num
);
1454 case PACKET_FANOUT_QM
:
1455 idx
= fanout_demux_qm(f
, skb
, num
);
1457 case PACKET_FANOUT_ROLLOVER
:
1458 idx
= fanout_demux_rollover(f
, skb
, 0, false, num
);
1460 case PACKET_FANOUT_CBPF
:
1461 case PACKET_FANOUT_EBPF
:
1462 idx
= fanout_demux_bpf(f
, skb
, num
);
1466 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_ROLLOVER
))
1467 idx
= fanout_demux_rollover(f
, skb
, idx
, true, num
);
1469 po
= pkt_sk(f
->arr
[idx
]);
1470 return po
->prot_hook
.func(skb
, dev
, &po
->prot_hook
, orig_dev
);
1473 DEFINE_MUTEX(fanout_mutex
);
1474 EXPORT_SYMBOL_GPL(fanout_mutex
);
1475 static LIST_HEAD(fanout_list
);
1476 static u16 fanout_next_id
;
1478 static void __fanout_link(struct sock
*sk
, struct packet_sock
*po
)
1480 struct packet_fanout
*f
= po
->fanout
;
1482 spin_lock(&f
->lock
);
1483 f
->arr
[f
->num_members
] = sk
;
1486 if (f
->num_members
== 1)
1487 dev_add_pack(&f
->prot_hook
);
1488 spin_unlock(&f
->lock
);
1491 static void __fanout_unlink(struct sock
*sk
, struct packet_sock
*po
)
1493 struct packet_fanout
*f
= po
->fanout
;
1496 spin_lock(&f
->lock
);
1497 for (i
= 0; i
< f
->num_members
; i
++) {
1498 if (f
->arr
[i
] == sk
)
1501 BUG_ON(i
>= f
->num_members
);
1502 f
->arr
[i
] = f
->arr
[f
->num_members
- 1];
1504 if (f
->num_members
== 0)
1505 __dev_remove_pack(&f
->prot_hook
);
1506 spin_unlock(&f
->lock
);
1509 static bool match_fanout_group(struct packet_type
*ptype
, struct sock
*sk
)
1511 if (sk
->sk_family
!= PF_PACKET
)
1514 return ptype
->af_packet_priv
== pkt_sk(sk
)->fanout
;
1517 static void fanout_init_data(struct packet_fanout
*f
)
1520 case PACKET_FANOUT_LB
:
1521 atomic_set(&f
->rr_cur
, 0);
1523 case PACKET_FANOUT_CBPF
:
1524 case PACKET_FANOUT_EBPF
:
1525 RCU_INIT_POINTER(f
->bpf_prog
, NULL
);
1530 static void __fanout_set_data_bpf(struct packet_fanout
*f
, struct bpf_prog
*new)
1532 struct bpf_prog
*old
;
1534 spin_lock(&f
->lock
);
1535 old
= rcu_dereference_protected(f
->bpf_prog
, lockdep_is_held(&f
->lock
));
1536 rcu_assign_pointer(f
->bpf_prog
, new);
1537 spin_unlock(&f
->lock
);
1541 bpf_prog_destroy(old
);
1545 static int fanout_set_data_cbpf(struct packet_sock
*po
, sockptr_t data
,
1548 struct bpf_prog
*new;
1549 struct sock_fprog fprog
;
1552 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1555 ret
= copy_bpf_fprog_from_user(&fprog
, data
, len
);
1559 ret
= bpf_prog_create_from_user(&new, &fprog
, NULL
, false);
1563 __fanout_set_data_bpf(po
->fanout
, new);
1567 static int fanout_set_data_ebpf(struct packet_sock
*po
, sockptr_t data
,
1570 struct bpf_prog
*new;
1573 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1575 if (len
!= sizeof(fd
))
1577 if (copy_from_sockptr(&fd
, data
, len
))
1580 new = bpf_prog_get_type(fd
, BPF_PROG_TYPE_SOCKET_FILTER
);
1582 return PTR_ERR(new);
1584 __fanout_set_data_bpf(po
->fanout
, new);
1588 static int fanout_set_data(struct packet_sock
*po
, sockptr_t data
,
1591 switch (po
->fanout
->type
) {
1592 case PACKET_FANOUT_CBPF
:
1593 return fanout_set_data_cbpf(po
, data
, len
);
1594 case PACKET_FANOUT_EBPF
:
1595 return fanout_set_data_ebpf(po
, data
, len
);
1601 static void fanout_release_data(struct packet_fanout
*f
)
1604 case PACKET_FANOUT_CBPF
:
1605 case PACKET_FANOUT_EBPF
:
1606 __fanout_set_data_bpf(f
, NULL
);
1610 static bool __fanout_id_is_free(struct sock
*sk
, u16 candidate_id
)
1612 struct packet_fanout
*f
;
1614 list_for_each_entry(f
, &fanout_list
, list
) {
1615 if (f
->id
== candidate_id
&&
1616 read_pnet(&f
->net
) == sock_net(sk
)) {
1623 static bool fanout_find_new_id(struct sock
*sk
, u16
*new_id
)
1625 u16 id
= fanout_next_id
;
1628 if (__fanout_id_is_free(sk
, id
)) {
1630 fanout_next_id
= id
+ 1;
1635 } while (id
!= fanout_next_id
);
1640 static int fanout_add(struct sock
*sk
, struct fanout_args
*args
)
1642 struct packet_rollover
*rollover
= NULL
;
1643 struct packet_sock
*po
= pkt_sk(sk
);
1644 u16 type_flags
= args
->type_flags
;
1645 struct packet_fanout
*f
, *match
;
1646 u8 type
= type_flags
& 0xff;
1647 u8 flags
= type_flags
>> 8;
1652 case PACKET_FANOUT_ROLLOVER
:
1653 if (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)
1655 case PACKET_FANOUT_HASH
:
1656 case PACKET_FANOUT_LB
:
1657 case PACKET_FANOUT_CPU
:
1658 case PACKET_FANOUT_RND
:
1659 case PACKET_FANOUT_QM
:
1660 case PACKET_FANOUT_CBPF
:
1661 case PACKET_FANOUT_EBPF
:
1667 mutex_lock(&fanout_mutex
);
1673 if (type
== PACKET_FANOUT_ROLLOVER
||
1674 (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)) {
1676 rollover
= kzalloc(sizeof(*rollover
), GFP_KERNEL
);
1679 atomic_long_set(&rollover
->num
, 0);
1680 atomic_long_set(&rollover
->num_huge
, 0);
1681 atomic_long_set(&rollover
->num_failed
, 0);
1684 if (type_flags
& PACKET_FANOUT_FLAG_UNIQUEID
) {
1689 if (!fanout_find_new_id(sk
, &id
)) {
1693 /* ephemeral flag for the first socket in the group: drop it */
1694 flags
&= ~(PACKET_FANOUT_FLAG_UNIQUEID
>> 8);
1698 list_for_each_entry(f
, &fanout_list
, list
) {
1700 read_pnet(&f
->net
) == sock_net(sk
)) {
1707 if (match
->flags
!= flags
)
1709 if (args
->max_num_members
&&
1710 args
->max_num_members
!= match
->max_num_members
)
1713 if (args
->max_num_members
> PACKET_FANOUT_MAX
)
1715 if (!args
->max_num_members
)
1716 /* legacy PACKET_FANOUT_MAX */
1717 args
->max_num_members
= 256;
1719 match
= kvzalloc(struct_size(match
, arr
, args
->max_num_members
),
1723 write_pnet(&match
->net
, sock_net(sk
));
1726 match
->flags
= flags
;
1727 INIT_LIST_HEAD(&match
->list
);
1728 spin_lock_init(&match
->lock
);
1729 refcount_set(&match
->sk_ref
, 0);
1730 fanout_init_data(match
);
1731 match
->prot_hook
.type
= po
->prot_hook
.type
;
1732 match
->prot_hook
.dev
= po
->prot_hook
.dev
;
1733 match
->prot_hook
.func
= packet_rcv_fanout
;
1734 match
->prot_hook
.af_packet_priv
= match
;
1735 match
->prot_hook
.id_match
= match_fanout_group
;
1736 match
->max_num_members
= args
->max_num_members
;
1737 list_add(&match
->list
, &fanout_list
);
1741 spin_lock(&po
->bind_lock
);
1743 match
->type
== type
&&
1744 match
->prot_hook
.type
== po
->prot_hook
.type
&&
1745 match
->prot_hook
.dev
== po
->prot_hook
.dev
) {
1747 if (refcount_read(&match
->sk_ref
) < match
->max_num_members
) {
1748 __dev_remove_pack(&po
->prot_hook
);
1750 po
->rollover
= rollover
;
1752 refcount_set(&match
->sk_ref
, refcount_read(&match
->sk_ref
) + 1);
1753 __fanout_link(sk
, po
);
1757 spin_unlock(&po
->bind_lock
);
1759 if (err
&& !refcount_read(&match
->sk_ref
)) {
1760 list_del(&match
->list
);
1766 mutex_unlock(&fanout_mutex
);
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net()).
 */
1775 static struct packet_fanout
*fanout_release(struct sock
*sk
)
1777 struct packet_sock
*po
= pkt_sk(sk
);
1778 struct packet_fanout
*f
;
1780 mutex_lock(&fanout_mutex
);
1785 if (refcount_dec_and_test(&f
->sk_ref
))
1790 mutex_unlock(&fanout_mutex
);
1795 static bool packet_extra_vlan_len_allowed(const struct net_device
*dev
,
1796 struct sk_buff
*skb
)
1798 /* Earlier code assumed this would be a VLAN pkt, double-check
1799 * this now that we have the actual packet in hand. We can only
1800 * do this check on Ethernet devices.
1802 if (unlikely(dev
->type
!= ARPHRD_ETHER
))
1805 skb_reset_mac_header(skb
);
1806 return likely(eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
));
1809 static const struct proto_ops packet_ops
;
1811 static const struct proto_ops packet_ops_spkt
;
1813 static int packet_rcv_spkt(struct sk_buff
*skb
, struct net_device
*dev
,
1814 struct packet_type
*pt
, struct net_device
*orig_dev
)
1817 struct sockaddr_pkt
*spkt
;
1820 * When we registered the protocol we saved the socket in the data
1821 * field for just this event.
1824 sk
= pt
->af_packet_priv
;
1827 * Yank back the headers [hope the device set this
1828 * right or kerboom...]
1830 * Incoming packets have ll header pulled,
1833 * For outgoing ones skb->data == skb_mac_header(skb)
1834 * so that this procedure is noop.
1837 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1840 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1843 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1847 /* drop any routing info */
1850 /* drop conntrack reference */
1853 spkt
= &PACKET_SKB_CB(skb
)->sa
.pkt
;
1855 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1858 * The SOCK_PACKET socket receives _all_ frames.
1861 spkt
->spkt_family
= dev
->type
;
1862 strlcpy(spkt
->spkt_device
, dev
->name
, sizeof(spkt
->spkt_device
));
1863 spkt
->spkt_protocol
= skb
->protocol
;
1866 * Charge the memory to the socket. This is done specifically
1867 * to prevent sockets using all the memory up.
1870 if (sock_queue_rcv_skb(sk
, skb
) == 0)
1879 static void packet_parse_headers(struct sk_buff
*skb
, struct socket
*sock
)
1881 if ((!skb
->protocol
|| skb
->protocol
== htons(ETH_P_ALL
)) &&
1882 sock
->type
== SOCK_RAW
) {
1883 skb_reset_mac_header(skb
);
1884 skb
->protocol
= dev_parse_header_protocol(skb
);
1887 skb_probe_transport_header(skb
);
1891 * Output a raw packet to a device layer. This bypasses all the other
1892 * protocol layers and you must therefore supply it with a complete frame
1895 static int packet_sendmsg_spkt(struct socket
*sock
, struct msghdr
*msg
,
1898 struct sock
*sk
= sock
->sk
;
1899 DECLARE_SOCKADDR(struct sockaddr_pkt
*, saddr
, msg
->msg_name
);
1900 struct sk_buff
*skb
= NULL
;
1901 struct net_device
*dev
;
1902 struct sockcm_cookie sockc
;
1908 * Get and verify the address.
1912 if (msg
->msg_namelen
< sizeof(struct sockaddr
))
1914 if (msg
->msg_namelen
== sizeof(struct sockaddr_pkt
))
1915 proto
= saddr
->spkt_protocol
;
1917 return -ENOTCONN
; /* SOCK_PACKET must be sent giving an address */
1920 * Find the device first to size check it
1923 saddr
->spkt_device
[sizeof(saddr
->spkt_device
) - 1] = 0;
1926 dev
= dev_get_by_name_rcu(sock_net(sk
), saddr
->spkt_device
);
1932 if (!(dev
->flags
& IFF_UP
))
1936 * You may not queue a frame bigger than the mtu. This is the lowest level
1937 * raw protocol and you must do your own fragmentation at this level.
1940 if (unlikely(sock_flag(sk
, SOCK_NOFCS
))) {
1941 if (!netif_supports_nofcs(dev
)) {
1942 err
= -EPROTONOSUPPORT
;
1945 extra_len
= 4; /* We're doing our own CRC */
1949 if (len
> dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
+ extra_len
)
1953 size_t reserved
= LL_RESERVED_SPACE(dev
);
1954 int tlen
= dev
->needed_tailroom
;
1955 unsigned int hhlen
= dev
->header_ops
? dev
->hard_header_len
: 0;
1958 skb
= sock_wmalloc(sk
, len
+ reserved
+ tlen
, 0, GFP_KERNEL
);
1961 /* FIXME: Save some space for broken drivers that write a hard
1962 * header at transmission time by themselves. PPP is the notable
1963 * one here. This should really be fixed at the driver level.
1965 skb_reserve(skb
, reserved
);
1966 skb_reset_network_header(skb
);
1968 /* Try to align data part correctly */
1973 skb_reset_network_header(skb
);
1975 err
= memcpy_from_msg(skb_put(skb
, len
), msg
, len
);
1981 if (!dev_validate_header(dev
, skb
->data
, len
)) {
1985 if (len
> (dev
->mtu
+ dev
->hard_header_len
+ extra_len
) &&
1986 !packet_extra_vlan_len_allowed(dev
, skb
)) {
1991 sockcm_init(&sockc
, sk
);
1992 if (msg
->msg_controllen
) {
1993 err
= sock_cmsg_send(sk
, msg
, &sockc
);
1998 skb
->protocol
= proto
;
2000 skb
->priority
= sk
->sk_priority
;
2001 skb
->mark
= sk
->sk_mark
;
2002 skb
->tstamp
= sockc
.transmit_time
;
2004 skb_setup_tx_timestamp(skb
, sockc
.tsflags
);
2006 if (unlikely(extra_len
== 4))
2009 packet_parse_headers(skb
, sock
);
2011 dev_queue_xmit(skb
);
2022 static unsigned int run_filter(struct sk_buff
*skb
,
2023 const struct sock
*sk
,
2026 struct sk_filter
*filter
;
2029 filter
= rcu_dereference(sk
->sk_filter
);
2031 res
= bpf_prog_run_clear_cb(filter
->prog
, skb
);
2037 static int packet_rcv_vnet(struct msghdr
*msg
, const struct sk_buff
*skb
,
2040 struct virtio_net_hdr vnet_hdr
;
2042 if (*len
< sizeof(vnet_hdr
))
2044 *len
-= sizeof(vnet_hdr
);
2046 if (virtio_net_hdr_from_skb(skb
, &vnet_hdr
, vio_le(), true, 0))
2049 return memcpy_to_msg(msg
, (void *)&vnet_hdr
, sizeof(vnet_hdr
));
/*
 * This function makes lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
2064 static int packet_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2065 struct packet_type
*pt
, struct net_device
*orig_dev
)
2068 struct sockaddr_ll
*sll
;
2069 struct packet_sock
*po
;
2070 u8
*skb_head
= skb
->data
;
2071 int skb_len
= skb
->len
;
2072 unsigned int snaplen
, res
;
2073 bool is_drop_n_account
= false;
2075 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2078 sk
= pt
->af_packet_priv
;
2081 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2086 if (dev_has_header(dev
)) {
2087 /* The device has an explicit notion of ll header,
2088 * exported to higher levels.
2090 * Otherwise, the device hides details of its frame
2091 * structure, so that corresponding packet head is
2092 * never delivered to user.
2094 if (sk
->sk_type
!= SOCK_DGRAM
)
2095 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2096 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2097 /* Special case: outgoing packets have ll header at head */
2098 skb_pull(skb
, skb_network_offset(skb
));
2104 res
= run_filter(skb
, sk
, snaplen
);
2106 goto drop_n_restore
;
2110 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
)
2113 if (skb_shared(skb
)) {
2114 struct sk_buff
*nskb
= skb_clone(skb
, GFP_ATOMIC
);
2118 if (skb_head
!= skb
->data
) {
2119 skb
->data
= skb_head
;
2126 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb
)) + MAX_ADDR_LEN
- 8);
2128 sll
= &PACKET_SKB_CB(skb
)->sa
.ll
;
2129 sll
->sll_hatype
= dev
->type
;
2130 sll
->sll_pkttype
= skb
->pkt_type
;
2131 if (unlikely(po
->origdev
))
2132 sll
->sll_ifindex
= orig_dev
->ifindex
;
2134 sll
->sll_ifindex
= dev
->ifindex
;
2136 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2138 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2139 * Use their space for storing the original skb length.
2141 PACKET_SKB_CB(skb
)->sa
.origlen
= skb
->len
;
2143 if (pskb_trim(skb
, snaplen
))
2146 skb_set_owner_r(skb
, sk
);
2150 /* drop conntrack reference */
2153 spin_lock(&sk
->sk_receive_queue
.lock
);
2154 po
->stats
.stats1
.tp_packets
++;
2155 sock_skb_set_dropcount(sk
, skb
);
2156 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
2157 spin_unlock(&sk
->sk_receive_queue
.lock
);
2158 sk
->sk_data_ready(sk
);
2162 is_drop_n_account
= true;
2163 atomic_inc(&po
->tp_drops
);
2164 atomic_inc(&sk
->sk_drops
);
2167 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2168 skb
->data
= skb_head
;
2172 if (!is_drop_n_account
)
2179 static int tpacket_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2180 struct packet_type
*pt
, struct net_device
*orig_dev
)
2183 struct packet_sock
*po
;
2184 struct sockaddr_ll
*sll
;
2185 union tpacket_uhdr h
;
2186 u8
*skb_head
= skb
->data
;
2187 int skb_len
= skb
->len
;
2188 unsigned int snaplen
, res
;
2189 unsigned long status
= TP_STATUS_USER
;
2190 unsigned short macoff
, hdrlen
;
2191 unsigned int netoff
;
2192 struct sk_buff
*copy_skb
= NULL
;
2193 struct timespec64 ts
;
2195 bool is_drop_n_account
= false;
2196 unsigned int slot_id
= 0;
2197 bool do_vnet
= false;
2199 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2200 * We may add members to them until current aligned size without forcing
2201 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2203 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h2
)) != 32);
2204 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h3
)) != 48);
2206 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2209 sk
= pt
->af_packet_priv
;
2212 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2215 if (dev_has_header(dev
)) {
2216 if (sk
->sk_type
!= SOCK_DGRAM
)
2217 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2218 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2219 /* Special case: outgoing packets have ll header at head */
2220 skb_pull(skb
, skb_network_offset(skb
));
2226 res
= run_filter(skb
, sk
, snaplen
);
2228 goto drop_n_restore
;
2230 /* If we are flooded, just give up */
2231 if (__packet_rcv_has_room(po
, skb
) == ROOM_NONE
) {
2232 atomic_inc(&po
->tp_drops
);
2233 goto drop_n_restore
;
2236 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2237 status
|= TP_STATUS_CSUMNOTREADY
;
2238 else if (skb
->pkt_type
!= PACKET_OUTGOING
&&
2239 (skb
->ip_summed
== CHECKSUM_COMPLETE
||
2240 skb_csum_unnecessary(skb
)))
2241 status
|= TP_STATUS_CSUM_VALID
;
2246 if (sk
->sk_type
== SOCK_DGRAM
) {
2247 macoff
= netoff
= TPACKET_ALIGN(po
->tp_hdrlen
) + 16 +
2250 unsigned int maclen
= skb_network_offset(skb
);
2251 netoff
= TPACKET_ALIGN(po
->tp_hdrlen
+
2252 (maclen
< 16 ? 16 : maclen
)) +
2254 if (po
->has_vnet_hdr
) {
2255 netoff
+= sizeof(struct virtio_net_hdr
);
2258 macoff
= netoff
- maclen
;
2260 if (netoff
> USHRT_MAX)) {
		atomic_inc(&po->tp_drops);
		goto drop_n_restore;
	}

	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0) {
				snaplen = 0;
				do_vnet = false;
			}
		}
	} else if (unlikely(macoff + snaplen >
			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
		u32 nval;

		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
			    snaplen, nval, macoff);
		snaplen = nval;
		if (unlikely((int)snaplen < 0)) {
			snaplen = 0;
			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
			do_vnet = false;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto drop_n_account;

	if (po->tp_version <= TPACKET_V2) {
		slot_id = po->rx_ring.head;
		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
			goto drop_n_account;
		__set_bit(slot_id, po->rx_ring.rx_owner_map);
	}

	if (do_vnet &&
	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
				    sizeof(struct virtio_net_hdr),
				    vio_le(), true, 0)) {
		if (po->tp_version == TPACKET_V3)
			prb_clear_blk_fill_status(&po->rx_ring);
		goto drop_n_account;
	}

	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (atomic_read(&po->tp_drops))
			status |= TP_STATUS_LOSING;
	}

	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		ktime_get_real_ts64(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (skb_vlan_tag_present(skb)) {
			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
			h.h2->tp_vlan_tpid = 0;
		}
		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset,vlan are already populated above.
		 * So DONT clear those fields here
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	if (po->tp_version <= TPACKET_V2) {
		u8 *start, *end;

		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
					macoff + snaplen);

		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
	smp_wmb();
#endif

	if (po->tp_version <= TPACKET_V2) {
		spin_lock(&sk->sk_receive_queue.lock);
		__packet_set_status(po, h.raw, status);
		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
		spin_unlock(&sk->sk_receive_queue.lock);
		sk->sk_data_ready(sk);
	} else if (po->tp_version == TPACKET_V3) {
		prb_clear_blk_fill_status(&po->rx_ring);
	}

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	if (!is_drop_n_account)
		consume_skb(skb);
	else
		kfree_skb(skb);
	return 0;

drop_n_account:
	spin_unlock(&sk->sk_receive_queue.lock);
	atomic_inc(&po->tp_drops);
	is_drop_n_account = true;

	sk->sk_data_ready(sk);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}

static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_zcopy_get_nouarg(skb);
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);

		if (!packet_read_pending(&po->tx_ring))
			complete(&po->skb_completion);
	}

	sock_wfree(skb);
}

static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
		return -EINVAL;

	return 0;
}

static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr)
{
	if (*len < sizeof(*vnet_hdr))
		return -EINVAL;
	*len -= sizeof(*vnet_hdr);

	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	return __packet_snd_vnet_parse(vnet_hdr, *len);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, void *data, int tp_len,
		__be16 proto, unsigned char *addr, int hlen, int copylen,
		const struct sockcm_cookie *sockc)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb->tstamp = sockc->transmit_time;
	skb_setup_tx_timestamp(skb, sockc->tsflags);
	skb_zcopy_set_nouarg(skb, ph.raw);

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (copylen) {
		int hdrlen = min_t(int, copylen, tp_len);

		skb_push(skb, dev->hard_header_len);
		skb_put(skb, copylen - dev->hard_header_len);
		err = skb_store_bits(skb, 0, data, hdrlen);
		if (unlikely(err))
			return err;
		if (!dev_validate_header(dev, skb->data, hdrlen))
			return -EINVAL;

		data += hdrlen;
		to_write -= hdrlen;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	refcount_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	packet_parse_headers(skb, sock);

	return tp_len;
}

static int tpacket_parse_header(struct packet_sock *po, void *frame,
				int size_max, void **data)
{
	union tpacket_uhdr ph;
	int tp_len, off;

	ph.raw = frame;

	switch (po->tp_version) {
	case TPACKET_V3:
		if (ph.h3->tp_next_offset != 0) {
			pr_warn_once("variable sized slot not supported");
			return -EINVAL;
		}
		tp_len = ph.h3->tp_len;
		break;
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	if (unlikely(po->tp_tx_has_off)) {
		int off_min, off_max;

		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (po->sk.sk_type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_net;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_mac;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
	} else {
		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}

	*data = frame + off;
	return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct virtio_net_hdr *vnet_hdr = NULL;
	struct sockcm_cookie sockc;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
	unsigned char *addr = NULL;
	int tp_len, size_max;
	void *data;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen, copylen = 0;
	long timeo = 0;

	mutex_lock(&po->pg_vec_lock);

	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
	 * we need to confirm it under protection of pg_vec_lock.
	 */
	if (unlikely(!po->tx_ring.pg_vec)) {
		err = -EBUSY;
		goto out;
	}
	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
		if (po->sk.sk_socket->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_put;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	sockcm_init(&sockc, &po->sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(&po->sk, msg, &sockc);
		if (err)
			goto out_put;
	}

	if (po->sk.sk_socket->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
		size_max = dev->mtu + reserve + VLAN_HLEN;

	reinit_completion(&po->skb_completion);

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);
		if (unlikely(ph == NULL)) {
			if (need_wait && skb) {
				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
				if (timeo <= 0) {
					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
					goto out_put;
				}
			}
			/* check for additional frames */
			continue;
		}

		skb = NULL;
		tp_len = tpacket_parse_header(po, ph, size_max, &data);
		if (tp_len < 0)
			goto tpacket_error;

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		if (po->has_vnet_hdr) {
			vnet_hdr = data;
			data += sizeof(*vnet_hdr);
			tp_len -= sizeof(*vnet_hdr);
			if (tp_len < 0 ||
			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			copylen = __virtio16_to_cpu(vio_le(),
						    vnet_hdr->hdr_len);
		}
		copylen = max_t(int, copylen, dev->hard_header_len);
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll) +
				(copylen - dev->hard_header_len),
				!need_wait, &err);

		if (unlikely(skb == NULL)) {
			/* we assume the socket was initially writeable ... */
			if (likely(len_sum > 0))
				err = len_sum;
			goto out_status;
		}
		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
					  addr, hlen, copylen, &sockc);
		if (likely(tp_len >= 0) &&
		    tp_len > dev->mtu + reserve &&
		    !po->has_vnet_hdr &&
		    !packet_extra_vlan_len_allowed(dev, skb))
			tp_len = -EMSGSIZE;

		if (unlikely(tp_len < 0)) {
tpacket_error:
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		if (po->has_vnet_hdr) {
			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			virtio_net_hdr_set_proto(skb, vnet_hdr);
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		packet_inc_pending(&po->tx_ring);

		status = TP_STATUS_SEND_REQUEST;
		err = po->xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition, and luckily don't have to go that path
		 * anyway.
		 */
		 (need_wait && packet_read_pending(&po->tx_ring))));

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
				        size_t reserve, size_t len,
				        size_t linear, int noblock,
				        int *err)
{
	struct sk_buff *skb;

	/* Under a page?  Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr = NULL;
	int err, reserve = 0;
	struct sockcm_cookie sockc;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	struct packet_sock *po = pkt_sk(sk);
	bool has_vnet_hdr = false;
	int hlen, tlen, linear;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
		if (sock->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_unlock;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	sockcm_init(&sockc, sk);
	sockc.mark = sk->sk_mark;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (po->has_vnet_hdr) {
		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
		if (err)
			goto out_unlock;
		has_vnet_hdr = true;
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!vnet_hdr.gso_type &&
	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
	linear = max(linear, min_t(int, len, dev->hard_header_len));
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reset_network_header(skb);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM) {
		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (unlikely(offset < 0))
			goto out_free;
	} else if (reserve) {
		skb_reserve(skb, -reserve);
		if (len < reserve + sizeof(struct ipv6hdr) &&
		    dev->min_header_len != dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
	if (err)
		goto out_free;

	if (sock->type == SOCK_RAW &&
	    !dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_free;
	}

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_free;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sockc.mark;
	skb->tstamp = sockc.transmit_time;

	if (has_vnet_hdr) {
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
		if (err)
			goto out_free;
		len += sizeof(vnet_hdr);
		virtio_net_hdr_set_proto(skb, &vnet_hdr);
	}

	packet_parse_headers(skb, sock);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	err = po->xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct packet_fanout *f;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	lock_sock(sk);
	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}
	release_sock(sk);

	f = fanout_release(sk);

	synchronize_net();

	kfree(po->rollover);
	if (f) {
		fanout_release_data(f);
		kvfree(f);
	}
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
			  __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	struct net_device *dev_curr;
	__be16 proto_curr;
	bool need_rehook;
	struct net_device *dev = NULL;
	int ret = 0;
	bool unlisted = false;

	lock_sock(sk);
	spin_lock(&po->bind_lock);
	rcu_read_lock();

	if (po->fanout) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (name) {
		dev = dev_get_by_name_rcu(sock_net(sk), name);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	} else if (ifindex) {
		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	}

	if (dev)
		dev_hold(dev);

	proto_curr = po->prot_hook.type;
	dev_curr = po->prot_hook.dev;

	need_rehook = proto_curr != proto || dev_curr != dev;

	if (need_rehook) {
		if (po->running) {
			rcu_read_unlock();
			/* prevents packet_notifier() from calling
			 * register_prot_hook()
			 */
			po->num = 0;
			__unregister_prot_hook(sk, true);
			rcu_read_lock();
			dev_curr = po->prot_hook.dev;
			if (dev)
				unlisted = !dev_get_by_index_rcu(sock_net(sk),
								 dev->ifindex);
		}

		BUG_ON(po->running);
		po->num = proto;
		po->prot_hook.type = proto;

		if (unlikely(unlisted)) {
			dev_put(dev);
			po->prot_hook.dev = NULL;
			po->ifindex = -1;
			packet_cached_dev_reset(po);
		} else {
			po->prot_hook.dev = dev;
			po->ifindex = dev ? dev->ifindex : 0;
			packet_cached_dev_assign(po, dev);
		}
	}
	if (dev_curr)
		dev_put(dev_curr);

	if (proto == 0 || !need_rehook)
		goto out_unlock;

	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return ret;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[sizeof(uaddr->sa_data) + 1];

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
	 * zero-terminated.
	 */
	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
	name[sizeof(uaddr->sa_data)] = 0;

	return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	return packet_do_bind(sk, NULL, sll->sll_ifindex,
			      sll->sll_protocol ? : pkt_sk(sk)->num);
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	init_completion(&po->skb_completion);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	po->xmit = dev_queue_xmit;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->rollover = NULL;
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		__register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_tail_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = 0;
	unsigned int origlen = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	packet_rcv_try_clear_pressure(pkt_sk(sk));

	if (pkt_sk(sk)->has_vnet_hdr) {
		err = packet_rcv_vnet(msg, skb, &len);
		if (err)
			goto out_free;
		vnet_hdr_len = sizeof(struct virtio_net_hdr);
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free;

	if (sock->type != SOCK_PACKET) {
		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

		/* Original length was stored in sockaddr_ll fields */
		origlen = PACKET_SKB_CB(skb)->sa.origlen;
		sll->sll_family = AF_PACKET;
		sll->sll_protocol = skb->protocol;
	}

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		int copy_len;

		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
			copy_len = msg->msg_namelen;
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
			copy_len = msg->msg_namelen;
			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
				memset(msg->msg_name +
				       offsetof(struct sockaddr_ll, sll_addr),
				       0, sizeof(sll->sll_addr));
				msg->msg_namelen = sizeof(struct sockaddr_ll);
			}
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
	}

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		else if (skb->pkt_type != PACKET_OUTGOING &&
			 (skb->ip_summed == CHECKSUM_COMPLETE ||
			  skb_csum_unnecessary(skb)))
			aux.tp_status |= TP_STATUS_CSUM_VALID;

		aux.tp_len = origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (skb_vlan_tag_present(skb)) {
			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			aux.tp_vlan_tci = 0;
			aux.tp_vlan_tpid = 0;
		}
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();

	return sizeof(*uaddr);
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();

	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist_delete(struct net_device *dev,
				     struct packet_mclist **mlp)
{
	struct packet_mclist *ml;

	while ((ml = *mlp) != NULL) {
		if (ml->ifindex == dev->ifindex) {
			packet_dev_mc(dev, ml, -1);
			*mlp = ml->next;
			kfree(ml);
		} else
			mlp = &ml->next;
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			break;
		}
	}
	rtnl_unlock();
	return 0;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
		  unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_sockptr(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		lock_sock(sk);
		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len) {
			ret = -EINVAL;
		} else {
			if (copy_from_sockptr(&req_u.req, optval, len))
				ret = -EFAULT;
			else
				ret = packet_set_ring(sk, &req_u, 0,
						    optname == PACKET_TX_RING);
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			break;
		default:
			return -EINVAL;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_version = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val > INT_MAX)
			return -EINVAL;
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_reserve = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_loss = !!val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		po->auxdata = !!val;
		release_sock(sk);
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		po->origdev = !!val;
		release_sock(sk);
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->has_vnet_hdr = !!val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		struct fanout_args args = { 0 };

		if (optlen != sizeof(int) && optlen != sizeof(args))
			return -EINVAL;
		if (copy_from_sockptr(&args, optval, optlen))
			return -EFAULT;

		return fanout_add(sk, &args);
	}
	case PACKET_FANOUT_DATA:
	{
		if (!po->fanout)
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_IGNORE_OUTGOING:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val < 0 || val > 1)
			return -EINVAL;

		po->prot_hook.ignore_outgoing = !!val;
		return 0;
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_tx_has_off = !!val;
			ret = 0;
		}
		release_sock(sk);
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;
	int drops;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		drops = atomic_xchg(&po->tp_drops, 0);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_drops = drops;
			st.stats3.tp_packets += drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_drops = drops;
			st.stats1.tp_packets += drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (len < sizeof(int))
			return -EINVAL;
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_IGNORE_OUTGOING:
		val = po->prot_hook.ignore_outgoing;
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}

static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			fallthrough;

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}

static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

static __poll_t packet_poll(struct file *file, struct socket *sock,
			    poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	__poll_t mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= EPOLLIN | EPOLLRDNORM;
	}
	packet_rcv_try_clear_pressure(po);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}

/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long *rx_owner_map = NULL;
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		unsigned int min_frame_size;

		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		min_frame_size = po->tp_hdrlen + po->tp_reserve;
		if (po->tp_version >= TPACKET_V3 &&
		    req->tp_block_size <
		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
			goto out;
		if (unlikely(req->tp_frame_size < min_frame_size))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Block transmit is not supported yet */
			if (!tx_ring) {
				init_prb_bdqc(po, rb, pg_vec, req_u);
			} else {
				struct tpacket_req3 *req3 = &req_u->req3;

				if (req3->tp_retire_blk_tov ||
				    req3->tp_sizeof_priv ||
				    req3->tp_feature_req_word) {
					err = -EINVAL;
					goto out_free_pg_vec;
				}
			}
			break;
		default:
			if (!tx_ring) {
				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
				if (!rx_owner_map)
					goto out_free_pg_vec;
			}
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		if (po->tp_version <= TPACKET_V2)
			swap(rb->rx_owner_map, rx_owner_map);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (pg_vec && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

out_free_pg_vec:
	if (pg_vec) {
		bitmap_free(rx_owner_map);
		free_pg_vec(pg_vec, order, req->tp_block_nr);
	}
out:
	return err;
}

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;
#endif /* CONFIG_PROC_FS */

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);