// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"
/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     does not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
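/* Helpers that map a packet_ring_buffer onto its TPACKET_V3 kernel-side
 * block-descriptor queue (tpacket_kbdq_core) and its per-block descriptors.
 * GET_NEXT_PRB_BLK_NUM() computes the index of the block after the currently
 * active one, wrapping around at the end of the ring.
 */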
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}
static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}
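/* Pick the tx queue for an outgoing skb: prefer the driver's
 * ndo_select_queue() callback when it exists (clamping the result to a
 * valid queue index), otherwise fall back to the generic netdev_pick_tx().
 */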
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

#ifdef CONFIG_XPS
	skb->sender_cpu = cpu + 1;
#endif
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}
/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
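/* Translate a linear frame index into a frame address inside the pg_vec:
 * the index selects a block (position / frames_per_block) and an offset
 * within it.  The frame is returned only if its status word matches the
 * status the caller expects, otherwise NULL.
 */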
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}
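/* Derive the block retire timeout from the device's link speed (queried via
 * ethtool) and the configured block size.  If the device is gone, slower
 * than 1Gbps, or reports an unknown speed, fall back to the default timeout.
 */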
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo + 1;
	return tmo;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
/*  Do NOT update the last_blk_num first.
 *  Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer; thawing/timer-refresh is a side
				 * effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
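/* Publish a finished block to user space: flush the block body (on
 * architectures that need flush_dcache_page()), then update the block
 * status word and flush the header page so user space sees a consistent
 * view of the block.
 */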
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}
/* Side effects of closing a block:
 *
 * 1) The block is flushed and handed to user space.
 * 2) kactive_blk_num is advanced to the next block.
 *
 * Note: we deliberately do NOT refresh the timer here,
 * because almost always the next block will be opened right away.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;

		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}
/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *
 *        __packet_lookup_frame_in_block
 *              prb_retire_current_block()
 *              prb_dispatch_next_block()
 *                |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			     struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
872 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
875 * If the next block is free then we will dispatch it
876 * and return a good offset.
877 * Else, we will freeze the queue.
878 * So, caller must check the return value.
880 static void *prb_dispatch_next_block(struct tpacket_kbdq_core
*pkc
,
881 struct packet_sock
*po
)
883 struct tpacket_block_desc
*pbd
;
887 /* 1. Get current block num */
888 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
890 /* 2. If this block is currently in_use then freeze the queue */
891 if (TP_STATUS_USER
& BLOCK_STATUS(pbd
)) {
892 prb_freeze_queue(pkc
, po
);
898 * open this block and return the offset where the first packet
899 * needs to get stored.
901 prb_open_block(pkc
, pbd
);
902 return (void *)pkc
->nxt_offset
;
905 static void prb_retire_current_block(struct tpacket_kbdq_core
*pkc
,
906 struct packet_sock
*po
, unsigned int status
)
908 struct tpacket_block_desc
*pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
910 /* retire/close the current block */
911 if (likely(TP_STATUS_KERNEL
== BLOCK_STATUS(pbd
))) {
913 * Plug the case where copy_bits() is in progress on
914 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
915 * have space to copy the pkt in the current block and
916 * called prb_retire_current_block()
918 * We don't need to worry about the TMO case because
919 * the timer-handler already handled this case.
921 if (!(status
& TP_STATUS_BLK_TMO
)) {
922 while (atomic_read(&pkc
->blk_fill_in_prog
)) {
923 /* Waiting for skb_copy_bits to finish... */
927 prb_close_block(pkc
, pbd
, po
, status
);
932 static int prb_curr_blk_in_use(struct tpacket_block_desc
*pbd
)
934 return TP_STATUS_USER
& BLOCK_STATUS(pbd
);
937 static int prb_queue_frozen(struct tpacket_kbdq_core
*pkc
)
939 return pkc
->reset_pending_on_curr_blk
;
942 static void prb_clear_blk_fill_status(struct packet_ring_buffer
*rb
)
944 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
945 atomic_dec(&pkc
->blk_fill_in_prog
);
948 static void prb_fill_rxhash(struct tpacket_kbdq_core
*pkc
,
949 struct tpacket3_hdr
*ppd
)
951 ppd
->hv1
.tp_rxhash
= skb_get_hash(pkc
->skb
);
954 static void prb_clear_rxhash(struct tpacket_kbdq_core
*pkc
,
955 struct tpacket3_hdr
*ppd
)
957 ppd
->hv1
.tp_rxhash
= 0;
960 static void prb_fill_vlan_info(struct tpacket_kbdq_core
*pkc
,
961 struct tpacket3_hdr
*ppd
)
963 if (skb_vlan_tag_present(pkc
->skb
)) {
964 ppd
->hv1
.tp_vlan_tci
= skb_vlan_tag_get(pkc
->skb
);
965 ppd
->hv1
.tp_vlan_tpid
= ntohs(pkc
->skb
->vlan_proto
);
966 ppd
->tp_status
= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
968 ppd
->hv1
.tp_vlan_tci
= 0;
969 ppd
->hv1
.tp_vlan_tpid
= 0;
970 ppd
->tp_status
= TP_STATUS_AVAILABLE
;
974 static void prb_run_all_ft_ops(struct tpacket_kbdq_core
*pkc
,
975 struct tpacket3_hdr
*ppd
)
977 ppd
->hv1
.tp_padding
= 0;
978 prb_fill_vlan_info(pkc
, ppd
);
980 if (pkc
->feature_req_word
& TP_FT_REQ_FILL_RXHASH
)
981 prb_fill_rxhash(pkc
, ppd
);
983 prb_clear_rxhash(pkc
, ppd
);
986 static void prb_fill_curr_block(char *curr
,
987 struct tpacket_kbdq_core
*pkc
,
988 struct tpacket_block_desc
*pbd
,
991 struct tpacket3_hdr
*ppd
;
993 ppd
= (struct tpacket3_hdr
*)curr
;
994 ppd
->tp_next_offset
= TOTAL_PKT_LEN_INCL_ALIGN(len
);
996 pkc
->nxt_offset
+= TOTAL_PKT_LEN_INCL_ALIGN(len
);
997 BLOCK_LEN(pbd
) += TOTAL_PKT_LEN_INCL_ALIGN(len
);
998 BLOCK_NUM_PKTS(pbd
) += 1;
999 atomic_inc(&pkc
->blk_fill_in_prog
);
1000 prb_run_all_ft_ops(pkc
, ppd
);
1003 /* Assumes caller has the sk->rx_queue.lock */
1004 static void *__packet_lookup_frame_in_block(struct packet_sock
*po
,
1005 struct sk_buff
*skb
,
1010 struct tpacket_kbdq_core
*pkc
;
1011 struct tpacket_block_desc
*pbd
;
1014 pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
1015 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1017 /* Queue is frozen when user space is lagging behind */
1018 if (prb_queue_frozen(pkc
)) {
1020 * Check if that last block which caused the queue to freeze,
1021 * is still in_use by user-space.
1023 if (prb_curr_blk_in_use(pbd
)) {
1024 /* Can't record this packet */
1028 * Ok, the block was released by user-space.
1029 * Now let's open that block.
1030 * opening a block also thaws the queue.
1031 * Thawing is a side effect.
1033 prb_open_block(pkc
, pbd
);
1038 curr
= pkc
->nxt_offset
;
1040 end
= (char *)pbd
+ pkc
->kblk_size
;
1042 /* first try the current block */
1043 if (curr
+TOTAL_PKT_LEN_INCL_ALIGN(len
) < end
) {
1044 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1045 return (void *)curr
;
1048 /* Ok, close the current block */
1049 prb_retire_current_block(pkc
, po
, 0);
1051 /* Now, try to dispatch the next block */
1052 curr
= (char *)prb_dispatch_next_block(pkc
, po
);
1054 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1055 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1056 return (void *)curr
;
1060 * No free blocks are available.user_space hasn't caught up yet.
1061 * Queue was just frozen and now this packet will get dropped.
1066 static void *packet_current_rx_frame(struct packet_sock
*po
,
1067 struct sk_buff
*skb
,
1068 int status
, unsigned int len
)
1071 switch (po
->tp_version
) {
1074 curr
= packet_lookup_frame(po
, &po
->rx_ring
,
1075 po
->rx_ring
.head
, status
);
1078 return __packet_lookup_frame_in_block(po
, skb
, status
, len
);
1080 WARN(1, "TPACKET version not supported\n");
1086 static void *prb_lookup_block(struct packet_sock
*po
,
1087 struct packet_ring_buffer
*rb
,
1091 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
1092 struct tpacket_block_desc
*pbd
= GET_PBLOCK_DESC(pkc
, idx
);
1094 if (status
!= BLOCK_STATUS(pbd
))
1099 static int prb_previous_blk_num(struct packet_ring_buffer
*rb
)
1102 if (rb
->prb_bdqc
.kactive_blk_num
)
1103 prev
= rb
->prb_bdqc
.kactive_blk_num
-1;
1105 prev
= rb
->prb_bdqc
.knum_blocks
-1;
1109 /* Assumes caller has held the rx_queue.lock */
1110 static void *__prb_previous_block(struct packet_sock
*po
,
1111 struct packet_ring_buffer
*rb
,
1114 unsigned int previous
= prb_previous_blk_num(rb
);
1115 return prb_lookup_block(po
, rb
, previous
, status
);
1118 static void *packet_previous_rx_frame(struct packet_sock
*po
,
1119 struct packet_ring_buffer
*rb
,
1122 if (po
->tp_version
<= TPACKET_V2
)
1123 return packet_previous_frame(po
, rb
, status
);
1125 return __prb_previous_block(po
, rb
, status
);
1128 static void packet_increment_rx_head(struct packet_sock
*po
,
1129 struct packet_ring_buffer
*rb
)
1131 switch (po
->tp_version
) {
1134 return packet_increment_head(rb
);
1137 WARN(1, "TPACKET version not supported.\n");
1143 static void *packet_previous_frame(struct packet_sock
*po
,
1144 struct packet_ring_buffer
*rb
,
1147 unsigned int previous
= rb
->head
? rb
->head
- 1 : rb
->frame_max
;
1148 return packet_lookup_frame(po
, rb
, previous
, status
);
1151 static void packet_increment_head(struct packet_ring_buffer
*buff
)
1153 buff
->head
= buff
->head
!= buff
->frame_max
? buff
->head
+1 : 0;
1156 static void packet_inc_pending(struct packet_ring_buffer
*rb
)
1158 this_cpu_inc(*rb
->pending_refcnt
);
1161 static void packet_dec_pending(struct packet_ring_buffer
*rb
)
1163 this_cpu_dec(*rb
->pending_refcnt
);
1166 static unsigned int packet_read_pending(const struct packet_ring_buffer
*rb
)
1168 unsigned int refcnt
= 0;
1171 /* We don't use pending refcount in rx_ring. */
1172 if (rb
->pending_refcnt
== NULL
)
1175 for_each_possible_cpu(cpu
)
1176 refcnt
+= *per_cpu_ptr(rb
->pending_refcnt
, cpu
);
1181 static int packet_alloc_pending(struct packet_sock
*po
)
1183 po
->rx_ring
.pending_refcnt
= NULL
;
1185 po
->tx_ring
.pending_refcnt
= alloc_percpu(unsigned int);
1186 if (unlikely(po
->tx_ring
.pending_refcnt
== NULL
))
1192 static void packet_free_pending(struct packet_sock
*po
)
1194 free_percpu(po
->tx_ring
.pending_refcnt
);
1197 #define ROOM_POW_OFF 2
1198 #define ROOM_NONE 0x0
1199 #define ROOM_LOW 0x1
1200 #define ROOM_NORMAL 0x2
1202 static bool __tpacket_has_room(struct packet_sock
*po
, int pow_off
)
1206 len
= po
->rx_ring
.frame_max
+ 1;
1207 idx
= po
->rx_ring
.head
;
1209 idx
+= len
>> pow_off
;
1212 return packet_lookup_frame(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1215 static bool __tpacket_v3_has_room(struct packet_sock
*po
, int pow_off
)
1219 len
= po
->rx_ring
.prb_bdqc
.knum_blocks
;
1220 idx
= po
->rx_ring
.prb_bdqc
.kactive_blk_num
;
1222 idx
+= len
>> pow_off
;
1225 return prb_lookup_block(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1228 static int __packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1230 struct sock
*sk
= &po
->sk
;
1231 int ret
= ROOM_NONE
;
1233 if (po
->prot_hook
.func
!= tpacket_rcv
) {
1234 int avail
= sk
->sk_rcvbuf
- atomic_read(&sk
->sk_rmem_alloc
)
1235 - (skb
? skb
->truesize
: 0);
1236 if (avail
> (sk
->sk_rcvbuf
>> ROOM_POW_OFF
))
1244 if (po
->tp_version
== TPACKET_V3
) {
1245 if (__tpacket_v3_has_room(po
, ROOM_POW_OFF
))
1247 else if (__tpacket_v3_has_room(po
, 0))
1250 if (__tpacket_has_room(po
, ROOM_POW_OFF
))
1252 else if (__tpacket_has_room(po
, 0))
1259 static int packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1264 spin_lock_bh(&po
->sk
.sk_receive_queue
.lock
);
1265 ret
= __packet_rcv_has_room(po
, skb
);
1266 has_room
= ret
== ROOM_NORMAL
;
1267 if (po
->pressure
== has_room
)
1268 po
->pressure
= !has_room
;
1269 spin_unlock_bh(&po
->sk
.sk_receive_queue
.lock
);
1274 static void packet_sock_destruct(struct sock
*sk
)
1276 skb_queue_purge(&sk
->sk_error_queue
);
1278 WARN_ON(atomic_read(&sk
->sk_rmem_alloc
));
1279 WARN_ON(refcount_read(&sk
->sk_wmem_alloc
));
1281 if (!sock_flag(sk
, SOCK_DEAD
)) {
1282 pr_err("Attempt to release alive packet socket: %p\n", sk
);
1286 sk_refcnt_debug_dec(sk
);
1289 static bool fanout_flow_is_huge(struct packet_sock
*po
, struct sk_buff
*skb
)
1294 rxhash
= skb_get_hash(skb
);
1295 for (i
= 0; i
< ROLLOVER_HLEN
; i
++)
1296 if (po
->rollover
->history
[i
] == rxhash
)
1299 po
->rollover
->history
[prandom_u32() % ROLLOVER_HLEN
] = rxhash
;
1300 return count
> (ROLLOVER_HLEN
>> 1);
1303 static unsigned int fanout_demux_hash(struct packet_fanout
*f
,
1304 struct sk_buff
*skb
,
1307 return reciprocal_scale(__skb_get_hash_symmetric(skb
), num
);
1310 static unsigned int fanout_demux_lb(struct packet_fanout
*f
,
1311 struct sk_buff
*skb
,
1314 unsigned int val
= atomic_inc_return(&f
->rr_cur
);
1319 static unsigned int fanout_demux_cpu(struct packet_fanout
*f
,
1320 struct sk_buff
*skb
,
1323 return smp_processor_id() % num
;
1326 static unsigned int fanout_demux_rnd(struct packet_fanout
*f
,
1327 struct sk_buff
*skb
,
1330 return prandom_u32_max(num
);
1333 static unsigned int fanout_demux_rollover(struct packet_fanout
*f
,
1334 struct sk_buff
*skb
,
1335 unsigned int idx
, bool try_self
,
1338 struct packet_sock
*po
, *po_next
, *po_skip
= NULL
;
1339 unsigned int i
, j
, room
= ROOM_NONE
;
1341 po
= pkt_sk(f
->arr
[idx
]);
1344 room
= packet_rcv_has_room(po
, skb
);
1345 if (room
== ROOM_NORMAL
||
1346 (room
== ROOM_LOW
&& !fanout_flow_is_huge(po
, skb
)))
1351 i
= j
= min_t(int, po
->rollover
->sock
, num
- 1);
1353 po_next
= pkt_sk(f
->arr
[i
]);
1354 if (po_next
!= po_skip
&& !po_next
->pressure
&&
1355 packet_rcv_has_room(po_next
, skb
) == ROOM_NORMAL
) {
1357 po
->rollover
->sock
= i
;
1358 atomic_long_inc(&po
->rollover
->num
);
1359 if (room
== ROOM_LOW
)
1360 atomic_long_inc(&po
->rollover
->num_huge
);
1368 atomic_long_inc(&po
->rollover
->num_failed
);
1372 static unsigned int fanout_demux_qm(struct packet_fanout
*f
,
1373 struct sk_buff
*skb
,
1376 return skb_get_queue_mapping(skb
) % num
;
1379 static unsigned int fanout_demux_bpf(struct packet_fanout
*f
,
1380 struct sk_buff
*skb
,
1383 struct bpf_prog
*prog
;
1384 unsigned int ret
= 0;
1387 prog
= rcu_dereference(f
->bpf_prog
);
1389 ret
= bpf_prog_run_clear_cb(prog
, skb
) % num
;
1395 static bool fanout_has_flag(struct packet_fanout
*f
, u16 flag
)
1397 return f
->flags
& (flag
>> 8);
1400 static int packet_rcv_fanout(struct sk_buff
*skb
, struct net_device
*dev
,
1401 struct packet_type
*pt
, struct net_device
*orig_dev
)
1403 struct packet_fanout
*f
= pt
->af_packet_priv
;
1404 unsigned int num
= READ_ONCE(f
->num_members
);
1405 struct net
*net
= read_pnet(&f
->net
);
1406 struct packet_sock
*po
;
1409 if (!net_eq(dev_net(dev
), net
) || !num
) {
1414 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_DEFRAG
)) {
1415 skb
= ip_check_defrag(net
, skb
, IP_DEFRAG_AF_PACKET
);
1420 case PACKET_FANOUT_HASH
:
1422 idx
= fanout_demux_hash(f
, skb
, num
);
1424 case PACKET_FANOUT_LB
:
1425 idx
= fanout_demux_lb(f
, skb
, num
);
1427 case PACKET_FANOUT_CPU
:
1428 idx
= fanout_demux_cpu(f
, skb
, num
);
1430 case PACKET_FANOUT_RND
:
1431 idx
= fanout_demux_rnd(f
, skb
, num
);
1433 case PACKET_FANOUT_QM
:
1434 idx
= fanout_demux_qm(f
, skb
, num
);
1436 case PACKET_FANOUT_ROLLOVER
:
1437 idx
= fanout_demux_rollover(f
, skb
, 0, false, num
);
1439 case PACKET_FANOUT_CBPF
:
1440 case PACKET_FANOUT_EBPF
:
1441 idx
= fanout_demux_bpf(f
, skb
, num
);
1445 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_ROLLOVER
))
1446 idx
= fanout_demux_rollover(f
, skb
, idx
, true, num
);
1448 po
= pkt_sk(f
->arr
[idx
]);
1449 return po
->prot_hook
.func(skb
, dev
, &po
->prot_hook
, orig_dev
);
1452 DEFINE_MUTEX(fanout_mutex
);
1453 EXPORT_SYMBOL_GPL(fanout_mutex
);
1454 static LIST_HEAD(fanout_list
);
1455 static u16 fanout_next_id
;
1457 static void __fanout_link(struct sock
*sk
, struct packet_sock
*po
)
1459 struct packet_fanout
*f
= po
->fanout
;
1461 spin_lock(&f
->lock
);
1462 f
->arr
[f
->num_members
] = sk
;
1465 if (f
->num_members
== 1)
1466 dev_add_pack(&f
->prot_hook
);
1467 spin_unlock(&f
->lock
);
1470 static void __fanout_unlink(struct sock
*sk
, struct packet_sock
*po
)
1472 struct packet_fanout
*f
= po
->fanout
;
1475 spin_lock(&f
->lock
);
1476 for (i
= 0; i
< f
->num_members
; i
++) {
1477 if (f
->arr
[i
] == sk
)
1480 BUG_ON(i
>= f
->num_members
);
1481 f
->arr
[i
] = f
->arr
[f
->num_members
- 1];
1483 if (f
->num_members
== 0)
1484 __dev_remove_pack(&f
->prot_hook
);
1485 spin_unlock(&f
->lock
);
1488 static bool match_fanout_group(struct packet_type
*ptype
, struct sock
*sk
)
1490 if (sk
->sk_family
!= PF_PACKET
)
1493 return ptype
->af_packet_priv
== pkt_sk(sk
)->fanout
;
1496 static void fanout_init_data(struct packet_fanout
*f
)
1499 case PACKET_FANOUT_LB
:
1500 atomic_set(&f
->rr_cur
, 0);
1502 case PACKET_FANOUT_CBPF
:
1503 case PACKET_FANOUT_EBPF
:
1504 RCU_INIT_POINTER(f
->bpf_prog
, NULL
);
1509 static void __fanout_set_data_bpf(struct packet_fanout
*f
, struct bpf_prog
*new)
1511 struct bpf_prog
*old
;
1513 spin_lock(&f
->lock
);
1514 old
= rcu_dereference_protected(f
->bpf_prog
, lockdep_is_held(&f
->lock
));
1515 rcu_assign_pointer(f
->bpf_prog
, new);
1516 spin_unlock(&f
->lock
);
1520 bpf_prog_destroy(old
);
1524 static int fanout_set_data_cbpf(struct packet_sock
*po
, char __user
*data
,
1527 struct bpf_prog
*new;
1528 struct sock_fprog fprog
;
1531 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1533 if (len
!= sizeof(fprog
))
1535 if (copy_from_user(&fprog
, data
, len
))
1538 ret
= bpf_prog_create_from_user(&new, &fprog
, NULL
, false);
1542 __fanout_set_data_bpf(po
->fanout
, new);
1546 static int fanout_set_data_ebpf(struct packet_sock
*po
, char __user
*data
,
1549 struct bpf_prog
*new;
1552 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1554 if (len
!= sizeof(fd
))
1556 if (copy_from_user(&fd
, data
, len
))
1559 new = bpf_prog_get_type(fd
, BPF_PROG_TYPE_SOCKET_FILTER
);
1561 return PTR_ERR(new);
1563 __fanout_set_data_bpf(po
->fanout
, new);
1567 static int fanout_set_data(struct packet_sock
*po
, char __user
*data
,
1570 switch (po
->fanout
->type
) {
1571 case PACKET_FANOUT_CBPF
:
1572 return fanout_set_data_cbpf(po
, data
, len
);
1573 case PACKET_FANOUT_EBPF
:
1574 return fanout_set_data_ebpf(po
, data
, len
);
1580 static void fanout_release_data(struct packet_fanout
*f
)
1583 case PACKET_FANOUT_CBPF
:
1584 case PACKET_FANOUT_EBPF
:
1585 __fanout_set_data_bpf(f
, NULL
);
1589 static bool __fanout_id_is_free(struct sock
*sk
, u16 candidate_id
)
1591 struct packet_fanout
*f
;
1593 list_for_each_entry(f
, &fanout_list
, list
) {
1594 if (f
->id
== candidate_id
&&
1595 read_pnet(&f
->net
) == sock_net(sk
)) {
1602 static bool fanout_find_new_id(struct sock
*sk
, u16
*new_id
)
1604 u16 id
= fanout_next_id
;
1607 if (__fanout_id_is_free(sk
, id
)) {
1609 fanout_next_id
= id
+ 1;
1614 } while (id
!= fanout_next_id
);
1619 static int fanout_add(struct sock
*sk
, u16 id
, u16 type_flags
)
1621 struct packet_rollover
*rollover
= NULL
;
1622 struct packet_sock
*po
= pkt_sk(sk
);
1623 struct packet_fanout
*f
, *match
;
1624 u8 type
= type_flags
& 0xff;
1625 u8 flags
= type_flags
>> 8;
1629 case PACKET_FANOUT_ROLLOVER
:
1630 if (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)
1632 case PACKET_FANOUT_HASH
:
1633 case PACKET_FANOUT_LB
:
1634 case PACKET_FANOUT_CPU
:
1635 case PACKET_FANOUT_RND
:
1636 case PACKET_FANOUT_QM
:
1637 case PACKET_FANOUT_CBPF
:
1638 case PACKET_FANOUT_EBPF
:
1644 mutex_lock(&fanout_mutex
);
1650 if (type
== PACKET_FANOUT_ROLLOVER
||
1651 (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)) {
1653 rollover
= kzalloc(sizeof(*rollover
), GFP_KERNEL
);
1656 atomic_long_set(&rollover
->num
, 0);
1657 atomic_long_set(&rollover
->num_huge
, 0);
1658 atomic_long_set(&rollover
->num_failed
, 0);
1661 if (type_flags
& PACKET_FANOUT_FLAG_UNIQUEID
) {
1666 if (!fanout_find_new_id(sk
, &id
)) {
1670 /* ephemeral flag for the first socket in the group: drop it */
1671 flags
&= ~(PACKET_FANOUT_FLAG_UNIQUEID
>> 8);
1675 list_for_each_entry(f
, &fanout_list
, list
) {
1677 read_pnet(&f
->net
) == sock_net(sk
)) {
1683 if (match
&& match
->flags
!= flags
)
1687 match
= kzalloc(sizeof(*match
), GFP_KERNEL
);
1690 write_pnet(&match
->net
, sock_net(sk
));
1693 match
->flags
= flags
;
1694 INIT_LIST_HEAD(&match
->list
);
1695 spin_lock_init(&match
->lock
);
1696 refcount_set(&match
->sk_ref
, 0);
1697 fanout_init_data(match
);
1698 match
->prot_hook
.type
= po
->prot_hook
.type
;
1699 match
->prot_hook
.dev
= po
->prot_hook
.dev
;
1700 match
->prot_hook
.func
= packet_rcv_fanout
;
1701 match
->prot_hook
.af_packet_priv
= match
;
1702 match
->prot_hook
.id_match
= match_fanout_group
;
1703 list_add(&match
->list
, &fanout_list
);
1707 spin_lock(&po
->bind_lock
);
1709 match
->type
== type
&&
1710 match
->prot_hook
.type
== po
->prot_hook
.type
&&
1711 match
->prot_hook
.dev
== po
->prot_hook
.dev
) {
1713 if (refcount_read(&match
->sk_ref
) < PACKET_FANOUT_MAX
) {
1714 __dev_remove_pack(&po
->prot_hook
);
1716 po
->rollover
= rollover
;
1718 refcount_set(&match
->sk_ref
, refcount_read(&match
->sk_ref
) + 1);
1719 __fanout_link(sk
, po
);
1723 spin_unlock(&po
->bind_lock
);
1725 if (err
&& !refcount_read(&match
->sk_ref
)) {
1726 list_del(&match
->list
);
1732 mutex_unlock(&fanout_mutex
);
1736 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1737 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1738 * It is the responsibility of the caller to call fanout_release_data() and
1739 * free the returned packet_fanout (after synchronize_net())
1741 static struct packet_fanout
*fanout_release(struct sock
*sk
)
1743 struct packet_sock
*po
= pkt_sk(sk
);
1744 struct packet_fanout
*f
;
1746 mutex_lock(&fanout_mutex
);
1751 if (refcount_dec_and_test(&f
->sk_ref
))
1756 mutex_unlock(&fanout_mutex
);
1761 static bool packet_extra_vlan_len_allowed(const struct net_device
*dev
,
1762 struct sk_buff
*skb
)
1764 /* Earlier code assumed this would be a VLAN pkt, double-check
1765 * this now that we have the actual packet in hand. We can only
1766 * do this check on Ethernet devices.
1768 if (unlikely(dev
->type
!= ARPHRD_ETHER
))
1771 skb_reset_mac_header(skb
);
1772 return likely(eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
));
1775 static const struct proto_ops packet_ops
;
1777 static const struct proto_ops packet_ops_spkt
;
1779 static int packet_rcv_spkt(struct sk_buff
*skb
, struct net_device
*dev
,
1780 struct packet_type
*pt
, struct net_device
*orig_dev
)
1783 struct sockaddr_pkt
*spkt
;
1786 * When we registered the protocol we saved the socket in the data
1787 * field for just this event.
1790 sk
= pt
->af_packet_priv
;
1793 * Yank back the headers [hope the device set this
1794 * right or kerboom...]
1796 * Incoming packets have ll header pulled,
1799 * For outgoing ones skb->data == skb_mac_header(skb)
1800 * so that this procedure is noop.
1803 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1806 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1809 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1813 /* drop any routing info */
1816 /* drop conntrack reference */
1819 spkt
= &PACKET_SKB_CB(skb
)->sa
.pkt
;
1821 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1824 * The SOCK_PACKET socket receives _all_ frames.
1827 spkt
->spkt_family
= dev
->type
;
1828 strlcpy(spkt
->spkt_device
, dev
->name
, sizeof(spkt
->spkt_device
));
1829 spkt
->spkt_protocol
= skb
->protocol
;
1832 * Charge the memory to the socket. This is done specifically
1833 * to prevent sockets using all the memory up.
1836 if (sock_queue_rcv_skb(sk
, skb
) == 0)
1845 static void packet_parse_headers(struct sk_buff
*skb
, struct socket
*sock
)
1847 if ((!skb
->protocol
|| skb
->protocol
== htons(ETH_P_ALL
)) &&
1848 sock
->type
== SOCK_RAW
) {
1849 skb_reset_mac_header(skb
);
1850 skb
->protocol
= dev_parse_header_protocol(skb
);
1853 skb_probe_transport_header(skb
);
1857 * Output a raw packet to a device layer. This bypasses all the other
1858 * protocol layers and you must therefore supply it with a complete frame
1861 static int packet_sendmsg_spkt(struct socket
*sock
, struct msghdr
*msg
,
1864 struct sock
*sk
= sock
->sk
;
1865 DECLARE_SOCKADDR(struct sockaddr_pkt
*, saddr
, msg
->msg_name
);
1866 struct sk_buff
*skb
= NULL
;
1867 struct net_device
*dev
;
1868 struct sockcm_cookie sockc
;
1874 * Get and verify the address.
1878 if (msg
->msg_namelen
< sizeof(struct sockaddr
))
1880 if (msg
->msg_namelen
== sizeof(struct sockaddr_pkt
))
1881 proto
= saddr
->spkt_protocol
;
1883 return -ENOTCONN
; /* SOCK_PACKET must be sent giving an address */
1886 * Find the device first to size check it
1889 saddr
->spkt_device
[sizeof(saddr
->spkt_device
) - 1] = 0;
1892 dev
= dev_get_by_name_rcu(sock_net(sk
), saddr
->spkt_device
);
1898 if (!(dev
->flags
& IFF_UP
))
1902 * You may not queue a frame bigger than the mtu. This is the lowest level
1903 * raw protocol and you must do your own fragmentation at this level.
1906 if (unlikely(sock_flag(sk
, SOCK_NOFCS
))) {
1907 if (!netif_supports_nofcs(dev
)) {
1908 err
= -EPROTONOSUPPORT
;
1911 extra_len
= 4; /* We're doing our own CRC */
1915 if (len
> dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
+ extra_len
)
1919 size_t reserved
= LL_RESERVED_SPACE(dev
);
1920 int tlen
= dev
->needed_tailroom
;
1921 unsigned int hhlen
= dev
->header_ops
? dev
->hard_header_len
: 0;
1924 skb
= sock_wmalloc(sk
, len
+ reserved
+ tlen
, 0, GFP_KERNEL
);
1927 /* FIXME: Save some space for broken drivers that write a hard
1928 * header at transmission time by themselves. PPP is the notable
1929 * one here. This should really be fixed at the driver level.
1931 skb_reserve(skb
, reserved
);
1932 skb_reset_network_header(skb
);
1934 /* Try to align data part correctly */
1939 skb_reset_network_header(skb
);
1941 err
= memcpy_from_msg(skb_put(skb
, len
), msg
, len
);
1947 if (!dev_validate_header(dev
, skb
->data
, len
)) {
1951 if (len
> (dev
->mtu
+ dev
->hard_header_len
+ extra_len
) &&
1952 !packet_extra_vlan_len_allowed(dev
, skb
)) {
1957 sockcm_init(&sockc
, sk
);
1958 if (msg
->msg_controllen
) {
1959 err
= sock_cmsg_send(sk
, msg
, &sockc
);
1964 skb
->protocol
= proto
;
1966 skb
->priority
= sk
->sk_priority
;
1967 skb
->mark
= sk
->sk_mark
;
1968 skb
->tstamp
= sockc
.transmit_time
;
1970 skb_setup_tx_timestamp(skb
, sockc
.tsflags
);
1972 if (unlikely(extra_len
== 4))
1975 packet_parse_headers(skb
, sock
);
1977 dev_queue_xmit(skb
);
1988 static unsigned int run_filter(struct sk_buff
*skb
,
1989 const struct sock
*sk
,
1992 struct sk_filter
*filter
;
1995 filter
= rcu_dereference(sk
->sk_filter
);
1997 res
= bpf_prog_run_clear_cb(filter
->prog
, skb
);
2003 static int packet_rcv_vnet(struct msghdr
*msg
, const struct sk_buff
*skb
,
2006 struct virtio_net_hdr vnet_hdr
;
2008 if (*len
< sizeof(vnet_hdr
))
2010 *len
-= sizeof(vnet_hdr
);
2012 if (virtio_net_hdr_from_skb(skb
, &vnet_hdr
, vio_le(), true, 0))
2015 return memcpy_to_msg(msg
, (void *)&vnet_hdr
, sizeof(vnet_hdr
));
/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
2030 static int packet_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2031 struct packet_type
*pt
, struct net_device
*orig_dev
)
2034 struct sockaddr_ll
*sll
;
2035 struct packet_sock
*po
;
2036 u8
*skb_head
= skb
->data
;
2037 int skb_len
= skb
->len
;
2038 unsigned int snaplen
, res
;
2039 bool is_drop_n_account
= false;
2041 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2044 sk
= pt
->af_packet_priv
;
2047 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2052 if (dev
->header_ops
) {
2053 /* The device has an explicit notion of ll header,
2054 * exported to higher levels.
2056 * Otherwise, the device hides details of its frame
2057 * structure, so that corresponding packet head is
2058 * never delivered to user.
2060 if (sk
->sk_type
!= SOCK_DGRAM
)
2061 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2062 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2063 /* Special case: outgoing packets have ll header at head */
2064 skb_pull(skb
, skb_network_offset(skb
));
2070 res
= run_filter(skb
, sk
, snaplen
);
2072 goto drop_n_restore
;
2076 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
)
2079 if (skb_shared(skb
)) {
2080 struct sk_buff
*nskb
= skb_clone(skb
, GFP_ATOMIC
);
2084 if (skb_head
!= skb
->data
) {
2085 skb
->data
= skb_head
;
2092 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb
)) + MAX_ADDR_LEN
- 8);
2094 sll
= &PACKET_SKB_CB(skb
)->sa
.ll
;
2095 sll
->sll_hatype
= dev
->type
;
2096 sll
->sll_pkttype
= skb
->pkt_type
;
2097 if (unlikely(po
->origdev
))
2098 sll
->sll_ifindex
= orig_dev
->ifindex
;
2100 sll
->sll_ifindex
= dev
->ifindex
;
2102 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2104 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2105 * Use their space for storing the original skb length.
2107 PACKET_SKB_CB(skb
)->sa
.origlen
= skb
->len
;
2109 if (pskb_trim(skb
, snaplen
))
2112 skb_set_owner_r(skb
, sk
);
2116 /* drop conntrack reference */
2119 spin_lock(&sk
->sk_receive_queue
.lock
);
2120 po
->stats
.stats1
.tp_packets
++;
2121 sock_skb_set_dropcount(sk
, skb
);
2122 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
2123 spin_unlock(&sk
->sk_receive_queue
.lock
);
2124 sk
->sk_data_ready(sk
);
2128 is_drop_n_account
= true;
2129 spin_lock(&sk
->sk_receive_queue
.lock
);
2130 po
->stats
.stats1
.tp_drops
++;
2131 atomic_inc(&sk
->sk_drops
);
2132 spin_unlock(&sk
->sk_receive_queue
.lock
);
2135 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2136 skb
->data
= skb_head
;
2140 if (!is_drop_n_account
)
2147 static int tpacket_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2148 struct packet_type
*pt
, struct net_device
*orig_dev
)
2151 struct packet_sock
*po
;
2152 struct sockaddr_ll
*sll
;
2153 union tpacket_uhdr h
;
2154 u8
*skb_head
= skb
->data
;
2155 int skb_len
= skb
->len
;
2156 unsigned int snaplen
, res
;
2157 unsigned long status
= TP_STATUS_USER
;
2158 unsigned short macoff
, netoff
, hdrlen
;
2159 struct sk_buff
*copy_skb
= NULL
;
2162 bool is_drop_n_account
= false;
2163 bool do_vnet
= false;
2165 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2166 * We may add members to them until current aligned size without forcing
2167 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2169 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h2
)) != 32);
2170 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h3
)) != 48);
2172 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2175 sk
= pt
->af_packet_priv
;
2178 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2181 if (dev
->header_ops
) {
2182 if (sk
->sk_type
!= SOCK_DGRAM
)
2183 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2184 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2185 /* Special case: outgoing packets have ll header at head */
2186 skb_pull(skb
, skb_network_offset(skb
));
2192 res
= run_filter(skb
, sk
, snaplen
);
2194 goto drop_n_restore
;
2196 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2197 status
|= TP_STATUS_CSUMNOTREADY
;
2198 else if (skb
->pkt_type
!= PACKET_OUTGOING
&&
2199 (skb
->ip_summed
== CHECKSUM_COMPLETE
||
2200 skb_csum_unnecessary(skb
)))
2201 status
|= TP_STATUS_CSUM_VALID
;
2206 if (sk
->sk_type
== SOCK_DGRAM
) {
2207 macoff
= netoff
= TPACKET_ALIGN(po
->tp_hdrlen
) + 16 +
2210 unsigned int maclen
= skb_network_offset(skb
);
2211 netoff
= TPACKET_ALIGN(po
->tp_hdrlen
+
2212 (maclen
< 16 ? 16 : maclen
)) +
2214 if (po
->has_vnet_hdr
) {
2215 netoff
+= sizeof(struct virtio_net_hdr
);
2218 macoff
= netoff
- maclen
;
2220 if (po
->tp_version
<= TPACKET_V2
) {
2221 if (macoff
+ snaplen
> po
->rx_ring
.frame_size
) {
2222 if (po
->copy_thresh
&&
2223 atomic_read(&sk
->sk_rmem_alloc
) < sk
->sk_rcvbuf
) {
2224 if (skb_shared(skb
)) {
2225 copy_skb
= skb_clone(skb
, GFP_ATOMIC
);
2227 copy_skb
= skb_get(skb
);
2228 skb_head
= skb
->data
;
2231 skb_set_owner_r(copy_skb
, sk
);
2233 snaplen
= po
->rx_ring
.frame_size
- macoff
;
2234 if ((int)snaplen
< 0) {
2239 } else if (unlikely(macoff
+ snaplen
>
2240 GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
)) {
2243 nval
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
- macoff
;
2244 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2245 snaplen
, nval
, macoff
);
2247 if (unlikely((int)snaplen
< 0)) {
2249 macoff
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
;
2253 spin_lock(&sk
->sk_receive_queue
.lock
);
2254 h
.raw
= packet_current_rx_frame(po
, skb
,
2255 TP_STATUS_KERNEL
, (macoff
+snaplen
));
2257 goto drop_n_account
;
2258 if (po
->tp_version
<= TPACKET_V2
) {
2259 packet_increment_rx_head(po
, &po
->rx_ring
);
2261 * LOSING will be reported till you read the stats,
2262 * because it's COR - Clear On Read.
2263 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2266 if (po
->stats
.stats1
.tp_drops
)
2267 status
|= TP_STATUS_LOSING
;
2271 virtio_net_hdr_from_skb(skb
, h
.raw
+ macoff
-
2272 sizeof(struct virtio_net_hdr
),
2274 goto drop_n_account
;
2276 po
->stats
.stats1
.tp_packets
++;
2278 status
|= TP_STATUS_COPY
;
2279 __skb_queue_tail(&sk
->sk_receive_queue
, copy_skb
);
2281 spin_unlock(&sk
->sk_receive_queue
.lock
);
2283 skb_copy_bits(skb
, 0, h
.raw
+ macoff
, snaplen
);
2285 if (!(ts_status
= tpacket_get_timestamp(skb
, &ts
, po
->tp_tstamp
)))
2286 getnstimeofday(&ts
);
2288 status
|= ts_status
;
2290 switch (po
->tp_version
) {
2292 h
.h1
->tp_len
= skb
->len
;
2293 h
.h1
->tp_snaplen
= snaplen
;
2294 h
.h1
->tp_mac
= macoff
;
2295 h
.h1
->tp_net
= netoff
;
2296 h
.h1
->tp_sec
= ts
.tv_sec
;
2297 h
.h1
->tp_usec
= ts
.tv_nsec
/ NSEC_PER_USEC
;
2298 hdrlen
= sizeof(*h
.h1
);
2301 h
.h2
->tp_len
= skb
->len
;
2302 h
.h2
->tp_snaplen
= snaplen
;
2303 h
.h2
->tp_mac
= macoff
;
2304 h
.h2
->tp_net
= netoff
;
2305 h
.h2
->tp_sec
= ts
.tv_sec
;
2306 h
.h2
->tp_nsec
= ts
.tv_nsec
;
2307 if (skb_vlan_tag_present(skb
)) {
2308 h
.h2
->tp_vlan_tci
= skb_vlan_tag_get(skb
);
2309 h
.h2
->tp_vlan_tpid
= ntohs(skb
->vlan_proto
);
2310 status
|= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
2312 h
.h2
->tp_vlan_tci
= 0;
2313 h
.h2
->tp_vlan_tpid
= 0;
2315 memset(h
.h2
->tp_padding
, 0, sizeof(h
.h2
->tp_padding
));
2316 hdrlen
= sizeof(*h
.h2
);
2319 /* tp_nxt_offset,vlan are already populated above.
2320 * So DONT clear those fields here
2322 h
.h3
->tp_status
|= status
;
2323 h
.h3
->tp_len
= skb
->len
;
2324 h
.h3
->tp_snaplen
= snaplen
;
2325 h
.h3
->tp_mac
= macoff
;
2326 h
.h3
->tp_net
= netoff
;
2327 h
.h3
->tp_sec
= ts
.tv_sec
;
2328 h
.h3
->tp_nsec
= ts
.tv_nsec
;
2329 memset(h
.h3
->tp_padding
, 0, sizeof(h
.h3
->tp_padding
));
2330 hdrlen
= sizeof(*h
.h3
);
2336 sll
= h
.raw
+ TPACKET_ALIGN(hdrlen
);
2337 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2338 sll
->sll_family
= AF_PACKET
;
2339 sll
->sll_hatype
= dev
->type
;
2340 sll
->sll_protocol
= skb
->protocol
;
2341 sll
->sll_pkttype
= skb
->pkt_type
;
2342 if (unlikely(po
->origdev
))
2343 sll
->sll_ifindex
= orig_dev
->ifindex
;
2345 sll
->sll_ifindex
= dev
->ifindex
;
2349 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2350 if (po
->tp_version
<= TPACKET_V2
) {
2353 end
= (u8
*) PAGE_ALIGN((unsigned long) h
.raw
+
2356 for (start
= h
.raw
; start
< end
; start
+= PAGE_SIZE
)
2357 flush_dcache_page(pgv_to_page(start
));
2362 if (po
->tp_version
<= TPACKET_V2
) {
2363 __packet_set_status(po
, h
.raw
, status
);
2364 sk
->sk_data_ready(sk
);
2366 prb_clear_blk_fill_status(&po
->rx_ring
);
2370 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2371 skb
->data
= skb_head
;
2375 if (!is_drop_n_account
)
2382 is_drop_n_account
= true;
2383 po
->stats
.stats1
.tp_drops
++;
2384 spin_unlock(&sk
->sk_receive_queue
.lock
);
2386 sk
->sk_data_ready(sk
);
2387 kfree_skb(copy_skb
);
2388 goto drop_n_restore
;
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_zcopy_get_nouarg(skb);
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}

static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
	     __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
		return -EINVAL;

	return 0;
}

static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
				 struct virtio_net_hdr *vnet_hdr)
{
	if (*len < sizeof(*vnet_hdr))
		return -EINVAL;
	*len -= sizeof(*vnet_hdr);

	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
		return -EFAULT;

	return __packet_snd_vnet_parse(vnet_hdr, *len);
}
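
/*
 * Worked example for the check above (illustrative numbers only): for a
 * TCP-over-IPv4 frame with VIRTIO_NET_HDR_F_NEEDS_CSUM set, csum_start is
 * typically 34 (14-byte Ethernet + 20-byte IPv4 header) and csum_offset is
 * 16 (offset of the TCP checksum field), so csum_start + csum_offset + 2
 * = 52. __packet_snd_vnet_parse() then raises hdr_len to at least 52 and
 * rejects the frame if fewer than hdr_len bytes follow the virtio header.
 */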
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, void *data, int tp_len,
		__be16 proto, unsigned char *addr, int hlen, int copylen,
		const struct sockcm_cookie *sockc)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb->tstamp = sockc->transmit_time;
	skb_setup_tx_timestamp(skb, sockc->tsflags);
	skb_zcopy_set_nouarg(skb, ph.raw);

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (copylen) {
		int hdrlen = min_t(int, copylen, tp_len);

		skb_push(skb, dev->hard_header_len);
		skb_put(skb, copylen - dev->hard_header_len);
		err = skb_store_bits(skb, 0, data, hdrlen);
		if (unlikely(err))
			return err;
		if (!dev_validate_header(dev, skb->data, hdrlen))
			return -EINVAL;

		data += hdrlen;
		to_write -= hdrlen;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	refcount_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	packet_parse_headers(skb, sock);

	return tp_len;
}

static int tpacket_parse_header(struct packet_sock *po, void *frame,
				int size_max, void **data)
{
	union tpacket_uhdr ph;
	int tp_len, off;

	ph.raw = frame;

	switch (po->tp_version) {
	case TPACKET_V3:
		if (ph.h3->tp_next_offset != 0) {
			pr_warn_once("variable sized slot not supported");
			return -EINVAL;
		}
		tp_len = ph.h3->tp_len;
		break;
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	if (unlikely(po->tp_tx_has_off)) {
		int off_min, off_max;

		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (po->sk.sk_type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_net;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V3:
				off = ph.h3->tp_mac;
				break;
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
	} else {
		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}

	*data = frame + off;
	return tp_len;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb;
	struct net_device *dev;
	struct virtio_net_hdr *vnet_hdr = NULL;
	struct sockcm_cookie sockc;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
	unsigned char *addr = NULL;
	int tp_len, size_max;
	void *data;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen, copylen = 0;

	mutex_lock(&po->pg_vec_lock);

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
		if (po->sk.sk_socket->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_put;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	sockcm_init(&sockc, &po->sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(&po->sk, msg, &sockc);
		if (unlikely(err))
			goto out_put;
	}

	if (po->sk.sk_socket->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
		size_max = dev->mtu + reserve + VLAN_HLEN;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);
		if (unlikely(ph == NULL)) {
			if (need_wait && need_resched())
				schedule();
			continue;
		}

		skb = NULL;
		tp_len = tpacket_parse_header(po, ph, size_max, &data);
		if (tp_len < 0)
			goto tpacket_error;

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		if (po->has_vnet_hdr) {
			vnet_hdr = data;
			data += sizeof(*vnet_hdr);
			tp_len -= sizeof(*vnet_hdr);
			if (tp_len < 0 ||
			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			copylen = __virtio16_to_cpu(vio_le(),
						    vnet_hdr->hdr_len);
		}
		copylen = max_t(int, copylen, dev->hard_header_len);
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll) +
				(copylen - dev->hard_header_len),
				!need_wait, &err);

		if (unlikely(skb == NULL)) {
			/* we assume the socket was initially writeable ... */
			if (likely(len_sum > 0))
				err = len_sum;
			goto out_status;
		}
		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
					  addr, hlen, copylen, &sockc);
		if (likely(tp_len >= 0) &&
		    tp_len > dev->mtu + reserve &&
		    !po->has_vnet_hdr &&
		    !packet_extra_vlan_len_allowed(dev, skb))
			tp_len = -EMSGSIZE;

		if (unlikely(tp_len < 0)) {
tpacket_error:
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		if (po->has_vnet_hdr) {
			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
				tp_len = -EINVAL;
				goto tpacket_error;
			}
			virtio_net_hdr_set_proto(skb, vnet_hdr);
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		packet_inc_pending(&po->tx_ring);

		status = TP_STATUS_SEND_REQUEST;
		err = po->xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition, and luckily don't have to go that path
		 * anyway.
		 */
		 (need_wait && packet_read_pending(&po->tx_ring))));

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
				        size_t reserve, size_t len,
				        size_t linear, int noblock,
				        int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}

static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr = NULL;
	int err, reserve = 0;
	struct sockcm_cookie sockc;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	struct packet_sock *po = pkt_sk(sk);
	bool has_vnet_hdr = false;
	int hlen, tlen, linear;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
		if (sock->type == SOCK_DGRAM) {
			if (dev && msg->msg_namelen < dev->addr_len +
				   offsetof(struct sockaddr_ll, sll_addr))
				goto out_unlock;
			addr = saddr->sll_addr;
		}
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	sockcm_init(&sockc, sk);
	sockc.mark = sk->sk_mark;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			goto out_unlock;
	}

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (po->has_vnet_hdr) {
		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
		if (err)
			goto out_unlock;
		has_vnet_hdr = true;
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!vnet_hdr.gso_type &&
	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
	linear = max(linear, min_t(int, len, dev->hard_header_len));
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reset_network_header(skb);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM) {
		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (unlikely(offset < 0))
			goto out_free;
	} else if (reserve) {
		skb_reserve(skb, -reserve);
		if (len < reserve + sizeof(struct ipv6hdr) &&
		    dev->min_header_len != dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
	if (err)
		goto out_free;

	if (sock->type == SOCK_RAW &&
	    !dev_validate_header(dev, skb->data, len)) {
		err = -EINVAL;
		goto out_free;
	}

	skb_setup_tx_timestamp(skb, sockc.tsflags);

	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
	    !packet_extra_vlan_len_allowed(dev, skb)) {
		err = -EMSGSIZE;
		goto out_free;
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sockc.mark;
	skb->tstamp = sockc.transmit_time;

	if (has_vnet_hdr) {
		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
		if (err)
			goto out_free;
		len += sizeof(vnet_hdr);
		virtio_net_hdr_set_proto(skb, &vnet_hdr);
	}

	packet_parse_headers(skb, sock);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	err = po->xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}
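
/*
 * For reference only: a minimal user-space sketch (not part of this file)
 * of the non-ring transmit path chosen above when no PACKET_TX_RING is
 * configured. "ifindex", "dest_mac", "frame" and "frame_len" are
 * hypothetical values.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family  = AF_PACKET,
 *		.sll_ifindex = ifindex,
 *		.sll_halen   = ETH_ALEN,
 *	};
 *	memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */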
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct packet_fanout *f;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	lock_sock(sk);
	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}
	release_sock(sk);

	f = fanout_release(sk);

	synchronize_net();

	kfree(po->rollover);
	if (f) {
		fanout_release_data(f);
		kfree(f);
	}
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
			  __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	struct net_device *dev_curr;
	__be16 proto_curr;
	bool need_rehook;
	struct net_device *dev = NULL;
	int ret = 0;
	bool unlisted = false;

	lock_sock(sk);
	spin_lock(&po->bind_lock);
	rcu_read_lock();

	if (po->fanout) {
		ret = -EINVAL;
		goto out_unlock;
	}

	if (name) {
		dev = dev_get_by_name_rcu(sock_net(sk), name);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	} else if (ifindex) {
		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
		if (!dev) {
			ret = -ENODEV;
			goto out_unlock;
		}
	}

	if (dev)
		dev_hold(dev);

	proto_curr = po->prot_hook.type;
	dev_curr = po->prot_hook.dev;

	need_rehook = proto_curr != proto || dev_curr != dev;

	if (need_rehook) {
		if (po->running) {
			rcu_read_unlock();
			/* prevents packet_notifier() from calling
			 * register_prot_hook()
			 */
			po->num = 0;
			__unregister_prot_hook(sk, true);
			rcu_read_lock();
			dev_curr = po->prot_hook.dev;
			if (dev)
				unlisted = !dev_get_by_index_rcu(sock_net(sk),
								 dev->ifindex);
		}

		BUG_ON(po->running);
		po->num = proto;
		po->prot_hook.type = proto;

		if (unlikely(unlisted)) {
			dev_put(dev);
			po->prot_hook.dev = NULL;
			po->ifindex = -1;
			packet_cached_dev_reset(po);
		} else {
			po->prot_hook.dev = dev;
			po->ifindex = dev ? dev->ifindex : 0;
			packet_cached_dev_assign(po, dev);
		}
	}
	if (dev_curr)
		dev_put(dev_curr);

	if (proto == 0 || !need_rehook)
		goto out_unlock;

	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	rcu_read_unlock();
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return ret;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[sizeof(uaddr->sa_data) + 1];

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
	 * zero-terminated.
	 */
	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
	name[sizeof(uaddr->sa_data)] = 0;

	return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	return packet_do_bind(sk, NULL, sll->sll_ifindex,
			      sll->sll_protocol ? : pkt_sk(sk)->num);
}
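
/*
 * For reference only: a hedged user-space sketch (not part of this file)
 * of binding a packet socket to one interface, which ends up in
 * packet_do_bind() above. "fd" is the packet socket and "ifindex" is a
 * hypothetical value obtained e.g. via if_nametoindex().
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = ifindex,
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */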
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	po->xmit = dev_queue_xmit;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->rollover = NULL;
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		__register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_tail_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = 0;
	unsigned int origlen = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->pressure)
		packet_rcv_has_room(pkt_sk(sk), NULL);

	if (pkt_sk(sk)->has_vnet_hdr) {
		err = packet_rcv_vnet(msg, skb, &len);
		if (err)
			goto out_free;
		vnet_hdr_len = sizeof(struct virtio_net_hdr);
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free;

	if (sock->type != SOCK_PACKET) {
		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

		/* Original length was stored in sockaddr_ll fields */
		origlen = PACKET_SKB_CB(skb)->sa.origlen;
		sll->sll_family = AF_PACKET;
		sll->sll_protocol = skb->protocol;
	}

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		int copy_len;

		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
			copy_len = msg->msg_namelen;
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
			copy_len = msg->msg_namelen;
			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
				memset(msg->msg_name +
				       offsetof(struct sockaddr_ll, sll_addr),
				       0, sizeof(sll->sll_addr));
				msg->msg_namelen = sizeof(struct sockaddr_ll);
			}
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
	}

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		else if (skb->pkt_type != PACKET_OUTGOING &&
			 (skb->ip_summed == CHECKSUM_COMPLETE ||
			  skb_csum_unnecessary(skb)))
			aux.tp_status |= TP_STATUS_CSUM_VALID;

		aux.tp_len = origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (skb_vlan_tag_present(skb)) {
			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			aux.tp_vlan_tci = 0;
			aux.tp_vlan_tpid = 0;
		}
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
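
/*
 * For reference only: a user-space sketch (not part of this file) of
 * reading the PACKET_AUXDATA control message that packet_recvmsg() emits
 * when the option is enabled. Buffer sizes and "fd"/"iov" are illustrative.
 *
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg;
 *
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			(aux->tp_vlan_tci is valid if TP_STATUS_VLAN_VALID is set)
 *		}
 */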
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();

	return sizeof(*uaddr);
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();

	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist_delete(struct net_device *dev,
				     struct packet_mclist **mlp)
{
	struct packet_mclist *ml;

	while ((ml = *mlp) != NULL) {
		if (ml->ifindex == dev->ifindex) {
			packet_dev_mc(dev, ml, -1);
			*mlp = ml->next;
			kfree(ml);
		} else
			mlp = &ml->next;
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;

				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			break;
		}
	}
	rtnl_unlock();
	return 0;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;

		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		lock_sock(sk);
		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len) {
			ret = -EINVAL;
		} else {
			if (copy_from_user(&req_u.req, optval, len))
				ret = -EFAULT;
			else
				ret = packet_set_ring(sk, &req_u, 0,
						      optname == PACKET_TX_RING);
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			break;
		default:
			return -EINVAL;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_version = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val > INT_MAX)
			return -EINVAL;
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_reserve = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_loss = !!val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		po->auxdata = !!val;
		release_sock(sk);
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		po->origdev = !!val;
		release_sock(sk);
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->has_vnet_hdr = !!val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_FANOUT_DATA:
	{
		if (!po->fanout)
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_IGNORE_OUTGOING:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val < 0 || val > 1)
			return -EINVAL;

		po->prot_hook.ignore_outgoing = !!val;
		return 0;
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_tx_has_off = !!val;
			ret = 0;
		}
		release_sock(sk);
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (len < sizeof(int))
			return -EINVAL;
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_IGNORE_OUTGOING:
		val = po->prot_hook.ignore_outgoing;
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
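
/*
 * For reference only: a hedged user-space sketch (not part of this file)
 * of the option sequence the handlers above implement: pick a header
 * version first, then size the RX ring. The block/frame numbers are
 * illustrative, not recommendations.
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */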
#ifdef CONFIG_COMPAT
static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
				    char __user *optval, unsigned int optlen)
{
	struct packet_sock *po = pkt_sk(sock->sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (optname == PACKET_FANOUT_DATA &&
	    po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
		optval = (char __user *)get_compat_bpf_fprog(optval);
		if (!optval)
			return -EFAULT;
		optlen = sizeof(struct sock_fprog);
	}

	return packet_setsockopt(sock, level, optname, optval, optlen);
}
#endif

static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}

static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
static __poll_t packet_poll(struct file *file, struct socket *sock,
			    poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	__poll_t mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= EPOLLIN | EPOLLRDNORM;
	}
	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		po->pressure = 0;
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
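
/*
 * For reference only: a user-space sketch (not part of this file) of how
 * the EPOLLIN bit computed above is typically consumed by a memory-mapped
 * reader; "fd" is the packet socket.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		(walk the RX ring until a frame without TP_STATUS_USER
 *		 is found, then poll() again)
 *	}
 */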
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		unsigned int min_frame_size;

		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		min_frame_size = po->tp_hdrlen + po->tp_reserve;
		if (po->tp_version >= TPACKET_V3 &&
		    req->tp_block_size <
		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
			goto out;
		if (unlikely(req->tp_frame_size < min_frame_size))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Block transmit is not supported yet */
			if (!tx_ring) {
				init_prb_bdqc(po, rb, pg_vec, req_u);
			} else {
				struct tpacket_req3 *req3 = &req_u->req3;

				if (req3->tp_retire_blk_tov ||
				    req3->tp_sizeof_priv ||
				    req3->tp_feature_req_word) {
					err = -EINVAL;
					goto out_free_pg_vec;
				}
			}
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (pg_vec && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

out_free_pg_vec:
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
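
/*
 * For reference only: a hedged user-space sketch (not part of this file)
 * matching the size check above: the single mmap() must cover the RX and
 * TX rings back to back, i.e. the sum of tp_block_size * tp_block_nr for
 * each configured ring. "rx_req"/"tx_req" are the structs passed to
 * PACKET_RX_RING/PACKET_TX_RING.
 *
 *	size_t len = rx_req.tp_block_size * rx_req.tp_block_nr +
 *		     tx_req.tp_block_size * tx_req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */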
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_packet_setsockopt,
#endif
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);