/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#include <net/inet_common.h>
#include <linux/bpf.h>
#include <net/compat.h>
/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel); others are silly.
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

   On receive:

   Incoming, dev->hard_header != NULL
      mac_header -> ll header

   Outgoing, dev->hard_header != NULL
      mac_header -> ll header

   Incoming, dev->hard_header == NULL
      mac_header -> UNKNOWN position. It very likely points to the ll header.
                    PPP does this, which is wrong, because it introduces
                    asymmetry between the rx and tx paths.

   Outgoing, dev->hard_header == NULL
      mac_header -> data. The ll header is still not built!

   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.

   On transmit:

   dev->hard_header != NULL
      mac_header -> ll header

   dev->hard_header == NULL (ll header is added by the device, we cannot
   control it)

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
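/*
 * Illustrative user-space sketch (not part of this file, helper name made up
 * for illustration): it shows the SOCK_RAW vs SOCK_DGRAM distinction noted
 * above -- SOCK_RAW delivers frames with the link-layer header pushed back,
 * SOCK_DGRAM delivers only the payload plus a sockaddr_ll.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int open_packet_sock(int type)
 *	{
 *		// type is SOCK_RAW (frames include the ll header) or
 *		// SOCK_DGRAM (ll header is stripped by the kernel).
 *		return socket(AF_PACKET, type, htons(ETH_P_ALL));
 *	}
 */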
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
				  struct net_device *sb_dev)
{
	return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
}

static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb, NULL);
	}

	return queue_index;
}
/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
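/*
 * Illustrative user-space sketch (not part of this file, helper name made up
 * for illustration): selecting the timestamp source that
 * tpacket_get_timestamp() consults. po->tp_tstamp is configured through the
 * PACKET_TIMESTAMP socket option using SOF_TIMESTAMPING_* flags.
 *
 *	#include <linux/net_tstamp.h>
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	static int request_hw_rx_timestamps(int fd)
 *	{
 *		int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *
 *		// Without this, frames fall back to software timestamps
 *		// (TP_STATUS_TS_SOFTWARE above).
 *		return setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP,
 *				  &req, sizeof(req));
 *	}
 */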
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	} else
		return DEFAULT_PRB_RETIRE_TOV;

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo + 1;
	return tmo;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);

	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/* Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
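/*
 * Illustrative user-space sketch (an assumed setup, not part of this file;
 * the helper name is made up): a TPACKET_V3 receive ring where
 * tp_retire_blk_tov supplies the block-retire timeout discussed above.
 * Leaving it at 0 lets the kernel derive it via prb_calc_retire_blk_tmo().
 *
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int setup_v3_rx_ring(int fd)
 *	{
 *		int ver = TPACKET_V3;
 *		struct tpacket_req3 req;
 *
 *		memset(&req, 0, sizeof(req));
 *		req.tp_block_size = 1 << 20;	// 1MB blocks, as in the note above
 *		req.tp_block_nr = 8;
 *		req.tp_frame_size = 2048;
 *		req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) *
 *				  req.tp_block_nr;
 *		req.tp_retire_blk_tov = 10;	// msecs; 0 means kernel-derived
 *
 *		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *			       &ver, sizeof(ver)) < 0)
 *			return -1;
 *		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *				  &req, sizeof(req));
 *	}
 */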
643 static void prb_retire_rx_blk_timer_expired(struct timer_list
*t
)
645 struct packet_sock
*po
=
646 from_timer(po
, t
, rx_ring
.prb_bdqc
.retire_blk_timer
);
647 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
649 struct tpacket_block_desc
*pbd
;
651 spin_lock(&po
->sk
.sk_receive_queue
.lock
);
653 frozen
= prb_queue_frozen(pkc
);
654 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
656 if (unlikely(pkc
->delete_blk_timer
))
659 /* We only need to plug the race when the block is partially filled.
661 * lock(); increment BLOCK_NUM_PKTS; unlock()
662 * copy_bits() is in progress ...
663 * timer fires on other cpu:
664 * we can't retire the current block because copy_bits
668 if (BLOCK_NUM_PKTS(pbd
)) {
669 while (atomic_read(&pkc
->blk_fill_in_prog
)) {
670 /* Waiting for skb_copy_bits to finish... */
675 if (pkc
->last_kactive_blk_num
== pkc
->kactive_blk_num
) {
677 if (!BLOCK_NUM_PKTS(pbd
)) {
678 /* An empty block. Just refresh the timer. */
681 prb_retire_current_block(pkc
, po
, TP_STATUS_BLK_TMO
);
682 if (!prb_dispatch_next_block(pkc
, po
))
687 /* Case 1. Queue was frozen because user-space was
690 if (prb_curr_blk_in_use(pbd
)) {
692 * Ok, user-space is still behind.
693 * So just refresh the timer.
697 /* Case 2. queue was frozen,user-space caught up,
698 * now the link went idle && the timer fired.
699 * We don't have a block to close.So we open this
700 * block and restart the timer.
701 * opening a block thaws the queue,restarts timer
702 * Thawing/timer-refresh is a side effect.
704 prb_open_block(pkc
, pbd
);
711 _prb_refresh_rx_retire_blk_timer(pkc
);
714 spin_unlock(&po
->sk
.sk_receive_queue
.lock
);
717 static void prb_flush_block(struct tpacket_kbdq_core
*pkc1
,
718 struct tpacket_block_desc
*pbd1
, __u32 status
)
720 /* Flush everything minus the block header */
722 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
727 /* Skip the block header(we know header WILL fit in 4K) */
730 end
= (u8
*)PAGE_ALIGN((unsigned long)pkc1
->pkblk_end
);
731 for (; start
< end
; start
+= PAGE_SIZE
)
732 flush_dcache_page(pgv_to_page(start
));
737 /* Now update the block status. */
739 BLOCK_STATUS(pbd1
) = status
;
741 /* Flush the block header */
743 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
745 flush_dcache_page(pgv_to_page(start
));
755 * 2) Increment active_blk_num
757 * Note:We DONT refresh the timer on purpose.
758 * Because almost always the next block will be opened.
760 static void prb_close_block(struct tpacket_kbdq_core
*pkc1
,
761 struct tpacket_block_desc
*pbd1
,
762 struct packet_sock
*po
, unsigned int stat
)
764 __u32 status
= TP_STATUS_USER
| stat
;
766 struct tpacket3_hdr
*last_pkt
;
767 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
768 struct sock
*sk
= &po
->sk
;
770 if (po
->stats
.stats3
.tp_drops
)
771 status
|= TP_STATUS_LOSING
;
773 last_pkt
= (struct tpacket3_hdr
*)pkc1
->prev
;
774 last_pkt
->tp_next_offset
= 0;
776 /* Get the ts of the last pkt */
777 if (BLOCK_NUM_PKTS(pbd1
)) {
778 h1
->ts_last_pkt
.ts_sec
= last_pkt
->tp_sec
;
779 h1
->ts_last_pkt
.ts_nsec
= last_pkt
->tp_nsec
;
781 /* Ok, we tmo'd - so get the current time.
783 * It shouldn't really happen as we don't close empty
784 * blocks. See prb_retire_rx_blk_timer_expired().
788 h1
->ts_last_pkt
.ts_sec
= ts
.tv_sec
;
789 h1
->ts_last_pkt
.ts_nsec
= ts
.tv_nsec
;
794 /* Flush the block */
795 prb_flush_block(pkc1
, pbd1
, status
);
797 sk
->sk_data_ready(sk
);
799 pkc1
->kactive_blk_num
= GET_NEXT_PRB_BLK_NUM(pkc1
);
802 static void prb_thaw_queue(struct tpacket_kbdq_core
*pkc
)
804 pkc
->reset_pending_on_curr_blk
= 0;
808 * Side effect of opening a block:
810 * 1) prb_queue is thawed.
811 * 2) retire_blk_timer is refreshed.
814 static void prb_open_block(struct tpacket_kbdq_core
*pkc1
,
815 struct tpacket_block_desc
*pbd1
)
818 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
822 /* We could have just memset this but we will lose the
823 * flexibility of making the priv area sticky
826 BLOCK_SNUM(pbd1
) = pkc1
->knxt_seq_num
++;
827 BLOCK_NUM_PKTS(pbd1
) = 0;
828 BLOCK_LEN(pbd1
) = BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
832 h1
->ts_first_pkt
.ts_sec
= ts
.tv_sec
;
833 h1
->ts_first_pkt
.ts_nsec
= ts
.tv_nsec
;
835 pkc1
->pkblk_start
= (char *)pbd1
;
836 pkc1
->nxt_offset
= pkc1
->pkblk_start
+ BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
838 BLOCK_O2FP(pbd1
) = (__u32
)BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
839 BLOCK_O2PRIV(pbd1
) = BLK_HDR_LEN
;
841 pbd1
->version
= pkc1
->version
;
842 pkc1
->prev
= pkc1
->nxt_offset
;
843 pkc1
->pkblk_end
= pkc1
->pkblk_start
+ pkc1
->kblk_size
;
845 prb_thaw_queue(pkc1
);
846 _prb_refresh_rx_retire_blk_timer(pkc1
);
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 * 5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
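/*
 * Illustrative user-space sketch (an assumed consumer loop, not part of this
 * file; the helper name is made up): the freeze described above resolves
 * once user-space hands a block back by writing TP_STATUS_KERNEL into
 * block_status, which is what "block-0 is free and can now be re-used"
 * refers to.
 *
 *	#include <linux/if_packet.h>
 *
 *	static void consume_block(struct tpacket_block_desc *pbd)
 *	{
 *		// ... walk pbd->hdr.bh1.num_pkts packets via tp_next_offset ...
 *
 *		// Hand the block back to the kernel; this is what thaws a
 *		// frozen queue on the next __packet_lookup_frame_in_block().
 *		__atomic_store_n(&pbd->hdr.bh1.block_status, TP_STATUS_KERNEL,
 *				 __ATOMIC_RELEASE);
 *	}
 */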
874 static void prb_freeze_queue(struct tpacket_kbdq_core
*pkc
,
875 struct packet_sock
*po
)
877 pkc
->reset_pending_on_curr_blk
= 1;
878 po
->stats
.stats3
.tp_freeze_q_cnt
++;
881 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
884 * If the next block is free then we will dispatch it
885 * and return a good offset.
886 * Else, we will freeze the queue.
887 * So, caller must check the return value.
889 static void *prb_dispatch_next_block(struct tpacket_kbdq_core
*pkc
,
890 struct packet_sock
*po
)
892 struct tpacket_block_desc
*pbd
;
896 /* 1. Get current block num */
897 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
899 /* 2. If this block is currently in_use then freeze the queue */
900 if (TP_STATUS_USER
& BLOCK_STATUS(pbd
)) {
901 prb_freeze_queue(pkc
, po
);
907 * open this block and return the offset where the first packet
908 * needs to get stored.
910 prb_open_block(pkc
, pbd
);
911 return (void *)pkc
->nxt_offset
;
914 static void prb_retire_current_block(struct tpacket_kbdq_core
*pkc
,
915 struct packet_sock
*po
, unsigned int status
)
917 struct tpacket_block_desc
*pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
919 /* retire/close the current block */
920 if (likely(TP_STATUS_KERNEL
== BLOCK_STATUS(pbd
))) {
922 * Plug the case where copy_bits() is in progress on
923 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
924 * have space to copy the pkt in the current block and
925 * called prb_retire_current_block()
927 * We don't need to worry about the TMO case because
928 * the timer-handler already handled this case.
930 if (!(status
& TP_STATUS_BLK_TMO
)) {
931 while (atomic_read(&pkc
->blk_fill_in_prog
)) {
932 /* Waiting for skb_copy_bits to finish... */
936 prb_close_block(pkc
, pbd
, po
, status
);
941 static int prb_curr_blk_in_use(struct tpacket_block_desc
*pbd
)
943 return TP_STATUS_USER
& BLOCK_STATUS(pbd
);
946 static int prb_queue_frozen(struct tpacket_kbdq_core
*pkc
)
948 return pkc
->reset_pending_on_curr_blk
;
951 static void prb_clear_blk_fill_status(struct packet_ring_buffer
*rb
)
953 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
954 atomic_dec(&pkc
->blk_fill_in_prog
);
957 static void prb_fill_rxhash(struct tpacket_kbdq_core
*pkc
,
958 struct tpacket3_hdr
*ppd
)
960 ppd
->hv1
.tp_rxhash
= skb_get_hash(pkc
->skb
);
963 static void prb_clear_rxhash(struct tpacket_kbdq_core
*pkc
,
964 struct tpacket3_hdr
*ppd
)
966 ppd
->hv1
.tp_rxhash
= 0;
969 static void prb_fill_vlan_info(struct tpacket_kbdq_core
*pkc
,
970 struct tpacket3_hdr
*ppd
)
972 if (skb_vlan_tag_present(pkc
->skb
)) {
973 ppd
->hv1
.tp_vlan_tci
= skb_vlan_tag_get(pkc
->skb
);
974 ppd
->hv1
.tp_vlan_tpid
= ntohs(pkc
->skb
->vlan_proto
);
975 ppd
->tp_status
= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
977 ppd
->hv1
.tp_vlan_tci
= 0;
978 ppd
->hv1
.tp_vlan_tpid
= 0;
979 ppd
->tp_status
= TP_STATUS_AVAILABLE
;
983 static void prb_run_all_ft_ops(struct tpacket_kbdq_core
*pkc
,
984 struct tpacket3_hdr
*ppd
)
986 ppd
->hv1
.tp_padding
= 0;
987 prb_fill_vlan_info(pkc
, ppd
);
989 if (pkc
->feature_req_word
& TP_FT_REQ_FILL_RXHASH
)
990 prb_fill_rxhash(pkc
, ppd
);
992 prb_clear_rxhash(pkc
, ppd
);
995 static void prb_fill_curr_block(char *curr
,
996 struct tpacket_kbdq_core
*pkc
,
997 struct tpacket_block_desc
*pbd
,
1000 struct tpacket3_hdr
*ppd
;
1002 ppd
= (struct tpacket3_hdr
*)curr
;
1003 ppd
->tp_next_offset
= TOTAL_PKT_LEN_INCL_ALIGN(len
);
1005 pkc
->nxt_offset
+= TOTAL_PKT_LEN_INCL_ALIGN(len
);
1006 BLOCK_LEN(pbd
) += TOTAL_PKT_LEN_INCL_ALIGN(len
);
1007 BLOCK_NUM_PKTS(pbd
) += 1;
1008 atomic_inc(&pkc
->blk_fill_in_prog
);
1009 prb_run_all_ft_ops(pkc
, ppd
);
1012 /* Assumes caller has the sk->rx_queue.lock */
1013 static void *__packet_lookup_frame_in_block(struct packet_sock
*po
,
1014 struct sk_buff
*skb
,
1019 struct tpacket_kbdq_core
*pkc
;
1020 struct tpacket_block_desc
*pbd
;
1023 pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
1024 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1026 /* Queue is frozen when user space is lagging behind */
1027 if (prb_queue_frozen(pkc
)) {
1029 * Check if that last block which caused the queue to freeze,
1030 * is still in_use by user-space.
1032 if (prb_curr_blk_in_use(pbd
)) {
1033 /* Can't record this packet */
1037 * Ok, the block was released by user-space.
1038 * Now let's open that block.
1039 * opening a block also thaws the queue.
1040 * Thawing is a side effect.
1042 prb_open_block(pkc
, pbd
);
1047 curr
= pkc
->nxt_offset
;
1049 end
= (char *)pbd
+ pkc
->kblk_size
;
1051 /* first try the current block */
1052 if (curr
+TOTAL_PKT_LEN_INCL_ALIGN(len
) < end
) {
1053 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1054 return (void *)curr
;
1057 /* Ok, close the current block */
1058 prb_retire_current_block(pkc
, po
, 0);
1060 /* Now, try to dispatch the next block */
1061 curr
= (char *)prb_dispatch_next_block(pkc
, po
);
1063 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1064 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1065 return (void *)curr
;
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
1075 static void *packet_current_rx_frame(struct packet_sock
*po
,
1076 struct sk_buff
*skb
,
1077 int status
, unsigned int len
)
1080 switch (po
->tp_version
) {
1083 curr
= packet_lookup_frame(po
, &po
->rx_ring
,
1084 po
->rx_ring
.head
, status
);
1087 return __packet_lookup_frame_in_block(po
, skb
, status
, len
);
1089 WARN(1, "TPACKET version not supported\n");
1095 static void *prb_lookup_block(struct packet_sock
*po
,
1096 struct packet_ring_buffer
*rb
,
1100 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
1101 struct tpacket_block_desc
*pbd
= GET_PBLOCK_DESC(pkc
, idx
);
1103 if (status
!= BLOCK_STATUS(pbd
))
1108 static int prb_previous_blk_num(struct packet_ring_buffer
*rb
)
1111 if (rb
->prb_bdqc
.kactive_blk_num
)
1112 prev
= rb
->prb_bdqc
.kactive_blk_num
-1;
1114 prev
= rb
->prb_bdqc
.knum_blocks
-1;
1118 /* Assumes caller has held the rx_queue.lock */
1119 static void *__prb_previous_block(struct packet_sock
*po
,
1120 struct packet_ring_buffer
*rb
,
1123 unsigned int previous
= prb_previous_blk_num(rb
);
1124 return prb_lookup_block(po
, rb
, previous
, status
);
1127 static void *packet_previous_rx_frame(struct packet_sock
*po
,
1128 struct packet_ring_buffer
*rb
,
1131 if (po
->tp_version
<= TPACKET_V2
)
1132 return packet_previous_frame(po
, rb
, status
);
1134 return __prb_previous_block(po
, rb
, status
);
1137 static void packet_increment_rx_head(struct packet_sock
*po
,
1138 struct packet_ring_buffer
*rb
)
1140 switch (po
->tp_version
) {
1143 return packet_increment_head(rb
);
1146 WARN(1, "TPACKET version not supported.\n");
1152 static void *packet_previous_frame(struct packet_sock
*po
,
1153 struct packet_ring_buffer
*rb
,
1156 unsigned int previous
= rb
->head
? rb
->head
- 1 : rb
->frame_max
;
1157 return packet_lookup_frame(po
, rb
, previous
, status
);
1160 static void packet_increment_head(struct packet_ring_buffer
*buff
)
1162 buff
->head
= buff
->head
!= buff
->frame_max
? buff
->head
+1 : 0;
1165 static void packet_inc_pending(struct packet_ring_buffer
*rb
)
1167 this_cpu_inc(*rb
->pending_refcnt
);
1170 static void packet_dec_pending(struct packet_ring_buffer
*rb
)
1172 this_cpu_dec(*rb
->pending_refcnt
);
1175 static unsigned int packet_read_pending(const struct packet_ring_buffer
*rb
)
1177 unsigned int refcnt
= 0;
1180 /* We don't use pending refcount in rx_ring. */
1181 if (rb
->pending_refcnt
== NULL
)
1184 for_each_possible_cpu(cpu
)
1185 refcnt
+= *per_cpu_ptr(rb
->pending_refcnt
, cpu
);
1190 static int packet_alloc_pending(struct packet_sock
*po
)
1192 po
->rx_ring
.pending_refcnt
= NULL
;
1194 po
->tx_ring
.pending_refcnt
= alloc_percpu(unsigned int);
1195 if (unlikely(po
->tx_ring
.pending_refcnt
== NULL
))
1201 static void packet_free_pending(struct packet_sock
*po
)
1203 free_percpu(po
->tx_ring
.pending_refcnt
);
1206 #define ROOM_POW_OFF 2
1207 #define ROOM_NONE 0x0
1208 #define ROOM_LOW 0x1
1209 #define ROOM_NORMAL 0x2
1211 static bool __tpacket_has_room(struct packet_sock
*po
, int pow_off
)
1215 len
= po
->rx_ring
.frame_max
+ 1;
1216 idx
= po
->rx_ring
.head
;
1218 idx
+= len
>> pow_off
;
1221 return packet_lookup_frame(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1224 static bool __tpacket_v3_has_room(struct packet_sock
*po
, int pow_off
)
1228 len
= po
->rx_ring
.prb_bdqc
.knum_blocks
;
1229 idx
= po
->rx_ring
.prb_bdqc
.kactive_blk_num
;
1231 idx
+= len
>> pow_off
;
1234 return prb_lookup_block(po
, &po
->rx_ring
, idx
, TP_STATUS_KERNEL
);
1237 static int __packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1239 struct sock
*sk
= &po
->sk
;
1240 int ret
= ROOM_NONE
;
1242 if (po
->prot_hook
.func
!= tpacket_rcv
) {
1243 int avail
= sk
->sk_rcvbuf
- atomic_read(&sk
->sk_rmem_alloc
)
1244 - (skb
? skb
->truesize
: 0);
1245 if (avail
> (sk
->sk_rcvbuf
>> ROOM_POW_OFF
))
1253 if (po
->tp_version
== TPACKET_V3
) {
1254 if (__tpacket_v3_has_room(po
, ROOM_POW_OFF
))
1256 else if (__tpacket_v3_has_room(po
, 0))
1259 if (__tpacket_has_room(po
, ROOM_POW_OFF
))
1261 else if (__tpacket_has_room(po
, 0))
1268 static int packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1273 spin_lock_bh(&po
->sk
.sk_receive_queue
.lock
);
1274 ret
= __packet_rcv_has_room(po
, skb
);
1275 has_room
= ret
== ROOM_NORMAL
;
1276 if (po
->pressure
== has_room
)
1277 po
->pressure
= !has_room
;
1278 spin_unlock_bh(&po
->sk
.sk_receive_queue
.lock
);
1283 static void packet_sock_destruct(struct sock
*sk
)
1285 skb_queue_purge(&sk
->sk_error_queue
);
1287 WARN_ON(atomic_read(&sk
->sk_rmem_alloc
));
1288 WARN_ON(refcount_read(&sk
->sk_wmem_alloc
));
1290 if (!sock_flag(sk
, SOCK_DEAD
)) {
1291 pr_err("Attempt to release alive packet socket: %p\n", sk
);
1295 sk_refcnt_debug_dec(sk
);
1298 static bool fanout_flow_is_huge(struct packet_sock
*po
, struct sk_buff
*skb
)
1300 u32
*history
= po
->rollover
->history
;
1304 rxhash
= skb_get_hash(skb
);
1305 for (i
= 0; i
< ROLLOVER_HLEN
; i
++)
1306 if (READ_ONCE(history
[i
]) == rxhash
)
1309 victim
= prandom_u32() % ROLLOVER_HLEN
;
1311 /* Avoid dirtying the cache line if possible */
1312 if (READ_ONCE(history
[victim
]) != rxhash
)
1313 WRITE_ONCE(history
[victim
], rxhash
);
1315 return count
> (ROLLOVER_HLEN
>> 1);
1318 static unsigned int fanout_demux_hash(struct packet_fanout
*f
,
1319 struct sk_buff
*skb
,
1322 return reciprocal_scale(__skb_get_hash_symmetric(skb
), num
);
1325 static unsigned int fanout_demux_lb(struct packet_fanout
*f
,
1326 struct sk_buff
*skb
,
1329 unsigned int val
= atomic_inc_return(&f
->rr_cur
);
1334 static unsigned int fanout_demux_cpu(struct packet_fanout
*f
,
1335 struct sk_buff
*skb
,
1338 return smp_processor_id() % num
;
1341 static unsigned int fanout_demux_rnd(struct packet_fanout
*f
,
1342 struct sk_buff
*skb
,
1345 return prandom_u32_max(num
);
1348 static unsigned int fanout_demux_rollover(struct packet_fanout
*f
,
1349 struct sk_buff
*skb
,
1350 unsigned int idx
, bool try_self
,
1353 struct packet_sock
*po
, *po_next
, *po_skip
= NULL
;
1354 unsigned int i
, j
, room
= ROOM_NONE
;
1356 po
= pkt_sk(f
->arr
[idx
]);
1359 room
= packet_rcv_has_room(po
, skb
);
1360 if (room
== ROOM_NORMAL
||
1361 (room
== ROOM_LOW
&& !fanout_flow_is_huge(po
, skb
)))
1366 i
= j
= min_t(int, po
->rollover
->sock
, num
- 1);
1368 po_next
= pkt_sk(f
->arr
[i
]);
1369 if (po_next
!= po_skip
&& !po_next
->pressure
&&
1370 packet_rcv_has_room(po_next
, skb
) == ROOM_NORMAL
) {
1372 po
->rollover
->sock
= i
;
1373 atomic_long_inc(&po
->rollover
->num
);
1374 if (room
== ROOM_LOW
)
1375 atomic_long_inc(&po
->rollover
->num_huge
);
1383 atomic_long_inc(&po
->rollover
->num_failed
);
1387 static unsigned int fanout_demux_qm(struct packet_fanout
*f
,
1388 struct sk_buff
*skb
,
1391 return skb_get_queue_mapping(skb
) % num
;
1394 static unsigned int fanout_demux_bpf(struct packet_fanout
*f
,
1395 struct sk_buff
*skb
,
1398 struct bpf_prog
*prog
;
1399 unsigned int ret
= 0;
1402 prog
= rcu_dereference(f
->bpf_prog
);
1404 ret
= bpf_prog_run_clear_cb(prog
, skb
) % num
;
1410 static bool fanout_has_flag(struct packet_fanout
*f
, u16 flag
)
1412 return f
->flags
& (flag
>> 8);
1415 static int packet_rcv_fanout(struct sk_buff
*skb
, struct net_device
*dev
,
1416 struct packet_type
*pt
, struct net_device
*orig_dev
)
1418 struct packet_fanout
*f
= pt
->af_packet_priv
;
1419 unsigned int num
= READ_ONCE(f
->num_members
);
1420 struct net
*net
= read_pnet(&f
->net
);
1421 struct packet_sock
*po
;
1424 if (!net_eq(dev_net(dev
), net
) || !num
) {
1429 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_DEFRAG
)) {
1430 skb
= ip_check_defrag(net
, skb
, IP_DEFRAG_AF_PACKET
);
1435 case PACKET_FANOUT_HASH
:
1437 idx
= fanout_demux_hash(f
, skb
, num
);
1439 case PACKET_FANOUT_LB
:
1440 idx
= fanout_demux_lb(f
, skb
, num
);
1442 case PACKET_FANOUT_CPU
:
1443 idx
= fanout_demux_cpu(f
, skb
, num
);
1445 case PACKET_FANOUT_RND
:
1446 idx
= fanout_demux_rnd(f
, skb
, num
);
1448 case PACKET_FANOUT_QM
:
1449 idx
= fanout_demux_qm(f
, skb
, num
);
1451 case PACKET_FANOUT_ROLLOVER
:
1452 idx
= fanout_demux_rollover(f
, skb
, 0, false, num
);
1454 case PACKET_FANOUT_CBPF
:
1455 case PACKET_FANOUT_EBPF
:
1456 idx
= fanout_demux_bpf(f
, skb
, num
);
1460 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_ROLLOVER
))
1461 idx
= fanout_demux_rollover(f
, skb
, idx
, true, num
);
1463 po
= pkt_sk(f
->arr
[idx
]);
1464 return po
->prot_hook
.func(skb
, dev
, &po
->prot_hook
, orig_dev
);
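/*
 * Illustrative user-space sketch (not part of this file, helper name made up
 * for illustration): joining the fanout group that packet_rcv_fanout()
 * demultiplexes for. Sockets calling this with the same id share the load
 * according to the chosen mode, e.g. PACKET_FANOUT_HASH below; the mode and
 * flags occupy the upper 16 bits of the option value.
 *
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	static int join_fanout_group(int fd, unsigned short id)
 *	{
 *		int arg = id | (PACKET_FANOUT_HASH << 16);
 *
 *		return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
 *				  &arg, sizeof(arg));
 *	}
 */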
1467 DEFINE_MUTEX(fanout_mutex
);
1468 EXPORT_SYMBOL_GPL(fanout_mutex
);
1469 static LIST_HEAD(fanout_list
);
1470 static u16 fanout_next_id
;
1472 static void __fanout_link(struct sock
*sk
, struct packet_sock
*po
)
1474 struct packet_fanout
*f
= po
->fanout
;
1476 spin_lock(&f
->lock
);
1477 f
->arr
[f
->num_members
] = sk
;
1480 if (f
->num_members
== 1)
1481 dev_add_pack(&f
->prot_hook
);
1482 spin_unlock(&f
->lock
);
1485 static void __fanout_unlink(struct sock
*sk
, struct packet_sock
*po
)
1487 struct packet_fanout
*f
= po
->fanout
;
1490 spin_lock(&f
->lock
);
1491 for (i
= 0; i
< f
->num_members
; i
++) {
1492 if (f
->arr
[i
] == sk
)
1495 BUG_ON(i
>= f
->num_members
);
1496 f
->arr
[i
] = f
->arr
[f
->num_members
- 1];
1498 if (f
->num_members
== 0)
1499 __dev_remove_pack(&f
->prot_hook
);
1500 spin_unlock(&f
->lock
);
1503 static bool match_fanout_group(struct packet_type
*ptype
, struct sock
*sk
)
1505 if (sk
->sk_family
!= PF_PACKET
)
1508 return ptype
->af_packet_priv
== pkt_sk(sk
)->fanout
;
1511 static void fanout_init_data(struct packet_fanout
*f
)
1514 case PACKET_FANOUT_LB
:
1515 atomic_set(&f
->rr_cur
, 0);
1517 case PACKET_FANOUT_CBPF
:
1518 case PACKET_FANOUT_EBPF
:
1519 RCU_INIT_POINTER(f
->bpf_prog
, NULL
);
1524 static void __fanout_set_data_bpf(struct packet_fanout
*f
, struct bpf_prog
*new)
1526 struct bpf_prog
*old
;
1528 spin_lock(&f
->lock
);
1529 old
= rcu_dereference_protected(f
->bpf_prog
, lockdep_is_held(&f
->lock
));
1530 rcu_assign_pointer(f
->bpf_prog
, new);
1531 spin_unlock(&f
->lock
);
1535 bpf_prog_destroy(old
);
1539 static int fanout_set_data_cbpf(struct packet_sock
*po
, char __user
*data
,
1542 struct bpf_prog
*new;
1543 struct sock_fprog fprog
;
1546 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1548 if (len
!= sizeof(fprog
))
1550 if (copy_from_user(&fprog
, data
, len
))
1553 ret
= bpf_prog_create_from_user(&new, &fprog
, NULL
, false);
1557 __fanout_set_data_bpf(po
->fanout
, new);
1561 static int fanout_set_data_ebpf(struct packet_sock
*po
, char __user
*data
,
1564 struct bpf_prog
*new;
1567 if (sock_flag(&po
->sk
, SOCK_FILTER_LOCKED
))
1569 if (len
!= sizeof(fd
))
1571 if (copy_from_user(&fd
, data
, len
))
1574 new = bpf_prog_get_type(fd
, BPF_PROG_TYPE_SOCKET_FILTER
);
1576 return PTR_ERR(new);
1578 __fanout_set_data_bpf(po
->fanout
, new);
1582 static int fanout_set_data(struct packet_sock
*po
, char __user
*data
,
1585 switch (po
->fanout
->type
) {
1586 case PACKET_FANOUT_CBPF
:
1587 return fanout_set_data_cbpf(po
, data
, len
);
1588 case PACKET_FANOUT_EBPF
:
1589 return fanout_set_data_ebpf(po
, data
, len
);
1595 static void fanout_release_data(struct packet_fanout
*f
)
1598 case PACKET_FANOUT_CBPF
:
1599 case PACKET_FANOUT_EBPF
:
1600 __fanout_set_data_bpf(f
, NULL
);
1604 static bool __fanout_id_is_free(struct sock
*sk
, u16 candidate_id
)
1606 struct packet_fanout
*f
;
1608 list_for_each_entry(f
, &fanout_list
, list
) {
1609 if (f
->id
== candidate_id
&&
1610 read_pnet(&f
->net
) == sock_net(sk
)) {
1617 static bool fanout_find_new_id(struct sock
*sk
, u16
*new_id
)
1619 u16 id
= fanout_next_id
;
1622 if (__fanout_id_is_free(sk
, id
)) {
1624 fanout_next_id
= id
+ 1;
1629 } while (id
!= fanout_next_id
);
1634 static int fanout_add(struct sock
*sk
, u16 id
, u16 type_flags
)
1636 struct packet_rollover
*rollover
= NULL
;
1637 struct packet_sock
*po
= pkt_sk(sk
);
1638 struct packet_fanout
*f
, *match
;
1639 u8 type
= type_flags
& 0xff;
1640 u8 flags
= type_flags
>> 8;
1644 case PACKET_FANOUT_ROLLOVER
:
1645 if (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)
1647 case PACKET_FANOUT_HASH
:
1648 case PACKET_FANOUT_LB
:
1649 case PACKET_FANOUT_CPU
:
1650 case PACKET_FANOUT_RND
:
1651 case PACKET_FANOUT_QM
:
1652 case PACKET_FANOUT_CBPF
:
1653 case PACKET_FANOUT_EBPF
:
1659 mutex_lock(&fanout_mutex
);
1665 if (type
== PACKET_FANOUT_ROLLOVER
||
1666 (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)) {
1668 rollover
= kzalloc(sizeof(*rollover
), GFP_KERNEL
);
1671 atomic_long_set(&rollover
->num
, 0);
1672 atomic_long_set(&rollover
->num_huge
, 0);
1673 atomic_long_set(&rollover
->num_failed
, 0);
1676 if (type_flags
& PACKET_FANOUT_FLAG_UNIQUEID
) {
1681 if (!fanout_find_new_id(sk
, &id
)) {
1685 /* ephemeral flag for the first socket in the group: drop it */
1686 flags
&= ~(PACKET_FANOUT_FLAG_UNIQUEID
>> 8);
1690 list_for_each_entry(f
, &fanout_list
, list
) {
1692 read_pnet(&f
->net
) == sock_net(sk
)) {
1698 if (match
&& match
->flags
!= flags
)
1702 match
= kzalloc(sizeof(*match
), GFP_KERNEL
);
1705 write_pnet(&match
->net
, sock_net(sk
));
1708 match
->flags
= flags
;
1709 INIT_LIST_HEAD(&match
->list
);
1710 spin_lock_init(&match
->lock
);
1711 refcount_set(&match
->sk_ref
, 0);
1712 fanout_init_data(match
);
1713 match
->prot_hook
.type
= po
->prot_hook
.type
;
1714 match
->prot_hook
.dev
= po
->prot_hook
.dev
;
1715 match
->prot_hook
.func
= packet_rcv_fanout
;
1716 match
->prot_hook
.af_packet_priv
= match
;
1717 match
->prot_hook
.id_match
= match_fanout_group
;
1718 list_add(&match
->list
, &fanout_list
);
1722 spin_lock(&po
->bind_lock
);
1724 match
->type
== type
&&
1725 match
->prot_hook
.type
== po
->prot_hook
.type
&&
1726 match
->prot_hook
.dev
== po
->prot_hook
.dev
) {
1728 if (refcount_read(&match
->sk_ref
) < PACKET_FANOUT_MAX
) {
1729 __dev_remove_pack(&po
->prot_hook
);
1731 po
->rollover
= rollover
;
1733 refcount_set(&match
->sk_ref
, refcount_read(&match
->sk_ref
) + 1);
1734 __fanout_link(sk
, po
);
1738 spin_unlock(&po
->bind_lock
);
1740 if (err
&& !refcount_read(&match
->sk_ref
)) {
1741 list_del(&match
->list
);
1747 mutex_unlock(&fanout_mutex
);
1751 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1752 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1753 * It is the responsibility of the caller to call fanout_release_data() and
1754 * free the returned packet_fanout (after synchronize_net())
1756 static struct packet_fanout
*fanout_release(struct sock
*sk
)
1758 struct packet_sock
*po
= pkt_sk(sk
);
1759 struct packet_fanout
*f
;
1761 mutex_lock(&fanout_mutex
);
1766 if (refcount_dec_and_test(&f
->sk_ref
))
1771 mutex_unlock(&fanout_mutex
);
1776 static bool packet_extra_vlan_len_allowed(const struct net_device
*dev
,
1777 struct sk_buff
*skb
)
1779 /* Earlier code assumed this would be a VLAN pkt, double-check
1780 * this now that we have the actual packet in hand. We can only
1781 * do this check on Ethernet devices.
1783 if (unlikely(dev
->type
!= ARPHRD_ETHER
))
1786 skb_reset_mac_header(skb
);
1787 return likely(eth_hdr(skb
)->h_proto
== htons(ETH_P_8021Q
));
1790 static const struct proto_ops packet_ops
;
1792 static const struct proto_ops packet_ops_spkt
;
1794 static int packet_rcv_spkt(struct sk_buff
*skb
, struct net_device
*dev
,
1795 struct packet_type
*pt
, struct net_device
*orig_dev
)
1798 struct sockaddr_pkt
*spkt
;
1801 * When we registered the protocol we saved the socket in the data
1802 * field for just this event.
1805 sk
= pt
->af_packet_priv
;
1808 * Yank back the headers [hope the device set this
1809 * right or kerboom...]
1811 * Incoming packets have ll header pulled,
1814 * For outgoing ones skb->data == skb_mac_header(skb)
1815 * so that this procedure is noop.
1818 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1821 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1824 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1828 /* drop any routing info */
1831 /* drop conntrack reference */
1834 spkt
= &PACKET_SKB_CB(skb
)->sa
.pkt
;
1836 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1839 * The SOCK_PACKET socket receives _all_ frames.
1842 spkt
->spkt_family
= dev
->type
;
1843 strlcpy(spkt
->spkt_device
, dev
->name
, sizeof(spkt
->spkt_device
));
1844 spkt
->spkt_protocol
= skb
->protocol
;
1847 * Charge the memory to the socket. This is done specifically
1848 * to prevent sockets using all the memory up.
1851 if (sock_queue_rcv_skb(sk
, skb
) == 0)
1862 * Output a raw packet to a device layer. This bypasses all the other
1863 * protocol layers and you must therefore supply it with a complete frame
1866 static int packet_sendmsg_spkt(struct socket
*sock
, struct msghdr
*msg
,
1869 struct sock
*sk
= sock
->sk
;
1870 DECLARE_SOCKADDR(struct sockaddr_pkt
*, saddr
, msg
->msg_name
);
1871 struct sk_buff
*skb
= NULL
;
1872 struct net_device
*dev
;
1873 struct sockcm_cookie sockc
;
1879 * Get and verify the address.
1883 if (msg
->msg_namelen
< sizeof(struct sockaddr
))
1885 if (msg
->msg_namelen
== sizeof(struct sockaddr_pkt
))
1886 proto
= saddr
->spkt_protocol
;
1888 return -ENOTCONN
; /* SOCK_PACKET must be sent giving an address */
1891 * Find the device first to size check it
1894 saddr
->spkt_device
[sizeof(saddr
->spkt_device
) - 1] = 0;
1897 dev
= dev_get_by_name_rcu(sock_net(sk
), saddr
->spkt_device
);
1903 if (!(dev
->flags
& IFF_UP
))
1907 * You may not queue a frame bigger than the mtu. This is the lowest level
1908 * raw protocol and you must do your own fragmentation at this level.
1911 if (unlikely(sock_flag(sk
, SOCK_NOFCS
))) {
1912 if (!netif_supports_nofcs(dev
)) {
1913 err
= -EPROTONOSUPPORT
;
1916 extra_len
= 4; /* We're doing our own CRC */
1920 if (len
> dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
+ extra_len
)
1924 size_t reserved
= LL_RESERVED_SPACE(dev
);
1925 int tlen
= dev
->needed_tailroom
;
1926 unsigned int hhlen
= dev
->header_ops
? dev
->hard_header_len
: 0;
1929 skb
= sock_wmalloc(sk
, len
+ reserved
+ tlen
, 0, GFP_KERNEL
);
1932 /* FIXME: Save some space for broken drivers that write a hard
1933 * header at transmission time by themselves. PPP is the notable
1934 * one here. This should really be fixed at the driver level.
1936 skb_reserve(skb
, reserved
);
1937 skb_reset_network_header(skb
);
1939 /* Try to align data part correctly */
1944 skb_reset_network_header(skb
);
1946 err
= memcpy_from_msg(skb_put(skb
, len
), msg
, len
);
1952 if (!dev_validate_header(dev
, skb
->data
, len
)) {
1956 if (len
> (dev
->mtu
+ dev
->hard_header_len
+ extra_len
) &&
1957 !packet_extra_vlan_len_allowed(dev
, skb
)) {
1962 sockcm_init(&sockc
, sk
);
1963 if (msg
->msg_controllen
) {
1964 err
= sock_cmsg_send(sk
, msg
, &sockc
);
1969 skb
->protocol
= proto
;
1971 skb
->priority
= sk
->sk_priority
;
1972 skb
->mark
= sk
->sk_mark
;
1973 skb
->tstamp
= sockc
.transmit_time
;
1975 sock_tx_timestamp(sk
, sockc
.tsflags
, &skb_shinfo(skb
)->tx_flags
);
1977 if (unlikely(extra_len
== 4))
1980 skb_probe_transport_header(skb
, 0);
1982 dev_queue_xmit(skb
);
1993 static unsigned int run_filter(struct sk_buff
*skb
,
1994 const struct sock
*sk
,
1997 struct sk_filter
*filter
;
2000 filter
= rcu_dereference(sk
->sk_filter
);
2002 res
= bpf_prog_run_clear_cb(filter
->prog
, skb
);
2008 static int packet_rcv_vnet(struct msghdr
*msg
, const struct sk_buff
*skb
,
2011 struct virtio_net_hdr vnet_hdr
;
2013 if (*len
< sizeof(vnet_hdr
))
2015 *len
-= sizeof(vnet_hdr
);
2017 if (virtio_net_hdr_from_skb(skb
, &vnet_hdr
, vio_le(), true, 0))
2020 return memcpy_to_msg(msg
, (void *)&vnet_hdr
, sizeof(vnet_hdr
));
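/*
 * Illustrative user-space sketch (not part of this file, helper name made up
 * for illustration): with PACKET_VNET_HDR enabled, each packet read from the
 * socket is preceded by the struct virtio_net_hdr that packet_rcv_vnet()
 * copies out above.
 *
 *	#include <linux/if_packet.h>
 *	#include <linux/virtio_net.h>
 *	#include <sys/socket.h>
 *
 *	static int enable_vnet_hdr(int fd)
 *	{
 *		int on = 1;
 *
 *		return setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR,
 *				  &on, sizeof(on));
 *	}
 *
 *	// Subsequent reads return a virtio_net_hdr followed by the frame.
 */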
/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
2035 static int packet_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2036 struct packet_type
*pt
, struct net_device
*orig_dev
)
2039 struct sockaddr_ll
*sll
;
2040 struct packet_sock
*po
;
2041 u8
*skb_head
= skb
->data
;
2042 int skb_len
= skb
->len
;
2043 unsigned int snaplen
, res
;
2044 bool is_drop_n_account
= false;
2046 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2049 sk
= pt
->af_packet_priv
;
2052 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2057 if (dev
->header_ops
) {
2058 /* The device has an explicit notion of ll header,
2059 * exported to higher levels.
2061 * Otherwise, the device hides details of its frame
2062 * structure, so that corresponding packet head is
2063 * never delivered to user.
2065 if (sk
->sk_type
!= SOCK_DGRAM
)
2066 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2067 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2068 /* Special case: outgoing packets have ll header at head */
2069 skb_pull(skb
, skb_network_offset(skb
));
2075 res
= run_filter(skb
, sk
, snaplen
);
2077 goto drop_n_restore
;
2081 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
)
2084 if (skb_shared(skb
)) {
2085 struct sk_buff
*nskb
= skb_clone(skb
, GFP_ATOMIC
);
2089 if (skb_head
!= skb
->data
) {
2090 skb
->data
= skb_head
;
2097 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb
)) + MAX_ADDR_LEN
- 8);
2099 sll
= &PACKET_SKB_CB(skb
)->sa
.ll
;
2100 sll
->sll_hatype
= dev
->type
;
2101 sll
->sll_pkttype
= skb
->pkt_type
;
2102 if (unlikely(po
->origdev
))
2103 sll
->sll_ifindex
= orig_dev
->ifindex
;
2105 sll
->sll_ifindex
= dev
->ifindex
;
2107 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2109 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2110 * Use their space for storing the original skb length.
2112 PACKET_SKB_CB(skb
)->sa
.origlen
= skb
->len
;
2114 if (pskb_trim(skb
, snaplen
))
2117 skb_set_owner_r(skb
, sk
);
2121 /* drop conntrack reference */
2124 spin_lock(&sk
->sk_receive_queue
.lock
);
2125 po
->stats
.stats1
.tp_packets
++;
2126 sock_skb_set_dropcount(sk
, skb
);
2127 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
2128 spin_unlock(&sk
->sk_receive_queue
.lock
);
2129 sk
->sk_data_ready(sk
);
2133 is_drop_n_account
= true;
2134 spin_lock(&sk
->sk_receive_queue
.lock
);
2135 po
->stats
.stats1
.tp_drops
++;
2136 atomic_inc(&sk
->sk_drops
);
2137 spin_unlock(&sk
->sk_receive_queue
.lock
);
2140 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2141 skb
->data
= skb_head
;
2145 if (!is_drop_n_account
)
2152 static int tpacket_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
2153 struct packet_type
*pt
, struct net_device
*orig_dev
)
2156 struct packet_sock
*po
;
2157 struct sockaddr_ll
*sll
;
2158 union tpacket_uhdr h
;
2159 u8
*skb_head
= skb
->data
;
2160 int skb_len
= skb
->len
;
2161 unsigned int snaplen
, res
;
2162 unsigned long status
= TP_STATUS_USER
;
2163 unsigned short macoff
, netoff
, hdrlen
;
2164 struct sk_buff
*copy_skb
= NULL
;
2167 bool is_drop_n_account
= false;
2168 unsigned int slot_id
= 0;
2169 bool do_vnet
= false;
2171 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2172 * We may add members to them until current aligned size without forcing
2173 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2175 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h2
)) != 32);
2176 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h
.h3
)) != 48);
2178 if (skb
->pkt_type
== PACKET_LOOPBACK
)
2181 sk
= pt
->af_packet_priv
;
2184 if (!net_eq(dev_net(dev
), sock_net(sk
)))
2187 if (dev
->header_ops
) {
2188 if (sk
->sk_type
!= SOCK_DGRAM
)
2189 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2190 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
2191 /* Special case: outgoing packets have ll header at head */
2192 skb_pull(skb
, skb_network_offset(skb
));
2198 res
= run_filter(skb
, sk
, snaplen
);
2200 goto drop_n_restore
;
2202 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
2203 status
|= TP_STATUS_CSUMNOTREADY
;
2204 else if (skb
->pkt_type
!= PACKET_OUTGOING
&&
2205 (skb
->ip_summed
== CHECKSUM_COMPLETE
||
2206 skb_csum_unnecessary(skb
)))
2207 status
|= TP_STATUS_CSUM_VALID
;
2212 if (sk
->sk_type
== SOCK_DGRAM
) {
2213 macoff
= netoff
= TPACKET_ALIGN(po
->tp_hdrlen
) + 16 +
2216 unsigned int maclen
= skb_network_offset(skb
);
2217 netoff
= TPACKET_ALIGN(po
->tp_hdrlen
+
2218 (maclen
< 16 ? 16 : maclen
)) +
2220 if (po
->has_vnet_hdr
) {
2221 netoff
+= sizeof(struct virtio_net_hdr
);
2224 macoff
= netoff
- maclen
;
2226 if (po
->tp_version
<= TPACKET_V2
) {
2227 if (macoff
+ snaplen
> po
->rx_ring
.frame_size
) {
2228 if (po
->copy_thresh
&&
2229 atomic_read(&sk
->sk_rmem_alloc
) < sk
->sk_rcvbuf
) {
2230 if (skb_shared(skb
)) {
2231 copy_skb
= skb_clone(skb
, GFP_ATOMIC
);
2233 copy_skb
= skb_get(skb
);
2234 skb_head
= skb
->data
;
2237 skb_set_owner_r(copy_skb
, sk
);
2239 snaplen
= po
->rx_ring
.frame_size
- macoff
;
2240 if ((int)snaplen
< 0) {
2245 } else if (unlikely(macoff
+ snaplen
>
2246 GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
)) {
2249 nval
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
- macoff
;
2250 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2251 snaplen
, nval
, macoff
);
2253 if (unlikely((int)snaplen
< 0)) {
2255 macoff
= GET_PBDQC_FROM_RB(&po
->rx_ring
)->max_frame_len
;
2259 spin_lock(&sk
->sk_receive_queue
.lock
);
2260 h
.raw
= packet_current_rx_frame(po
, skb
,
2261 TP_STATUS_KERNEL
, (macoff
+snaplen
));
2263 goto drop_n_account
;
2265 if (po
->tp_version
<= TPACKET_V2
) {
2266 slot_id
= po
->rx_ring
.head
;
2267 if (test_bit(slot_id
, po
->rx_ring
.rx_owner_map
))
2268 goto drop_n_account
;
2269 __set_bit(slot_id
, po
->rx_ring
.rx_owner_map
);
2273 virtio_net_hdr_from_skb(skb
, h
.raw
+ macoff
-
2274 sizeof(struct virtio_net_hdr
),
2276 goto drop_n_account
;
2278 if (po
->tp_version
<= TPACKET_V2
) {
2279 packet_increment_rx_head(po
, &po
->rx_ring
);
2281 * LOSING will be reported till you read the stats,
2282 * because it's COR - Clear On Read.
2283 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2286 if (po
->stats
.stats1
.tp_drops
)
2287 status
|= TP_STATUS_LOSING
;
2290 po
->stats
.stats1
.tp_packets
++;
2292 status
|= TP_STATUS_COPY
;
2293 __skb_queue_tail(&sk
->sk_receive_queue
, copy_skb
);
2295 spin_unlock(&sk
->sk_receive_queue
.lock
);
2297 skb_copy_bits(skb
, 0, h
.raw
+ macoff
, snaplen
);
2299 if (!(ts_status
= tpacket_get_timestamp(skb
, &ts
, po
->tp_tstamp
)))
2300 getnstimeofday(&ts
);
2302 status
|= ts_status
;
2304 switch (po
->tp_version
) {
2306 h
.h1
->tp_len
= skb
->len
;
2307 h
.h1
->tp_snaplen
= snaplen
;
2308 h
.h1
->tp_mac
= macoff
;
2309 h
.h1
->tp_net
= netoff
;
2310 h
.h1
->tp_sec
= ts
.tv_sec
;
2311 h
.h1
->tp_usec
= ts
.tv_nsec
/ NSEC_PER_USEC
;
2312 hdrlen
= sizeof(*h
.h1
);
2315 h
.h2
->tp_len
= skb
->len
;
2316 h
.h2
->tp_snaplen
= snaplen
;
2317 h
.h2
->tp_mac
= macoff
;
2318 h
.h2
->tp_net
= netoff
;
2319 h
.h2
->tp_sec
= ts
.tv_sec
;
2320 h
.h2
->tp_nsec
= ts
.tv_nsec
;
2321 if (skb_vlan_tag_present(skb
)) {
2322 h
.h2
->tp_vlan_tci
= skb_vlan_tag_get(skb
);
2323 h
.h2
->tp_vlan_tpid
= ntohs(skb
->vlan_proto
);
2324 status
|= TP_STATUS_VLAN_VALID
| TP_STATUS_VLAN_TPID_VALID
;
2326 h
.h2
->tp_vlan_tci
= 0;
2327 h
.h2
->tp_vlan_tpid
= 0;
2329 memset(h
.h2
->tp_padding
, 0, sizeof(h
.h2
->tp_padding
));
2330 hdrlen
= sizeof(*h
.h2
);
2333 /* tp_nxt_offset,vlan are already populated above.
2334 * So DONT clear those fields here
2336 h
.h3
->tp_status
|= status
;
2337 h
.h3
->tp_len
= skb
->len
;
2338 h
.h3
->tp_snaplen
= snaplen
;
2339 h
.h3
->tp_mac
= macoff
;
2340 h
.h3
->tp_net
= netoff
;
2341 h
.h3
->tp_sec
= ts
.tv_sec
;
2342 h
.h3
->tp_nsec
= ts
.tv_nsec
;
2343 memset(h
.h3
->tp_padding
, 0, sizeof(h
.h3
->tp_padding
));
2344 hdrlen
= sizeof(*h
.h3
);
2350 sll
= h
.raw
+ TPACKET_ALIGN(hdrlen
);
2351 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
2352 sll
->sll_family
= AF_PACKET
;
2353 sll
->sll_hatype
= dev
->type
;
2354 sll
->sll_protocol
= skb
->protocol
;
2355 sll
->sll_pkttype
= skb
->pkt_type
;
2356 if (unlikely(po
->origdev
))
2357 sll
->sll_ifindex
= orig_dev
->ifindex
;
2359 sll
->sll_ifindex
= dev
->ifindex
;
2363 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2364 if (po
->tp_version
<= TPACKET_V2
) {
2367 end
= (u8
*) PAGE_ALIGN((unsigned long) h
.raw
+
2370 for (start
= h
.raw
; start
< end
; start
+= PAGE_SIZE
)
2371 flush_dcache_page(pgv_to_page(start
));
2376 if (po
->tp_version
<= TPACKET_V2
) {
2377 spin_lock(&sk
->sk_receive_queue
.lock
);
2378 __packet_set_status(po
, h
.raw
, status
);
2379 __clear_bit(slot_id
, po
->rx_ring
.rx_owner_map
);
2380 spin_unlock(&sk
->sk_receive_queue
.lock
);
2381 sk
->sk_data_ready(sk
);
2383 prb_clear_blk_fill_status(&po
->rx_ring
);
2387 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
2388 skb
->data
= skb_head
;
2392 if (!is_drop_n_account
)
2399 is_drop_n_account
= true;
2400 po
->stats
.stats1
.tp_drops
++;
2401 spin_unlock(&sk
->sk_receive_queue
.lock
);
2403 sk
->sk_data_ready(sk
);
2404 kfree_skb(copy_skb
);
2405 goto drop_n_restore
;
static void tpacket_destruct_skb(struct sk_buff *skb)
    struct packet_sock *po = pkt_sk(skb->sk);

    if (likely(po->tx_ring.pg_vec)) {
        ph = skb_zcopy_get_nouarg(skb);
        packet_dec_pending(&po->tx_ring);
        ts = __packet_set_timestamp(po, ph, skb);
        __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
        if (!packet_read_pending(&po->tx_ring))
            complete(&po->skb_completion);

static void tpacket_set_protocol(const struct net_device *dev,
                 struct sk_buff *skb)
    if (dev->type == ARPHRD_ETHER) {
        skb_reset_mac_header(skb);
        skb->protocol = eth_hdr(skb)->h_proto;

static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
    if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
        (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
         __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
         __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
        vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
             __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
             __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);

    if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)

static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
                 struct virtio_net_hdr *vnet_hdr)
    if (*len < sizeof(*vnet_hdr))
    *len -= sizeof(*vnet_hdr);

    if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))

    return __packet_snd_vnet_parse(vnet_hdr, *len);
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
        void *frame, struct net_device *dev, void *data, int tp_len,
        __be16 proto, unsigned char *addr, int hlen, int copylen,
        const struct sockcm_cookie *sockc)
    union tpacket_uhdr ph;
    int to_write, offset, len, nr_frags, len_max;
    struct socket *sock = po->sk.sk_socket;

    skb->protocol = proto;
    skb->priority = po->sk.sk_priority;
    skb->mark = po->sk.sk_mark;
    skb->tstamp = sockc->transmit_time;
    sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
    skb_zcopy_set_nouarg(skb, ph.raw);

    skb_reserve(skb, hlen);
    skb_reset_network_header(skb);

    if (sock->type == SOCK_DGRAM) {
        err = dev_hard_header(skb, dev, ntohs(proto), addr,
                      NULL, tp_len);
        if (unlikely(err < 0))
    } else if (copylen) {
        int hdrlen = min_t(int, copylen, tp_len);

        skb_push(skb, dev->hard_header_len);
        skb_put(skb, copylen - dev->hard_header_len);
        err = skb_store_bits(skb, 0, data, hdrlen);
        if (!dev_validate_header(dev, skb->data, hdrlen))
        tpacket_set_protocol(dev, skb);

    offset = offset_in_page(data);
    len_max = PAGE_SIZE - offset;
    len = ((to_write > len_max) ? len_max : to_write);

    skb->data_len = to_write;
    skb->len += to_write;
    skb->truesize += to_write;
    refcount_add(to_write, &po->sk.sk_wmem_alloc);

    while (likely(to_write)) {
        nr_frags = skb_shinfo(skb)->nr_frags;

        if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
            pr_err("Packet exceed the number of skb frags(%lu)\n",
                   MAX_SKB_FRAGS);

        page = pgv_to_page(data);
        flush_dcache_page(page);
        skb_fill_page_desc(skb, nr_frags, page, offset, len);
        len_max = PAGE_SIZE;
        len = ((to_write > len_max) ? len_max : to_write);

    skb_probe_transport_header(skb, 0);

static int tpacket_parse_header(struct packet_sock *po, void *frame,
                int size_max, void **data)
    union tpacket_uhdr ph;

    switch (po->tp_version) {
        if (ph.h3->tp_next_offset != 0) {
            pr_warn_once("variable sized slot not supported");
        tp_len = ph.h3->tp_len;
        tp_len = ph.h2->tp_len;
        tp_len = ph.h1->tp_len;
    if (unlikely(tp_len > size_max)) {
        pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);

    if (unlikely(po->tp_tx_has_off)) {
        int off_min, off_max;

        off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
        off_max = po->tx_ring.frame_size - tp_len;
        if (po->sk.sk_type == SOCK_DGRAM) {
            switch (po->tp_version) {
                off = ph.h3->tp_net;
                off = ph.h2->tp_net;
                off = ph.h1->tp_net;
            switch (po->tp_version) {
                off = ph.h3->tp_mac;
                off = ph.h2->tp_mac;
                off = ph.h1->tp_mac;
        if (unlikely((off < off_min) || (off_max < off)))
        off = po->tp_hdrlen - sizeof(struct sockaddr_ll);

    *data = frame + off;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
    struct sk_buff *skb = NULL;
    struct net_device *dev;
    struct virtio_net_hdr *vnet_hdr = NULL;
    struct sockcm_cookie sockc;
    int err, reserve = 0;
    DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
    unsigned char *addr = NULL;
    int tp_len, size_max;
    int status = TP_STATUS_AVAILABLE;
    int hlen, tlen, copylen = 0;

    mutex_lock(&po->pg_vec_lock);

    /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
     * we need to confirm it under protection of pg_vec_lock.
     */
    if (unlikely(!po->tx_ring.pg_vec)) {

    if (likely(saddr == NULL)) {
        dev = packet_cached_dev_get(po);
        if (msg->msg_namelen < sizeof(struct sockaddr_ll))
        if (msg->msg_namelen < (saddr->sll_halen
                    + offsetof(struct sockaddr_ll,
                           sll_addr)))
        proto = saddr->sll_protocol;
        dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
        if (po->sk.sk_socket->type == SOCK_DGRAM) {
            if (dev && msg->msg_namelen < dev->addr_len +
                   offsetof(struct sockaddr_ll, sll_addr))
            addr = saddr->sll_addr;

    if (unlikely(dev == NULL))
    if (unlikely(!(dev->flags & IFF_UP)))

    sockcm_init(&sockc, &po->sk);
    if (msg->msg_controllen) {
        err = sock_cmsg_send(&po->sk, msg, &sockc);

    if (po->sk.sk_socket->type == SOCK_RAW)
        reserve = dev->hard_header_len;
    size_max = po->tx_ring.frame_size
           - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

    if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
        size_max = dev->mtu + reserve + VLAN_HLEN;

    reinit_completion(&po->skb_completion);

        ph = packet_current_frame(po, &po->tx_ring,
                      TP_STATUS_SEND_REQUEST);
        if (unlikely(ph == NULL)) {
            if (need_wait && skb) {
                timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
                timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
                err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
            /* check for additional frames */

        tp_len = tpacket_parse_header(po, ph, size_max, &data);
        status = TP_STATUS_SEND_REQUEST;
        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        if (po->has_vnet_hdr) {
            data += sizeof(*vnet_hdr);
            tp_len -= sizeof(*vnet_hdr);
            if (tp_len < 0 ||
                __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
            copylen = __virtio16_to_cpu(vio_le(),
                            vnet_hdr->hdr_len);
        copylen = max_t(int, copylen, dev->hard_header_len);
        skb = sock_alloc_send_skb(&po->sk,
                hlen + tlen + sizeof(struct sockaddr_ll) +
                (copylen - dev->hard_header_len),
                !need_wait, &err);

        if (unlikely(skb == NULL)) {
            /* we assume the socket was initially writeable ... */
            if (likely(len_sum > 0))
        tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
                      addr, hlen, copylen, &sockc);
        if (likely(tp_len >= 0) &&
            tp_len > dev->mtu + reserve &&
            !po->has_vnet_hdr &&
            !packet_extra_vlan_len_allowed(dev, skb))

        if (unlikely(tp_len < 0)) {
                __packet_set_status(po, ph,
                            TP_STATUS_AVAILABLE);
                packet_increment_head(&po->tx_ring);
            status = TP_STATUS_WRONG_FORMAT;

        if (po->has_vnet_hdr) {
            if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
            virtio_net_hdr_set_proto(skb, vnet_hdr);

        skb->destructor = tpacket_destruct_skb;
        __packet_set_status(po, ph, TP_STATUS_SENDING);
        packet_inc_pending(&po->tx_ring);

        status = TP_STATUS_SEND_REQUEST;
        err = po->xmit(skb);
        if (unlikely(err > 0)) {
            err = net_xmit_errno(err);
            if (err && __packet_get_status(po, ph) ==
                   TP_STATUS_AVAILABLE) {
                /* skb was destructed already */
            /* skb was dropped but not destructed yet;
             * let's treat it like congestion or err < 0
             */
        packet_increment_head(&po->tx_ring);
    } while (likely((ph != NULL) ||
        /* Note: packet_read_pending() might be slow if we have
         * to call it as it's per_cpu variable, but in fast-path
         * we already short-circuit the loop with the first
         * condition, and luckily don't have to go that path
         */
         (need_wait && packet_read_pending(&po->tx_ring))));

    __packet_set_status(po, ph, status);
    mutex_unlock(&po->pg_vec_lock);
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                    size_t reserve, size_t len,
                    size_t linear, int noblock,
                    int *err)
    struct sk_buff *skb;

    /* Under a page? Don't bother with paged skb. */
    if (prepad + len < PAGE_SIZE || !linear)

    skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                   err, 0);
    skb_reserve(skb, reserve);
    skb_put(skb, linear);
    skb->data_len = len - linear;
    skb->len += len - linear;

static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
    struct sock *sk = sock->sk;
    DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    struct sk_buff *skb;
    struct net_device *dev;
    unsigned char *addr = NULL;
    int err, reserve = 0;
    struct sockcm_cookie sockc;
    struct virtio_net_hdr vnet_hdr = { 0 };
    struct packet_sock *po = pkt_sk(sk);
    bool has_vnet_hdr = false;
    int hlen, tlen, linear;

    /*
     *	Get and verify the address.
     */

    if (likely(saddr == NULL)) {
        dev = packet_cached_dev_get(po);
        if (msg->msg_namelen < sizeof(struct sockaddr_ll))
        if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
        proto = saddr->sll_protocol;
        dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
        if (sock->type == SOCK_DGRAM) {
            if (dev && msg->msg_namelen < dev->addr_len +
                   offsetof(struct sockaddr_ll, sll_addr))
            addr = saddr->sll_addr;

    if (unlikely(dev == NULL))
    if (unlikely(!(dev->flags & IFF_UP)))

    sockcm_init(&sockc, sk);
    sockc.mark = sk->sk_mark;
    if (msg->msg_controllen) {
        err = sock_cmsg_send(sk, msg, &sockc);

    if (sock->type == SOCK_RAW)
        reserve = dev->hard_header_len;
    if (po->has_vnet_hdr) {
        err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
        has_vnet_hdr = true;

    if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
        if (!netif_supports_nofcs(dev)) {
            err = -EPROTONOSUPPORT;
        extra_len = 4; /* We're doing our own CRC */

    if (!vnet_hdr.gso_type &&
        (len > dev->mtu + reserve + VLAN_HLEN + extra_len))

    hlen = LL_RESERVED_SPACE(dev);
    tlen = dev->needed_tailroom;
    linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
    linear = max(linear, min_t(int, len, dev->hard_header_len));
    skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
                   msg->msg_flags & MSG_DONTWAIT, &err);

    skb_reset_network_header(skb);

    if (sock->type == SOCK_DGRAM) {
        offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
        if (unlikely(offset < 0))
    } else if (reserve) {
        skb_reserve(skb, -reserve);
        if (len < reserve + sizeof(struct ipv6hdr) &&
            dev->min_header_len != dev->hard_header_len)
            skb_reset_network_header(skb);

    /* Returns -EFAULT on error */
    err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);

    if (sock->type == SOCK_RAW &&
        !dev_validate_header(dev, skb->data, len)) {

    sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);

    if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
        !packet_extra_vlan_len_allowed(dev, skb)) {

    skb->protocol = proto;
    skb->priority = sk->sk_priority;
    skb->mark = sockc.mark;
    skb->tstamp = sockc.transmit_time;

        err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
        len += sizeof(vnet_hdr);
        virtio_net_hdr_set_proto(skb, &vnet_hdr);

    skb_probe_transport_header(skb, reserve);

    if (unlikely(extra_len == 4))

    err = po->xmit(skb);
    if (err > 0 && (err = net_xmit_errno(err)) != 0)

static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);

    if (po->tx_ring.pg_vec)
        return tpacket_snd(po, msg);

    return packet_snd(sock, msg, len);
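
/* Usage sketch (userspace, illustrative only, not part of this file):
 * packet_sendmsg() above takes tpacket_snd() when a PACKET_TX_RING is
 * configured and packet_snd() otherwise.  A minimal sketch of the non-ring
 * path, assuming a prebuilt Ethernet frame in buf[] and a hypothetical
 * interface named "eth0":
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <sys/socket.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	// sendto() lands in packet_snd(); on SOCK_RAW the buffer must
 *	// already contain the link-layer header.
 *	sendto(fd, buf, buf_len, 0, (struct sockaddr *)&sll, sizeof(sll));
 */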
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
    struct sock *sk = sock->sk;
    struct packet_sock *po;
    struct packet_fanout *f;
    union tpacket_req_u req_u;

    mutex_lock(&net->packet.sklist_lock);
    sk_del_node_init_rcu(sk);
    mutex_unlock(&net->packet.sklist_lock);
    sock_prot_inuse_add(net, sk->sk_prot, -1);

    spin_lock(&po->bind_lock);
    unregister_prot_hook(sk, false);
    packet_cached_dev_reset(po);

    if (po->prot_hook.dev) {
        dev_put(po->prot_hook.dev);
        po->prot_hook.dev = NULL;
    spin_unlock(&po->bind_lock);

    packet_flush_mclist(sk);

    if (po->rx_ring.pg_vec) {
        memset(&req_u, 0, sizeof(req_u));
        packet_set_ring(sk, &req_u, 1, 0);

    if (po->tx_ring.pg_vec) {
        memset(&req_u, 0, sizeof(req_u));
        packet_set_ring(sk, &req_u, 1, 1);

    f = fanout_release(sk);

    kfree(po->rollover);
    fanout_release_data(f);

    /*
     *	Now the socket is dead. No more input will appear.
     */
    skb_queue_purge(&sk->sk_receive_queue);
    packet_free_pending(po);
    sk_refcnt_debug_release(sk);

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
              __be16 proto)
    struct packet_sock *po = pkt_sk(sk);
    struct net_device *dev_curr;
    struct net_device *dev = NULL;
    bool unlisted = false;

    spin_lock(&po->bind_lock);
        dev = dev_get_by_name_rcu(sock_net(sk), name);
    } else if (ifindex) {
        dev = dev_get_by_index_rcu(sock_net(sk), ifindex);

    proto_curr = po->prot_hook.type;
    dev_curr = po->prot_hook.dev;

    need_rehook = proto_curr != proto || dev_curr != dev;
        /* prevents packet_notifier() from calling
         * register_prot_hook()
         */
        __unregister_prot_hook(sk, true);
        dev_curr = po->prot_hook.dev;
            unlisted = !dev_get_by_index_rcu(sock_net(sk),
                             dev->ifindex);

        BUG_ON(po->running);
        po->prot_hook.type = proto;

        if (unlikely(unlisted)) {
            po->prot_hook.dev = NULL;
            packet_cached_dev_reset(po);
            po->prot_hook.dev = dev;
            po->ifindex = dev ? dev->ifindex : 0;
            packet_cached_dev_assign(po, dev);

    if (proto == 0 || !need_rehook)

    if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
        register_prot_hook(sk);
        sk->sk_err = ENETDOWN;
        if (!sock_flag(sk, SOCK_DEAD))
            sk->sk_error_report(sk);

    spin_unlock(&po->bind_lock);

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                int addr_len)
    struct sock *sk = sock->sk;
    char name[sizeof(uaddr->sa_data) + 1];

    if (addr_len != sizeof(struct sockaddr))

    /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
     * zero-terminated.
     */
    memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
    name[sizeof(uaddr->sa_data)] = 0;

    return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
    struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
    struct sock *sk = sock->sk;

    if (addr_len < sizeof(struct sockaddr_ll))
    if (sll->sll_family != AF_PACKET)

    return packet_do_bind(sk, NULL, sll->sll_ifindex,
                  sll->sll_protocol ? : pkt_sk(sk)->num);
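
/* Usage sketch (userspace, illustrative only, not part of this file):
 * packet_bind() expects a struct sockaddr_ll; an sll_protocol of 0 keeps
 * whatever protocol the socket was created with.  "eth0" below is a
 * hypothetical interface name:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */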
static struct proto packet_proto = {
    .owner	  = THIS_MODULE,
    .obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
             int kern)
    struct packet_sock *po;
    __be16 proto = (__force __be16)protocol; /* weird, but documented */

    if (!ns_capable(net->user_ns, CAP_NET_RAW))
    if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
        sock->type != SOCK_PACKET)
        return -ESOCKTNOSUPPORT;

    sock->state = SS_UNCONNECTED;

    sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);

    sock->ops = &packet_ops;
    if (sock->type == SOCK_PACKET)
        sock->ops = &packet_ops_spkt;

    sock_init_data(sock, sk);

    init_completion(&po->skb_completion);
    sk->sk_family = PF_PACKET;
    po->xmit = dev_queue_xmit;

    err = packet_alloc_pending(po);

    packet_cached_dev_reset(po);

    sk->sk_destruct = packet_sock_destruct;
    sk_refcnt_debug_inc(sk);

    /*
     *	Attach a protocol block
     */

    spin_lock_init(&po->bind_lock);
    mutex_init(&po->pg_vec_lock);
    po->rollover = NULL;
    po->prot_hook.func = packet_rcv;

    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;

    po->prot_hook.af_packet_priv = sk;

    po->prot_hook.type = proto;
    __register_prot_hook(sk);

    mutex_lock(&net->packet.sklist_lock);
    sk_add_node_tail_rcu(sk, &net->packet.sklist);
    mutex_unlock(&net->packet.sklist_lock);
    sock_prot_inuse_add(net, &packet_proto, 1);
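
/* Usage sketch (userspace, illustrative only, not part of this file):
 * packet_create() is reached through socket(2) and requires CAP_NET_RAW in
 * the socket's user namespace:
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	// SOCK_RAW delivers/expects frames with the link-layer header;
 *	// SOCK_DGRAM lets the kernel add/strip it (dev_hard_header()).
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	if (fd < 0)
 *		perror("socket");	// typically EPERM without CAP_NET_RAW
 */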
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
              int flags)
    struct sock *sk = sock->sk;
    struct sk_buff *skb;
    int vnet_hdr_len = 0;
    unsigned int origlen = 0;

    if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))

    /* What error should we return now? EUNATTACH? */
    if (pkt_sk(sk)->ifindex < 0)

    if (flags & MSG_ERRQUEUE) {
        err = sock_recv_errqueue(sk, msg, len,
                     SOL_PACKET, PACKET_TX_TIMESTAMP);

    /*
     *	Call the generic datagram receiver. This handles all sorts
     *	of horrible races and re-entrancy so we can forget about it
     *	in the protocol layers.
     *
     *	Now it will return ENETDOWN, if device have just gone down,
     *	but then it will block.
     */
    skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

    /*
     *	An error occurred so return it. Because skb_recv_datagram()
     *	handles the blocking we don't see and worry about blocking
     *	retries.
     */

    if (pkt_sk(sk)->pressure)
        packet_rcv_has_room(pkt_sk(sk), NULL);

    if (pkt_sk(sk)->has_vnet_hdr) {
        err = packet_rcv_vnet(msg, skb, &len);
        vnet_hdr_len = sizeof(struct virtio_net_hdr);

    /* You lose any data beyond the buffer you gave. If it worries
     * a user program they can ask the device for its MTU
     * anyway.
     */
        msg->msg_flags |= MSG_TRUNC;

    err = skb_copy_datagram_msg(skb, 0, msg, copied);

    if (sock->type != SOCK_PACKET) {
        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

        /* Original length was stored in sockaddr_ll fields */
        origlen = PACKET_SKB_CB(skb)->sa.origlen;
        sll->sll_family = AF_PACKET;
        sll->sll_protocol = skb->protocol;

    sock_recv_ts_and_drops(msg, sk, skb);

    if (msg->msg_name) {
        /* If the address length field is there to be filled
         * in, we fill it in now.
         */
        if (sock->type == SOCK_PACKET) {
            __sockaddr_check_size(sizeof(struct sockaddr_pkt));
            msg->msg_namelen = sizeof(struct sockaddr_pkt);
            copy_len = msg->msg_namelen;
            struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

            msg->msg_namelen = sll->sll_halen +
                offsetof(struct sockaddr_ll, sll_addr);
            copy_len = msg->msg_namelen;
            if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
                memset(msg->msg_name +
                       offsetof(struct sockaddr_ll, sll_addr),
                       0, sizeof(sll->sll_addr));
                msg->msg_namelen = sizeof(struct sockaddr_ll);
        memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);

    if (pkt_sk(sk)->auxdata) {
        struct tpacket_auxdata aux;

        aux.tp_status = TP_STATUS_USER;
        if (skb->ip_summed == CHECKSUM_PARTIAL)
            aux.tp_status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
             (skb->ip_summed == CHECKSUM_COMPLETE ||
              skb_csum_unnecessary(skb)))
            aux.tp_status |= TP_STATUS_CSUM_VALID;

        aux.tp_len = origlen;
        aux.tp_snaplen = skb->len;
        aux.tp_net = skb_network_offset(skb);
        if (skb_vlan_tag_present(skb)) {
            aux.tp_vlan_tci = skb_vlan_tag_get(skb);
            aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
            aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
            aux.tp_vlan_tci = 0;
            aux.tp_vlan_tpid = 0;
        put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);

    /*
     *	Free or return the buffer as appropriate. Again this
     *	hides all the races and re-entrancy issues from us.
     */
    err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

    skb_free_datagram(sk, skb);
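
/* Usage sketch (userspace, illustrative only, not part of this file):
 * when PACKET_AUXDATA is enabled, packet_recvmsg() above attaches a
 * struct tpacket_auxdata control message to each packet:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	char data[2048], ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr m = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *	ssize_t n = recvmsg(fd, &m, 0);
 *	for (struct cmsghdr *c = CMSG_FIRSTHDR(&m); c; c = CMSG_NXTHDR(&m, c))
 *		if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *			// aux->tp_len is the original length; aux->tp_status
 *			// carries the TP_STATUS_CSUM* and VLAN validity bits.
 *		}
 */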
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                   int peer)
    struct net_device *dev;
    struct sock *sk = sock->sk;

    uaddr->sa_family = AF_PACKET;
    memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
    dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
        strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));

    return sizeof(*uaddr);

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
              int peer)
    struct net_device *dev;
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

    sll->sll_family = AF_PACKET;
    sll->sll_ifindex = po->ifindex;
    sll->sll_protocol = po->num;
    sll->sll_pkttype = 0;
    dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
        sll->sll_hatype = dev->type;
        sll->sll_halen = dev->addr_len;
        memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
        sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */

    return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
             int what)
    case PACKET_MR_MULTICAST:
        if (i->alen != dev->addr_len)
            return dev_mc_add(dev, i->addr);
            return dev_mc_del(dev, i->addr);
    case PACKET_MR_PROMISC:
        return dev_set_promiscuity(dev, what);
    case PACKET_MR_ALLMULTI:
        return dev_set_allmulti(dev, what);
    case PACKET_MR_UNICAST:
        if (i->alen != dev->addr_len)
            return dev_uc_add(dev, i->addr);
            return dev_uc_del(dev, i->addr);

static void packet_dev_mclist_delete(struct net_device *dev,
                     struct packet_mclist **mlp)
    struct packet_mclist *ml;

    while ((ml = *mlp) != NULL) {
        if (ml->ifindex == dev->ifindex) {
            packet_dev_mc(dev, ml, -1);

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
    struct packet_sock *po = pkt_sk(sk);
    struct packet_mclist *ml, *i;
    struct net_device *dev;

    dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);

    if (mreq->mr_alen > dev->addr_len)

    i = kmalloc(sizeof(*i), GFP_KERNEL);

    for (ml = po->mclist; ml; ml = ml->next) {
        if (ml->ifindex == mreq->mr_ifindex &&
            ml->type == mreq->mr_type &&
            ml->alen == mreq->mr_alen &&
            memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
            /* Free the new element ... */

    i->type = mreq->mr_type;
    i->ifindex = mreq->mr_ifindex;
    i->alen = mreq->mr_alen;
    memcpy(i->addr, mreq->mr_address, i->alen);
    memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
    i->next = po->mclist;
    err = packet_dev_mc(dev, i, 1);
        po->mclist = i->next;

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
    struct packet_mclist *ml, **mlp;

    for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
        if (ml->ifindex == mreq->mr_ifindex &&
            ml->type == mreq->mr_type &&
            ml->alen == mreq->mr_alen &&
            memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
            if (--ml->count == 0) {
                struct net_device *dev;

                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                    packet_dev_mc(dev, ml, -1);

static void packet_flush_mclist(struct sock *sk)
    struct packet_sock *po = pkt_sk(sk);
    struct packet_mclist *ml;

    while ((ml = po->mclist) != NULL) {
        struct net_device *dev;

        po->mclist = ml->next;
        dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
            packet_dev_mc(dev, ml, -1);
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    case PACKET_ADD_MEMBERSHIP:
    case PACKET_DROP_MEMBERSHIP:
        struct packet_mreq_max mreq;

        memset(&mreq, 0, sizeof(mreq));
        if (len < sizeof(struct packet_mreq))
        if (len > sizeof(mreq))
        if (copy_from_user(&mreq, optval, len))
        if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
        if (optname == PACKET_ADD_MEMBERSHIP)
            ret = packet_mc_add(sk, &mreq);
            ret = packet_mc_drop(sk, &mreq);

    case PACKET_RX_RING:
    case PACKET_TX_RING:
        union tpacket_req_u req_u;

        switch (po->tp_version) {
            len = sizeof(req_u.req);
            len = sizeof(req_u.req3);
        if (copy_from_user(&req_u.req, optval, len))
        ret = packet_set_ring(sk, &req_u, 0,
                      optname == PACKET_TX_RING);

    case PACKET_COPY_THRESH:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        pkt_sk(sk)->copy_thresh = val;

    case PACKET_VERSION:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            po->tp_version = val;

    case PACKET_RESERVE:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            po->tp_reserve = val;

        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            po->tp_loss = !!val;

    case PACKET_AUXDATA:
        if (optlen < sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        po->auxdata = !!val;

    case PACKET_ORIGDEV:
        if (optlen < sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        po->origdev = !!val;

    case PACKET_VNET_HDR:
        if (sock->type != SOCK_RAW)
        if (optlen < sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            po->has_vnet_hdr = !!val;

    case PACKET_TIMESTAMP:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        po->tp_tstamp = val;

        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        return fanout_add(sk, val & 0xffff, val >> 16);

    case PACKET_FANOUT_DATA:
        return fanout_set_data(po, optval, optlen);

    case PACKET_TX_HAS_OFF:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))
        if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
            po->tp_tx_has_off = !!val;

    case PACKET_QDISC_BYPASS:
        if (optlen != sizeof(val))
        if (copy_from_user(&val, optval, sizeof(val)))

        po->xmit = val ? packet_direct_xmit : dev_queue_xmit;

        return -ENOPROTOOPT;
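
/* Usage sketch (userspace, illustrative only, not part of this file):
 * the PACKET_VERSION and PACKET_RX_RING cases above are normally used
 * together before mmap().  The sizes below are hypothetical:
 *
 *	int ver = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 1 << 16,	// must be page aligned
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 1 << 11,	// TPACKET_ALIGNMENT multiple
 *		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	// PACKET_VERSION and PACKET_RESERVE are rejected once a ring
 *	// exists, which is why they have to be set first.
 */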
static int packet_getsockopt(struct socket *sock, int level, int optname,
                 char __user *optval, int __user *optlen)
    int val, lv = sizeof(val);
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    union tpacket_stats_u st;
    struct tpacket_rollover_stats rstats;

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    if (get_user(len, optlen))

    case PACKET_STATISTICS:
        spin_lock_bh(&sk->sk_receive_queue.lock);
        memcpy(&st, &po->stats, sizeof(st));
        memset(&po->stats, 0, sizeof(po->stats));
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        if (po->tp_version == TPACKET_V3) {
            lv = sizeof(struct tpacket_stats_v3);
            st.stats3.tp_packets += st.stats3.tp_drops;
            lv = sizeof(struct tpacket_stats);
            st.stats1.tp_packets += st.stats1.tp_drops;

    case PACKET_AUXDATA:
    case PACKET_ORIGDEV:
    case PACKET_VNET_HDR:
        val = po->has_vnet_hdr;
    case PACKET_VERSION:
        val = po->tp_version;
        if (len > sizeof(int))
        if (len < sizeof(int))
        if (copy_from_user(&val, optval, len))
            val = sizeof(struct tpacket_hdr);
            val = sizeof(struct tpacket2_hdr);
            val = sizeof(struct tpacket3_hdr);
    case PACKET_RESERVE:
        val = po->tp_reserve;
    case PACKET_TIMESTAMP:
        val = po->tp_tstamp;
            ((u32)po->fanout->id |
            ((u32)po->fanout->type << 16) |
            ((u32)po->fanout->flags << 24)) :
    case PACKET_ROLLOVER_STATS:
        rstats.tp_all = atomic_long_read(&po->rollover->num);
        rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
        rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
        lv = sizeof(rstats);
    case PACKET_TX_HAS_OFF:
        val = po->tp_tx_has_off;
    case PACKET_QDISC_BYPASS:
        val = packet_use_direct_xmit(po);
        return -ENOPROTOOPT;

    if (put_user(len, optlen))
    if (copy_to_user(optval, data, len))
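
/* Usage sketch (userspace, illustrative only, not part of this file):
 * PACKET_STATISTICS above is clear-on-read, and the returned tp_packets
 * count already includes tp_drops:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		fprintf(stderr, "received %u, dropped %u\n",
 *			st.tp_packets, st.tp_drops);
 *	// With TPACKET_V3 the same call fills a struct tpacket_stats_v3.
 */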
#ifdef CONFIG_COMPAT
static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
    struct packet_sock *po = pkt_sk(sock->sk);

    if (level != SOL_PACKET)
        return -ENOPROTOOPT;

    if (optname == PACKET_FANOUT_DATA &&
        po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
        optval = (char __user *)get_compat_bpf_fprog(optval);
        optlen = sizeof(struct sock_fprog);

    return packet_setsockopt(sock, level, optname, optval, optlen);

static int packet_notifier(struct notifier_block *this,
               unsigned long msg, void *ptr)
    struct net_device *dev = netdev_notifier_info_to_dev(ptr);
    struct net *net = dev_net(dev);

    sk_for_each_rcu(sk, &net->packet.sklist) {
        struct packet_sock *po = pkt_sk(sk);

        case NETDEV_UNREGISTER:
            packet_dev_mclist_delete(dev, &po->mclist);
            if (dev->ifindex == po->ifindex) {
                spin_lock(&po->bind_lock);
                __unregister_prot_hook(sk, false);
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                    sk->sk_error_report(sk);
                if (msg == NETDEV_UNREGISTER) {
                    packet_cached_dev_reset(po);
                    if (po->prot_hook.dev)
                        dev_put(po->prot_hook.dev);
                    po->prot_hook.dev = NULL;
                spin_unlock(&po->bind_lock);
            if (dev->ifindex == po->ifindex) {
                spin_lock(&po->bind_lock);
                    register_prot_hook(sk);
                spin_unlock(&po->bind_lock);

static int packet_ioctl(struct socket *sock, unsigned int cmd,
            unsigned long arg)
    struct sock *sk = sock->sk;

        int amount = sk_wmem_alloc_get(sk);

        return put_user(amount, (int __user *)arg);
        struct sk_buff *skb;

        spin_lock_bh(&sk->sk_receive_queue.lock);
        skb = skb_peek(&sk->sk_receive_queue);
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        return put_user(amount, (int __user *)arg);
        return sock_get_timestamp(sk, (struct timeval __user *)arg);
        return sock_get_timestampns(sk, (struct timespec __user *)arg);
    case SIOCGIFBRDADDR:
    case SIOCSIFBRDADDR:
    case SIOCGIFNETMASK:
    case SIOCSIFNETMASK:
    case SIOCGIFDSTADDR:
    case SIOCSIFDSTADDR:
        return inet_dgram_ops.ioctl(sock, cmd, arg);

    return -ENOIOCTLCMD;

static __poll_t packet_poll(struct file *file, struct socket *sock,
                poll_table *wait)
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    __poll_t mask = datagram_poll(file, sock, wait);

    spin_lock_bh(&sk->sk_receive_queue.lock);
    if (po->rx_ring.pg_vec) {
        if (!packet_previous_rx_frame(po, &po->rx_ring,
                          TP_STATUS_KERNEL))
            mask |= EPOLLIN | EPOLLRDNORM;
    if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
    spin_unlock_bh(&sk->sk_receive_queue.lock);
    spin_lock_bh(&sk->sk_write_queue.lock);
    if (po->tx_ring.pg_vec) {
        if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
            mask |= EPOLLOUT | EPOLLWRNORM;
    spin_unlock_bh(&sk->sk_write_queue.lock);
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
    struct file *file = vma->vm_file;
    struct socket *sock = file->private_data;
    struct sock *sk = sock->sk;

        atomic_inc(&pkt_sk(sk)->mapped);

static void packet_mm_close(struct vm_area_struct *vma)
    struct file *file = vma->vm_file;
    struct socket *sock = file->private_data;
    struct sock *sk = sock->sk;

        atomic_dec(&pkt_sk(sk)->mapped);

static const struct vm_operations_struct packet_mmap_ops = {
    .open	=	packet_mm_open,
    .close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
            unsigned int len)
    for (i = 0; i < len; i++) {
        if (likely(pg_vec[i].buffer)) {
            if (is_vmalloc_addr(pg_vec[i].buffer))
                vfree(pg_vec[i].buffer);
                free_pages((unsigned long)pg_vec[i].buffer,
                       order);
            pg_vec[i].buffer = NULL;

static char *alloc_one_pg_vec_page(unsigned long order)
    gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
              __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

    buffer = (char *) __get_free_pages(gfp_flags, order);

    /* __get_free_pages failed, fall back to vmalloc */
    buffer = vzalloc(array_size((1 << order), PAGE_SIZE));

    /* vmalloc failed, lets dig into swap here */
    gfp_flags &= ~__GFP_NORETRY;
    buffer = (char *) __get_free_pages(gfp_flags, order);

    /* complete and utter failure */

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
    unsigned int block_nr = req->tp_block_nr;

    pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
    if (unlikely(!pg_vec))

    for (i = 0; i < block_nr; i++) {
        pg_vec[i].buffer = alloc_one_pg_vec_page(order);
        if (unlikely(!pg_vec[i].buffer))
            goto out_free_pgvec;

    free_pg_vec(pg_vec, order, block_nr);
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        int closing, int tx_ring)
    struct pgv *pg_vec = NULL;
    struct packet_sock *po = pkt_sk(sk);
    unsigned long *rx_owner_map = NULL;
    int was_running, order = 0;
    struct packet_ring_buffer *rb;
    struct sk_buff_head *rb_queue;
    /* Added to avoid minimal code churn */
    struct tpacket_req *req = &req_u->req;

    rb = tx_ring ? &po->tx_ring : &po->rx_ring;
    rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

    if (atomic_read(&po->mapped))
    if (packet_read_pending(rb))

    if (req->tp_block_nr) {
        unsigned int min_frame_size;

        /* Sanity tests and some calculations */
        if (unlikely(rb->pg_vec))

        switch (po->tp_version) {
            po->tp_hdrlen = TPACKET_HDRLEN;
            po->tp_hdrlen = TPACKET2_HDRLEN;
            po->tp_hdrlen = TPACKET3_HDRLEN;

        if (unlikely((int)req->tp_block_size <= 0))
        if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
        min_frame_size = po->tp_hdrlen + po->tp_reserve;
        if (po->tp_version >= TPACKET_V3 &&
            req->tp_block_size <
            BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
        if (unlikely(req->tp_frame_size < min_frame_size))
        if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))

        rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
        if (unlikely(rb->frames_per_block == 0))
        if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
        if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                    req->tp_frame_nr))

        order = get_order(req->tp_block_size);
        pg_vec = alloc_pg_vec(req, order);
        if (unlikely(!pg_vec))
        switch (po->tp_version) {
            /* Block transmit is not supported yet */
                init_prb_bdqc(po, rb, pg_vec, req_u);
                struct tpacket_req3 *req3 = &req_u->req3;

                if (req3->tp_retire_blk_tov ||
                    req3->tp_sizeof_priv ||
                    req3->tp_feature_req_word) {
                    goto out_free_pg_vec;
            rx_owner_map = bitmap_alloc(req->tp_frame_nr,
                GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
                goto out_free_pg_vec;
        if (unlikely(req->tp_frame_nr))

    /* Detach socket from network */
    spin_lock(&po->bind_lock);
    was_running = po->running;
        __unregister_prot_hook(sk, false);
    spin_unlock(&po->bind_lock);

    mutex_lock(&po->pg_vec_lock);
    if (closing || atomic_read(&po->mapped) == 0) {
        spin_lock_bh(&rb_queue->lock);
        swap(rb->pg_vec, pg_vec);
        if (po->tp_version <= TPACKET_V2)
            swap(rb->rx_owner_map, rx_owner_map);
        rb->frame_max = (req->tp_frame_nr - 1);
        rb->frame_size = req->tp_frame_size;
        spin_unlock_bh(&rb_queue->lock);

        swap(rb->pg_vec_order, order);
        swap(rb->pg_vec_len, req->tp_block_nr);

        rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
        po->prot_hook.func = (po->rx_ring.pg_vec) ?
                        tpacket_rcv : packet_rcv;
        skb_queue_purge(rb_queue);
        if (atomic_read(&po->mapped))
            pr_err("packet_mmap: vma is busy: %d\n",
                   atomic_read(&po->mapped));
    mutex_unlock(&po->pg_vec_lock);

    spin_lock(&po->bind_lock);
        register_prot_hook(sk);
    spin_unlock(&po->bind_lock);
    if (pg_vec && (po->tp_version > TPACKET_V2)) {
        /* Because we don't support block-based V3 on tx-ring */
            prb_shutdown_retire_blk_timer(po, rb_queue);

    bitmap_free(rx_owner_map);
    free_pg_vec(pg_vec, order, req->tp_block_nr);
static int packet_mmap(struct file *file, struct socket *sock,
        struct vm_area_struct *vma)
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    unsigned long size, expected_size;
    struct packet_ring_buffer *rb;
    unsigned long start;

    mutex_lock(&po->pg_vec_lock);

    for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
            expected_size += rb->pg_vec_len
                        * rb->pg_vec_pages
                        * PAGE_SIZE;

    if (expected_size == 0)

    size = vma->vm_end - vma->vm_start;
    if (size != expected_size)

    start = vma->vm_start;
    for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
        if (rb->pg_vec == NULL)

        for (i = 0; i < rb->pg_vec_len; i++) {
            void *kaddr = rb->pg_vec[i].buffer;

            for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
                page = pgv_to_page(kaddr);
                err = vm_insert_page(vma, start, page);

    atomic_inc(&po->mapped);
    vma->vm_ops = &packet_mmap_ops;

    mutex_unlock(&po->pg_vec_lock);
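
/* Usage sketch (userspace, illustrative only, not part of this file):
 * packet_mmap() maps the rx and tx rings back to back in one contiguous
 * mapping whose length must match the configured blocks exactly.  A minimal
 * TPACKET_V2 receive loop, reusing the hypothetical "req" from the ring
 * setup sketch earlier:
 *
 *	size_t ring_len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	uint8_t *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED, fd, 0);
 *
 *	for (unsigned int i = 0; i < req.tp_frame_nr; i++) {
 *		struct tpacket2_hdr *hdr =
 *			(void *)(ring + (size_t)i * req.tp_frame_size);
 *		if (!(hdr->tp_status & TP_STATUS_USER))
 *			continue;	// slot still owned by the kernel
 *		uint8_t *frame = (uint8_t *)hdr + hdr->tp_mac;
 *		// ... process hdr->tp_snaplen bytes at frame ...
 *		hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *	}
 */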
static const struct proto_ops packet_ops_spkt = {
    .family =	PF_PACKET,
    .owner =	THIS_MODULE,
    .release =	packet_release,
    .bind =		packet_bind_spkt,
    .connect =	sock_no_connect,
    .socketpair =	sock_no_socketpair,
    .accept =	sock_no_accept,
    .getname =	packet_getname_spkt,
    .poll =		datagram_poll,
    .ioctl =	packet_ioctl,
    .listen =	sock_no_listen,
    .shutdown =	sock_no_shutdown,
    .setsockopt =	sock_no_setsockopt,
    .getsockopt =	sock_no_getsockopt,
    .sendmsg =	packet_sendmsg_spkt,
    .recvmsg =	packet_recvmsg,
    .mmap =		sock_no_mmap,
    .sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
    .family =	PF_PACKET,
    .owner =	THIS_MODULE,
    .release =	packet_release,
    .bind =		packet_bind,
    .connect =	sock_no_connect,
    .socketpair =	sock_no_socketpair,
    .accept =	sock_no_accept,
    .getname =	packet_getname,
    .poll =		packet_poll,
    .ioctl =	packet_ioctl,
    .listen =	sock_no_listen,
    .shutdown =	sock_no_shutdown,
    .setsockopt =	packet_setsockopt,
    .getsockopt =	packet_getsockopt,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_packet_setsockopt,
#endif
    .sendmsg =	packet_sendmsg,
    .recvmsg =	packet_recvmsg,
    .mmap =		packet_mmap,
    .sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
    .family =	PF_PACKET,
    .create =	packet_create,
    .owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
    .notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
    struct net *net = seq_file_net(seq);

    return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    struct net *net = seq_file_net(seq);
    return seq_hlist_next_rcu(v, &net->packet.sklist, pos);

static void packet_seq_stop(struct seq_file *seq, void *v)

static int packet_seq_show(struct seq_file *seq, void *v)
    if (v == SEQ_START_TOKEN)
        seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
        struct sock *s = sk_entry(v);
        const struct packet_sock *po = pkt_sk(s);

        seq_printf(seq,
               "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
               refcount_read(&s->sk_refcnt),
               atomic_read(&s->sk_rmem_alloc),
               from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),

static const struct seq_operations packet_seq_ops = {
    .start	= packet_seq_start,
    .next	= packet_seq_next,
    .stop	= packet_seq_stop,
    .show	= packet_seq_show,
};
#endif

static int __net_init packet_net_init(struct net *net)
    mutex_init(&net->packet.sklist_lock);
    INIT_HLIST_HEAD(&net->packet.sklist);

    if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
                 sizeof(struct seq_net_private)))

static void __net_exit packet_net_exit(struct net *net)
    remove_proc_entry("packet", net->proc_net);
    WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));

static struct pernet_operations packet_net_ops = {
    .init = packet_net_init,
    .exit = packet_net_exit,
};

static void __exit packet_exit(void)
    unregister_netdevice_notifier(&packet_netdev_notifier);
    unregister_pernet_subsys(&packet_net_ops);
    sock_unregister(PF_PACKET);
    proto_unregister(&packet_proto);

static int __init packet_init(void)
    rc = proto_register(&packet_proto, 0);
    rc = sock_register(&packet_family_ops);
    rc = register_pernet_subsys(&packet_net_ops);
    rc = register_netdevice_notifier(&packet_netdev_notifier);

    unregister_pernet_subsys(&packet_net_ops);
    sock_unregister(PF_PACKET);
    proto_unregister(&packet_proto);

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);