/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/reciprocal_div.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Resume
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.

On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
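/*
 * Illustrative user-space sketch of the behaviour described above (not part
 * of this file; "eth0" is an assumed interface name and error handling is
 * omitted).  With SOCK_RAW the received buffer starts at the link-layer
 * header; with SOCK_DGRAM the kernel has already pulled it:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *
 *	int open_rx(void)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		struct sockaddr_ll sll;
 *
 *		memset(&sll, 0, sizeof(sll));
 *		sll.sll_family   = AF_PACKET;
 *		sll.sll_protocol = htons(ETH_P_ALL);
 *		sll.sll_ifindex  = if_nametoindex("eth0");
 *		bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *		return fd;	// recv() now yields full frames incl. ll header
 *	}
 */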
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
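/*
 * Worked example of the layout implied by the macros above (a sketch,
 * assuming sizeof(struct tpacket_block_desc) == 48 and a 16-byte private
 * area requested by the user):
 *
 *	BLK_HDR_LEN       = ALIGN(48, 8)      = 48
 *	BLK_PLUS_PRIV(16) = 48 + ALIGN(16, 8) = 64
 *
 * so the first packet of each TPACKET_V3 block would start 64 bytes into
 * the block, right after the block descriptor and the sticky private area.
 */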
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
370 static __u32
tpacket_get_timestamp(struct sk_buff
*skb
, struct timespec
*ts
,
373 struct skb_shared_hwtstamps
*shhwtstamps
= skb_hwtstamps(skb
);
376 if ((flags
& SOF_TIMESTAMPING_SYS_HARDWARE
) &&
377 ktime_to_timespec_cond(shhwtstamps
->syststamp
, ts
))
378 return TP_STATUS_TS_SYS_HARDWARE
;
379 if ((flags
& SOF_TIMESTAMPING_RAW_HARDWARE
) &&
380 ktime_to_timespec_cond(shhwtstamps
->hwtstamp
, ts
))
381 return TP_STATUS_TS_RAW_HARDWARE
;
384 if (ktime_to_timespec_cond(skb
->tstamp
, ts
))
385 return TP_STATUS_TS_SOFTWARE
;
390 static __u32
__packet_set_timestamp(struct packet_sock
*po
, void *frame
,
393 union tpacket_uhdr h
;
397 if (!(ts_status
= tpacket_get_timestamp(skb
, &ts
, po
->tp_tstamp
)))
401 switch (po
->tp_version
) {
403 h
.h1
->tp_sec
= ts
.tv_sec
;
404 h
.h1
->tp_usec
= ts
.tv_nsec
/ NSEC_PER_USEC
;
407 h
.h2
->tp_sec
= ts
.tv_sec
;
408 h
.h2
->tp_nsec
= ts
.tv_nsec
;
412 WARN(1, "TPACKET version not supported.\n");
416 /* one flush is safe, as both fields always lie on the same cacheline */
417 flush_dcache_page(pgv_to_page(&h
.h1
->tp_sec
));
423 static void *packet_lookup_frame(struct packet_sock
*po
,
424 struct packet_ring_buffer
*rb
,
425 unsigned int position
,
428 unsigned int pg_vec_pos
, frame_offset
;
429 union tpacket_uhdr h
;
431 pg_vec_pos
= position
/ rb
->frames_per_block
;
432 frame_offset
= position
% rb
->frames_per_block
;
434 h
.raw
= rb
->pg_vec
[pg_vec_pos
].buffer
+
435 (frame_offset
* rb
->frame_size
);
437 if (status
!= __packet_get_status(po
, h
.raw
))
443 static void *packet_current_frame(struct packet_sock
*po
,
444 struct packet_ring_buffer
*rb
,
447 return packet_lookup_frame(po
, rb
, rb
->head
, status
);
450 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core
*pkc
)
452 del_timer_sync(&pkc
->retire_blk_timer
);
455 static void prb_shutdown_retire_blk_timer(struct packet_sock
*po
,
457 struct sk_buff_head
*rb_queue
)
459 struct tpacket_kbdq_core
*pkc
;
461 pkc
= tx_ring
? &po
->tx_ring
.prb_bdqc
: &po
->rx_ring
.prb_bdqc
;
463 spin_lock_bh(&rb_queue
->lock
);
464 pkc
->delete_blk_timer
= 1;
465 spin_unlock_bh(&rb_queue
->lock
);
467 prb_del_retire_blk_timer(pkc
);
470 static void prb_init_blk_timer(struct packet_sock
*po
,
471 struct tpacket_kbdq_core
*pkc
,
472 void (*func
) (unsigned long))
474 init_timer(&pkc
->retire_blk_timer
);
475 pkc
->retire_blk_timer
.data
= (long)po
;
476 pkc
->retire_blk_timer
.function
= func
;
477 pkc
->retire_blk_timer
.expires
= jiffies
;
480 static void prb_setup_retire_blk_timer(struct packet_sock
*po
, int tx_ring
)
482 struct tpacket_kbdq_core
*pkc
;
487 pkc
= tx_ring
? &po
->tx_ring
.prb_bdqc
: &po
->rx_ring
.prb_bdqc
;
488 prb_init_blk_timer(po
, pkc
, prb_retire_rx_blk_timer_expired
);
491 static int prb_calc_retire_blk_tmo(struct packet_sock
*po
,
492 int blk_size_in_bytes
)
494 struct net_device
*dev
;
495 unsigned int mbits
= 0, msec
= 0, div
= 0, tmo
= 0;
496 struct ethtool_cmd ecmd
;
501 dev
= __dev_get_by_index(sock_net(&po
->sk
), po
->ifindex
);
502 if (unlikely(!dev
)) {
504 return DEFAULT_PRB_RETIRE_TOV
;
506 err
= __ethtool_get_settings(dev
, &ecmd
);
507 speed
= ethtool_cmd_speed(&ecmd
);
511 * If the link speed is so slow you don't really
512 * need to worry about perf anyways
514 if (speed
< SPEED_1000
|| speed
== SPEED_UNKNOWN
) {
515 return DEFAULT_PRB_RETIRE_TOV
;
522 mbits
= (blk_size_in_bytes
* 8) / (1024 * 1024);
534 static void prb_init_ft_ops(struct tpacket_kbdq_core
*p1
,
535 union tpacket_req_u
*req_u
)
537 p1
->feature_req_word
= req_u
->req3
.tp_feature_req_word
;
540 static void init_prb_bdqc(struct packet_sock
*po
,
541 struct packet_ring_buffer
*rb
,
543 union tpacket_req_u
*req_u
, int tx_ring
)
545 struct tpacket_kbdq_core
*p1
= &rb
->prb_bdqc
;
546 struct tpacket_block_desc
*pbd
;
548 memset(p1
, 0x0, sizeof(*p1
));
550 p1
->knxt_seq_num
= 1;
552 pbd
= (struct tpacket_block_desc
*)pg_vec
[0].buffer
;
553 p1
->pkblk_start
= pg_vec
[0].buffer
;
554 p1
->kblk_size
= req_u
->req3
.tp_block_size
;
555 p1
->knum_blocks
= req_u
->req3
.tp_block_nr
;
556 p1
->hdrlen
= po
->tp_hdrlen
;
557 p1
->version
= po
->tp_version
;
558 p1
->last_kactive_blk_num
= 0;
559 po
->stats
.stats3
.tp_freeze_q_cnt
= 0;
560 if (req_u
->req3
.tp_retire_blk_tov
)
561 p1
->retire_blk_tov
= req_u
->req3
.tp_retire_blk_tov
;
563 p1
->retire_blk_tov
= prb_calc_retire_blk_tmo(po
,
564 req_u
->req3
.tp_block_size
);
565 p1
->tov_in_jiffies
= msecs_to_jiffies(p1
->retire_blk_tov
);
566 p1
->blk_sizeof_priv
= req_u
->req3
.tp_sizeof_priv
;
568 prb_init_ft_ops(p1
, req_u
);
569 prb_setup_retire_blk_timer(po
, tx_ring
);
570 prb_open_block(p1
, pbd
);
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/* 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1 MB block-size on a 1 Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10 ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every 8 msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
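/*
 * Back-of-the-envelope check of the ~8 ms figure quoted above (a sketch,
 * not taken from prb_calc_retire_blk_tmo() itself):
 *
 *	block size = 1 MB  = 2^20 * 8  ~= 8.4e6 bits
 *	line rate  = 1 Gbps            =  1e9   bits/sec
 *	fill time  = 8.4e6 / 1e9       ~= 8.4 ms
 *
 * so a retire timeout in the ~8 ms range keeps the timer from firing
 * while a block is still being filled at line rate.
 */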
606 static void prb_retire_rx_blk_timer_expired(unsigned long data
)
608 struct packet_sock
*po
= (struct packet_sock
*)data
;
609 struct tpacket_kbdq_core
*pkc
= &po
->rx_ring
.prb_bdqc
;
611 struct tpacket_block_desc
*pbd
;
613 spin_lock(&po
->sk
.sk_receive_queue
.lock
);
615 frozen
= prb_queue_frozen(pkc
);
616 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
618 if (unlikely(pkc
->delete_blk_timer
))
621 /* We only need to plug the race when the block is partially filled.
623 * lock(); increment BLOCK_NUM_PKTS; unlock()
624 * copy_bits() is in progress ...
625 * timer fires on other cpu:
626 * we can't retire the current block because copy_bits
630 if (BLOCK_NUM_PKTS(pbd
)) {
631 while (atomic_read(&pkc
->blk_fill_in_prog
)) {
632 /* Waiting for skb_copy_bits to finish... */
637 if (pkc
->last_kactive_blk_num
== pkc
->kactive_blk_num
) {
639 prb_retire_current_block(pkc
, po
, TP_STATUS_BLK_TMO
);
640 if (!prb_dispatch_next_block(pkc
, po
))
645 /* Case 1. Queue was frozen because user-space was
648 if (prb_curr_blk_in_use(pkc
, pbd
)) {
650 * Ok, user-space is still behind.
651 * So just refresh the timer.
655 /* Case 2. queue was frozen,user-space caught up,
656 * now the link went idle && the timer fired.
657 * We don't have a block to close.So we open this
658 * block and restart the timer.
659 * opening a block thaws the queue,restarts timer
660 * Thawing/timer-refresh is a side effect.
662 prb_open_block(pkc
, pbd
);
669 _prb_refresh_rx_retire_blk_timer(pkc
);
672 spin_unlock(&po
->sk
.sk_receive_queue
.lock
);
675 static void prb_flush_block(struct tpacket_kbdq_core
*pkc1
,
676 struct tpacket_block_desc
*pbd1
, __u32 status
)
678 /* Flush everything minus the block header */
680 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
685 /* Skip the block header(we know header WILL fit in 4K) */
688 end
= (u8
*)PAGE_ALIGN((unsigned long)pkc1
->pkblk_end
);
689 for (; start
< end
; start
+= PAGE_SIZE
)
690 flush_dcache_page(pgv_to_page(start
));
695 /* Now update the block status. */
697 BLOCK_STATUS(pbd1
) = status
;
699 /* Flush the block header */
701 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
703 flush_dcache_page(pgv_to_page(start
));
713 * 2) Increment active_blk_num
715 * Note:We DONT refresh the timer on purpose.
716 * Because almost always the next block will be opened.
718 static void prb_close_block(struct tpacket_kbdq_core
*pkc1
,
719 struct tpacket_block_desc
*pbd1
,
720 struct packet_sock
*po
, unsigned int stat
)
722 __u32 status
= TP_STATUS_USER
| stat
;
724 struct tpacket3_hdr
*last_pkt
;
725 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
727 if (po
->stats
.stats3
.tp_drops
)
728 status
|= TP_STATUS_LOSING
;
730 last_pkt
= (struct tpacket3_hdr
*)pkc1
->prev
;
731 last_pkt
->tp_next_offset
= 0;
733 /* Get the ts of the last pkt */
734 if (BLOCK_NUM_PKTS(pbd1
)) {
735 h1
->ts_last_pkt
.ts_sec
= last_pkt
->tp_sec
;
736 h1
->ts_last_pkt
.ts_nsec
= last_pkt
->tp_nsec
;
738 /* Ok, we tmo'd - so get the current time */
741 h1
->ts_last_pkt
.ts_sec
= ts
.tv_sec
;
742 h1
->ts_last_pkt
.ts_nsec
= ts
.tv_nsec
;
747 /* Flush the block */
748 prb_flush_block(pkc1
, pbd1
, status
);
750 pkc1
->kactive_blk_num
= GET_NEXT_PRB_BLK_NUM(pkc1
);
753 static void prb_thaw_queue(struct tpacket_kbdq_core
*pkc
)
755 pkc
->reset_pending_on_curr_blk
= 0;
759 * Side effect of opening a block:
761 * 1) prb_queue is thawed.
762 * 2) retire_blk_timer is refreshed.
765 static void prb_open_block(struct tpacket_kbdq_core
*pkc1
,
766 struct tpacket_block_desc
*pbd1
)
769 struct tpacket_hdr_v1
*h1
= &pbd1
->hdr
.bh1
;
773 /* We could have just memset this but we will lose the
774 * flexibility of making the priv area sticky
777 BLOCK_SNUM(pbd1
) = pkc1
->knxt_seq_num
++;
778 BLOCK_NUM_PKTS(pbd1
) = 0;
779 BLOCK_LEN(pbd1
) = BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
783 h1
->ts_first_pkt
.ts_sec
= ts
.tv_sec
;
784 h1
->ts_first_pkt
.ts_nsec
= ts
.tv_nsec
;
786 pkc1
->pkblk_start
= (char *)pbd1
;
787 pkc1
->nxt_offset
= pkc1
->pkblk_start
+ BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
789 BLOCK_O2FP(pbd1
) = (__u32
)BLK_PLUS_PRIV(pkc1
->blk_sizeof_priv
);
790 BLOCK_O2PRIV(pbd1
) = BLK_HDR_LEN
;
792 pbd1
->version
= pkc1
->version
;
793 pkc1
->prev
= pkc1
->nxt_offset
;
794 pkc1
->pkblk_end
= pkc1
->pkblk_start
+ pkc1
->kblk_size
;
796 prb_thaw_queue(pkc1
);
797 _prb_refresh_rx_retire_blk_timer(pkc1
);
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens the Rx ring.
 * 3) Some time past 't0', the kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *      __packet_lookup_frame_in_block
 *        prb_retire_current_block()
 *        prb_dispatch_next_block()
 *          |->(BLOCK_STATUS == USER) evaluates to true
 * 5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
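/*
 * Illustrative user-space loop for the scenario above (a hedged sketch;
 * "ring", "req", "pfd", "walk_block" and "i" are hypothetical names and
 * error handling is omitted).  Handing a block back by writing
 * TP_STATUS_KERNEL into block_status is what lets a frozen queue thaw on
 * the next prb_open_block():
 *
 *	char *ring;				// rx ring mmap()ed earlier
 *	struct tpacket_block_desc *bd;
 *
 *	bd = (struct tpacket_block_desc *)(ring + i * req.tp_block_size);
 *	while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	walk_block(bd);				// consume num_pkts packets
 *	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// release the block
 *	i = (i + 1) % req.tp_block_nr;
 */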
825 static void prb_freeze_queue(struct tpacket_kbdq_core
*pkc
,
826 struct packet_sock
*po
)
828 pkc
->reset_pending_on_curr_blk
= 1;
829 po
->stats
.stats3
.tp_freeze_q_cnt
++;
832 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
835 * If the next block is free then we will dispatch it
836 * and return a good offset.
837 * Else, we will freeze the queue.
838 * So, caller must check the return value.
840 static void *prb_dispatch_next_block(struct tpacket_kbdq_core
*pkc
,
841 struct packet_sock
*po
)
843 struct tpacket_block_desc
*pbd
;
847 /* 1. Get current block num */
848 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
850 /* 2. If this block is currently in_use then freeze the queue */
851 if (TP_STATUS_USER
& BLOCK_STATUS(pbd
)) {
852 prb_freeze_queue(pkc
, po
);
858 * open this block and return the offset where the first packet
859 * needs to get stored.
861 prb_open_block(pkc
, pbd
);
862 return (void *)pkc
->nxt_offset
;
865 static void prb_retire_current_block(struct tpacket_kbdq_core
*pkc
,
866 struct packet_sock
*po
, unsigned int status
)
868 struct tpacket_block_desc
*pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
870 /* retire/close the current block */
871 if (likely(TP_STATUS_KERNEL
== BLOCK_STATUS(pbd
))) {
873 * Plug the case where copy_bits() is in progress on
874 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
875 * have space to copy the pkt in the current block and
876 * called prb_retire_current_block()
878 * We don't need to worry about the TMO case because
879 * the timer-handler already handled this case.
881 if (!(status
& TP_STATUS_BLK_TMO
)) {
882 while (atomic_read(&pkc
->blk_fill_in_prog
)) {
883 /* Waiting for skb_copy_bits to finish... */
887 prb_close_block(pkc
, pbd
, po
, status
);
892 static int prb_curr_blk_in_use(struct tpacket_kbdq_core
*pkc
,
893 struct tpacket_block_desc
*pbd
)
895 return TP_STATUS_USER
& BLOCK_STATUS(pbd
);
898 static int prb_queue_frozen(struct tpacket_kbdq_core
*pkc
)
900 return pkc
->reset_pending_on_curr_blk
;
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (vlan_tx_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
		ppd->tp_status = TP_STATUS_VLAN_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
961 /* Assumes caller has the sk->rx_queue.lock */
962 static void *__packet_lookup_frame_in_block(struct packet_sock
*po
,
968 struct tpacket_kbdq_core
*pkc
;
969 struct tpacket_block_desc
*pbd
;
972 pkc
= GET_PBDQC_FROM_RB(&po
->rx_ring
);
973 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
975 /* Queue is frozen when user space is lagging behind */
976 if (prb_queue_frozen(pkc
)) {
978 * Check if that last block which caused the queue to freeze,
979 * is still in_use by user-space.
981 if (prb_curr_blk_in_use(pkc
, pbd
)) {
982 /* Can't record this packet */
986 * Ok, the block was released by user-space.
987 * Now let's open that block.
988 * opening a block also thaws the queue.
989 * Thawing is a side effect.
991 prb_open_block(pkc
, pbd
);
996 curr
= pkc
->nxt_offset
;
998 end
= (char *)pbd
+ pkc
->kblk_size
;
1000 /* first try the current block */
1001 if (curr
+TOTAL_PKT_LEN_INCL_ALIGN(len
) < end
) {
1002 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1003 return (void *)curr
;
1006 /* Ok, close the current block */
1007 prb_retire_current_block(pkc
, po
, 0);
1009 /* Now, try to dispatch the next block */
1010 curr
= (char *)prb_dispatch_next_block(pkc
, po
);
1012 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
1013 prb_fill_curr_block(curr
, pkc
, pbd
, len
);
1014 return (void *)curr
;
1018 * No free blocks are available.user_space hasn't caught up yet.
1019 * Queue was just frozen and now this packet will get dropped.
1024 static void *packet_current_rx_frame(struct packet_sock
*po
,
1025 struct sk_buff
*skb
,
1026 int status
, unsigned int len
)
1029 switch (po
->tp_version
) {
1032 curr
= packet_lookup_frame(po
, &po
->rx_ring
,
1033 po
->rx_ring
.head
, status
);
1036 return __packet_lookup_frame_in_block(po
, skb
, status
, len
);
1038 WARN(1, "TPACKET version not supported\n");
1044 static void *prb_lookup_block(struct packet_sock
*po
,
1045 struct packet_ring_buffer
*rb
,
1049 struct tpacket_kbdq_core
*pkc
= GET_PBDQC_FROM_RB(rb
);
1050 struct tpacket_block_desc
*pbd
= GET_PBLOCK_DESC(pkc
, idx
);
1052 if (status
!= BLOCK_STATUS(pbd
))
1057 static int prb_previous_blk_num(struct packet_ring_buffer
*rb
)
1060 if (rb
->prb_bdqc
.kactive_blk_num
)
1061 prev
= rb
->prb_bdqc
.kactive_blk_num
-1;
1063 prev
= rb
->prb_bdqc
.knum_blocks
-1;
1067 /* Assumes caller has held the rx_queue.lock */
1068 static void *__prb_previous_block(struct packet_sock
*po
,
1069 struct packet_ring_buffer
*rb
,
1072 unsigned int previous
= prb_previous_blk_num(rb
);
1073 return prb_lookup_block(po
, rb
, previous
, status
);
1076 static void *packet_previous_rx_frame(struct packet_sock
*po
,
1077 struct packet_ring_buffer
*rb
,
1080 if (po
->tp_version
<= TPACKET_V2
)
1081 return packet_previous_frame(po
, rb
, status
);
1083 return __prb_previous_block(po
, rb
, status
);
1086 static void packet_increment_rx_head(struct packet_sock
*po
,
1087 struct packet_ring_buffer
*rb
)
1089 switch (po
->tp_version
) {
1092 return packet_increment_head(rb
);
1095 WARN(1, "TPACKET version not supported.\n");
1101 static void *packet_previous_frame(struct packet_sock
*po
,
1102 struct packet_ring_buffer
*rb
,
1105 unsigned int previous
= rb
->head
? rb
->head
- 1 : rb
->frame_max
;
1106 return packet_lookup_frame(po
, rb
, previous
, status
);
1109 static void packet_increment_head(struct packet_ring_buffer
*buff
)
1111 buff
->head
= buff
->head
!= buff
->frame_max
? buff
->head
+1 : 0;
1114 static bool packet_rcv_has_room(struct packet_sock
*po
, struct sk_buff
*skb
)
1116 struct sock
*sk
= &po
->sk
;
1119 if (po
->prot_hook
.func
!= tpacket_rcv
)
1120 return (atomic_read(&sk
->sk_rmem_alloc
) + skb
->truesize
)
1123 spin_lock(&sk
->sk_receive_queue
.lock
);
1124 if (po
->tp_version
== TPACKET_V3
)
1125 has_room
= prb_lookup_block(po
, &po
->rx_ring
,
1126 po
->rx_ring
.prb_bdqc
.kactive_blk_num
,
1129 has_room
= packet_lookup_frame(po
, &po
->rx_ring
,
1132 spin_unlock(&sk
->sk_receive_queue
.lock
);
1137 static void packet_sock_destruct(struct sock
*sk
)
1139 skb_queue_purge(&sk
->sk_error_queue
);
1141 WARN_ON(atomic_read(&sk
->sk_rmem_alloc
));
1142 WARN_ON(atomic_read(&sk
->sk_wmem_alloc
));
1144 if (!sock_flag(sk
, SOCK_DEAD
)) {
1145 pr_err("Attempt to release alive packet socket: %p\n", sk
);
1149 sk_refcnt_debug_dec(sk
);
1152 static int fanout_rr_next(struct packet_fanout
*f
, unsigned int num
)
1154 int x
= atomic_read(&f
->rr_cur
) + 1;
1162 static unsigned int fanout_demux_hash(struct packet_fanout
*f
,
1163 struct sk_buff
*skb
,
1166 return reciprocal_divide(skb
->rxhash
, num
);
1169 static unsigned int fanout_demux_lb(struct packet_fanout
*f
,
1170 struct sk_buff
*skb
,
1175 cur
= atomic_read(&f
->rr_cur
);
1176 while ((old
= atomic_cmpxchg(&f
->rr_cur
, cur
,
1177 fanout_rr_next(f
, num
))) != cur
)
1182 static unsigned int fanout_demux_cpu(struct packet_fanout
*f
,
1183 struct sk_buff
*skb
,
1186 return smp_processor_id() % num
;
1189 static unsigned int fanout_demux_rnd(struct packet_fanout
*f
,
1190 struct sk_buff
*skb
,
1193 return reciprocal_divide(prandom_u32(), num
);
1196 static unsigned int fanout_demux_rollover(struct packet_fanout
*f
,
1197 struct sk_buff
*skb
,
1198 unsigned int idx
, unsigned int skip
,
1203 i
= j
= min_t(int, f
->next
[idx
], num
- 1);
1205 if (i
!= skip
&& packet_rcv_has_room(pkt_sk(f
->arr
[i
]), skb
)) {
1217 static bool fanout_has_flag(struct packet_fanout
*f
, u16 flag
)
1219 return f
->flags
& (flag
>> 8);
1222 static int packet_rcv_fanout(struct sk_buff
*skb
, struct net_device
*dev
,
1223 struct packet_type
*pt
, struct net_device
*orig_dev
)
1225 struct packet_fanout
*f
= pt
->af_packet_priv
;
1226 unsigned int num
= f
->num_members
;
1227 struct packet_sock
*po
;
1230 if (!net_eq(dev_net(dev
), read_pnet(&f
->net
)) ||
1237 case PACKET_FANOUT_HASH
:
1239 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_DEFRAG
)) {
1240 skb
= ip_check_defrag(skb
, IP_DEFRAG_AF_PACKET
);
1244 skb_get_rxhash(skb
);
1245 idx
= fanout_demux_hash(f
, skb
, num
);
1247 case PACKET_FANOUT_LB
:
1248 idx
= fanout_demux_lb(f
, skb
, num
);
1250 case PACKET_FANOUT_CPU
:
1251 idx
= fanout_demux_cpu(f
, skb
, num
);
1253 case PACKET_FANOUT_RND
:
1254 idx
= fanout_demux_rnd(f
, skb
, num
);
1256 case PACKET_FANOUT_ROLLOVER
:
1257 idx
= fanout_demux_rollover(f
, skb
, 0, (unsigned int) -1, num
);
1261 po
= pkt_sk(f
->arr
[idx
]);
1262 if (fanout_has_flag(f
, PACKET_FANOUT_FLAG_ROLLOVER
) &&
1263 unlikely(!packet_rcv_has_room(po
, skb
))) {
1264 idx
= fanout_demux_rollover(f
, skb
, idx
, idx
, num
);
1265 po
= pkt_sk(f
->arr
[idx
]);
1268 return po
->prot_hook
.func(skb
, dev
, &po
->prot_hook
, orig_dev
);
1271 DEFINE_MUTEX(fanout_mutex
);
1272 EXPORT_SYMBOL_GPL(fanout_mutex
);
1273 static LIST_HEAD(fanout_list
);
1275 static void __fanout_link(struct sock
*sk
, struct packet_sock
*po
)
1277 struct packet_fanout
*f
= po
->fanout
;
1279 spin_lock(&f
->lock
);
1280 f
->arr
[f
->num_members
] = sk
;
1283 spin_unlock(&f
->lock
);
1286 static void __fanout_unlink(struct sock
*sk
, struct packet_sock
*po
)
1288 struct packet_fanout
*f
= po
->fanout
;
1291 spin_lock(&f
->lock
);
1292 for (i
= 0; i
< f
->num_members
; i
++) {
1293 if (f
->arr
[i
] == sk
)
1296 BUG_ON(i
>= f
->num_members
);
1297 f
->arr
[i
] = f
->arr
[f
->num_members
- 1];
1299 spin_unlock(&f
->lock
);
1302 static bool match_fanout_group(struct packet_type
*ptype
, struct sock
* sk
)
1304 if (ptype
->af_packet_priv
== (void*)((struct packet_sock
*)sk
)->fanout
)
1310 static int fanout_add(struct sock
*sk
, u16 id
, u16 type_flags
)
1312 struct packet_sock
*po
= pkt_sk(sk
);
1313 struct packet_fanout
*f
, *match
;
1314 u8 type
= type_flags
& 0xff;
1315 u8 flags
= type_flags
>> 8;
1319 case PACKET_FANOUT_ROLLOVER
:
1320 if (type_flags
& PACKET_FANOUT_FLAG_ROLLOVER
)
1322 case PACKET_FANOUT_HASH
:
1323 case PACKET_FANOUT_LB
:
1324 case PACKET_FANOUT_CPU
:
1325 case PACKET_FANOUT_RND
:
1337 mutex_lock(&fanout_mutex
);
1339 list_for_each_entry(f
, &fanout_list
, list
) {
1341 read_pnet(&f
->net
) == sock_net(sk
)) {
1347 if (match
&& match
->flags
!= flags
)
1351 match
= kzalloc(sizeof(*match
), GFP_KERNEL
);
1354 write_pnet(&match
->net
, sock_net(sk
));
1357 match
->flags
= flags
;
1358 atomic_set(&match
->rr_cur
, 0);
1359 INIT_LIST_HEAD(&match
->list
);
1360 spin_lock_init(&match
->lock
);
1361 atomic_set(&match
->sk_ref
, 0);
1362 match
->prot_hook
.type
= po
->prot_hook
.type
;
1363 match
->prot_hook
.dev
= po
->prot_hook
.dev
;
1364 match
->prot_hook
.func
= packet_rcv_fanout
;
1365 match
->prot_hook
.af_packet_priv
= match
;
1366 match
->prot_hook
.id_match
= match_fanout_group
;
1367 dev_add_pack(&match
->prot_hook
);
1368 list_add(&match
->list
, &fanout_list
);
1371 if (match
->type
== type
&&
1372 match
->prot_hook
.type
== po
->prot_hook
.type
&&
1373 match
->prot_hook
.dev
== po
->prot_hook
.dev
) {
1375 if (atomic_read(&match
->sk_ref
) < PACKET_FANOUT_MAX
) {
1376 __dev_remove_pack(&po
->prot_hook
);
1378 atomic_inc(&match
->sk_ref
);
1379 __fanout_link(sk
, po
);
1384 mutex_unlock(&fanout_mutex
);
1388 static void fanout_release(struct sock
*sk
)
1390 struct packet_sock
*po
= pkt_sk(sk
);
1391 struct packet_fanout
*f
;
1397 mutex_lock(&fanout_mutex
);
1400 if (atomic_dec_and_test(&f
->sk_ref
)) {
1402 dev_remove_pack(&f
->prot_hook
);
1405 mutex_unlock(&fanout_mutex
);
1408 static const struct proto_ops packet_ops
;
1410 static const struct proto_ops packet_ops_spkt
;
1412 static int packet_rcv_spkt(struct sk_buff
*skb
, struct net_device
*dev
,
1413 struct packet_type
*pt
, struct net_device
*orig_dev
)
1416 struct sockaddr_pkt
*spkt
;
1419 * When we registered the protocol we saved the socket in the data
1420 * field for just this event.
1423 sk
= pt
->af_packet_priv
;
1426 * Yank back the headers [hope the device set this
1427 * right or kerboom...]
1429 * Incoming packets have ll header pulled,
1432 * For outgoing ones skb->data == skb_mac_header(skb)
1433 * so that this procedure is noop.
1436 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1439 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1442 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1446 /* drop any routing info */
1449 /* drop conntrack reference */
1452 spkt
= &PACKET_SKB_CB(skb
)->sa
.pkt
;
1454 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1457 * The SOCK_PACKET socket receives _all_ frames.
1460 spkt
->spkt_family
= dev
->type
;
1461 strlcpy(spkt
->spkt_device
, dev
->name
, sizeof(spkt
->spkt_device
));
1462 spkt
->spkt_protocol
= skb
->protocol
;
1465 * Charge the memory to the socket. This is done specifically
1466 * to prevent sockets using all the memory up.
1469 if (sock_queue_rcv_skb(sk
, skb
) == 0)
/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */
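/*
 * Illustrative user-space counterpart (a hedged sketch of the obsolete
 * SOCK_PACKET interface that this handler implements; "eth0" is an assumed
 * interface name, "frame" is assumed to be a complete Ethernet frame built
 * by the caller, and error handling is omitted):
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	int send_spkt(int fd, const void *frame, size_t len)
 *	{
 *		struct sockaddr_pkt spkt;
 *
 *		memset(&spkt, 0, sizeof(spkt));
 *		spkt.spkt_family   = AF_PACKET;
 *		spkt.spkt_protocol = htons(ETH_P_IP);
 *		strncpy((char *)spkt.spkt_device, "eth0",
 *			sizeof(spkt.spkt_device));
 *		return sendto(fd, frame, len, 0,
 *			      (struct sockaddr *)&spkt, sizeof(spkt));
 *	}
 */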
1484 static int packet_sendmsg_spkt(struct kiocb
*iocb
, struct socket
*sock
,
1485 struct msghdr
*msg
, size_t len
)
1487 struct sock
*sk
= sock
->sk
;
1488 struct sockaddr_pkt
*saddr
= (struct sockaddr_pkt
*)msg
->msg_name
;
1489 struct sk_buff
*skb
= NULL
;
1490 struct net_device
*dev
;
1496 * Get and verify the address.
1500 if (msg
->msg_namelen
< sizeof(struct sockaddr
))
1502 if (msg
->msg_namelen
== sizeof(struct sockaddr_pkt
))
1503 proto
= saddr
->spkt_protocol
;
1505 return -ENOTCONN
; /* SOCK_PACKET must be sent giving an address */
1508 * Find the device first to size check it
1511 saddr
->spkt_device
[sizeof(saddr
->spkt_device
) - 1] = 0;
1514 dev
= dev_get_by_name_rcu(sock_net(sk
), saddr
->spkt_device
);
1520 if (!(dev
->flags
& IFF_UP
))
1524 * You may not queue a frame bigger than the mtu. This is the lowest level
1525 * raw protocol and you must do your own fragmentation at this level.
1528 if (unlikely(sock_flag(sk
, SOCK_NOFCS
))) {
1529 if (!netif_supports_nofcs(dev
)) {
1530 err
= -EPROTONOSUPPORT
;
1533 extra_len
= 4; /* We're doing our own CRC */
1537 if (len
> dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
+ extra_len
)
1541 size_t reserved
= LL_RESERVED_SPACE(dev
);
1542 int tlen
= dev
->needed_tailroom
;
1543 unsigned int hhlen
= dev
->header_ops
? dev
->hard_header_len
: 0;
1546 skb
= sock_wmalloc(sk
, len
+ reserved
+ tlen
, 0, GFP_KERNEL
);
1549 /* FIXME: Save some space for broken drivers that write a hard
1550 * header at transmission time by themselves. PPP is the notable
1551 * one here. This should really be fixed at the driver level.
1553 skb_reserve(skb
, reserved
);
1554 skb_reset_network_header(skb
);
1556 /* Try to align data part correctly */
1561 skb_reset_network_header(skb
);
1563 err
= memcpy_fromiovec(skb_put(skb
, len
), msg
->msg_iov
, len
);
1569 if (len
> (dev
->mtu
+ dev
->hard_header_len
+ extra_len
)) {
1570 /* Earlier code assumed this would be a VLAN pkt,
1571 * double-check this now that we have the actual
1574 struct ethhdr
*ehdr
;
1575 skb_reset_mac_header(skb
);
1576 ehdr
= eth_hdr(skb
);
1577 if (ehdr
->h_proto
!= htons(ETH_P_8021Q
)) {
1583 skb
->protocol
= proto
;
1585 skb
->priority
= sk
->sk_priority
;
1586 skb
->mark
= sk
->sk_mark
;
1588 sock_tx_timestamp(sk
, &skb_shinfo(skb
)->tx_flags
);
1590 if (unlikely(extra_len
== 4))
1593 skb_probe_transport_header(skb
, 0);
1595 dev_queue_xmit(skb
);
1606 static unsigned int run_filter(const struct sk_buff
*skb
,
1607 const struct sock
*sk
,
1610 struct sk_filter
*filter
;
1613 filter
= rcu_dereference(sk
->sk_filter
);
1615 res
= SK_RUN_FILTER(filter
, skb
);
/*
 * This function makes lazy skb cloning in the hope that most of the packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
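/*
 * Illustrative user-space sketch of the "most packets are discarded by BPF"
 * case mentioned above (a hedged example, not part of this file): attaching
 * a classic BPF program that accepts only ARP frames, so run_filter() below
 * drops everything else before any cloning happens.
 *
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *	#include <linux/if_ether.h>
 *
 *	static struct sock_filter arp_only[] = {
 *		{ 0x28, 0, 0, 0x0000000c },	// ldh [12]  (ethertype)
 *		{ 0x15, 0, 1, ETH_P_ARP },	// jeq ARP ? accept : drop
 *		{ 0x06, 0, 0, 0x00040000 },	// ret 256 KB (accept)
 *		{ 0x06, 0, 0, 0x00000000 },	// ret 0 (drop)
 *	};
 *
 *	void attach_arp_filter(int fd)
 *	{
 *		struct sock_fprog prog = {
 *			.len    = 4,
 *			.filter = arp_only,
 *		};
 *		setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *			   &prog, sizeof(prog));
 *	}
 */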
1633 static int packet_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
1634 struct packet_type
*pt
, struct net_device
*orig_dev
)
1637 struct sockaddr_ll
*sll
;
1638 struct packet_sock
*po
;
1639 u8
*skb_head
= skb
->data
;
1640 int skb_len
= skb
->len
;
1641 unsigned int snaplen
, res
;
1643 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1646 sk
= pt
->af_packet_priv
;
1649 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1654 if (dev
->header_ops
) {
1655 /* The device has an explicit notion of ll header,
1656 * exported to higher levels.
1658 * Otherwise, the device hides details of its frame
1659 * structure, so that corresponding packet head is
1660 * never delivered to user.
1662 if (sk
->sk_type
!= SOCK_DGRAM
)
1663 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1664 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
1665 /* Special case: outgoing packets have ll header at head */
1666 skb_pull(skb
, skb_network_offset(skb
));
1672 res
= run_filter(skb
, sk
, snaplen
);
1674 goto drop_n_restore
;
1678 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
)
1681 if (skb_shared(skb
)) {
1682 struct sk_buff
*nskb
= skb_clone(skb
, GFP_ATOMIC
);
1686 if (skb_head
!= skb
->data
) {
1687 skb
->data
= skb_head
;
1694 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb
)) + MAX_ADDR_LEN
- 8 >
1697 sll
= &PACKET_SKB_CB(skb
)->sa
.ll
;
1698 sll
->sll_family
= AF_PACKET
;
1699 sll
->sll_hatype
= dev
->type
;
1700 sll
->sll_protocol
= skb
->protocol
;
1701 sll
->sll_pkttype
= skb
->pkt_type
;
1702 if (unlikely(po
->origdev
))
1703 sll
->sll_ifindex
= orig_dev
->ifindex
;
1705 sll
->sll_ifindex
= dev
->ifindex
;
1707 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
1709 PACKET_SKB_CB(skb
)->origlen
= skb
->len
;
1711 if (pskb_trim(skb
, snaplen
))
1714 skb_set_owner_r(skb
, sk
);
1718 /* drop conntrack reference */
1721 spin_lock(&sk
->sk_receive_queue
.lock
);
1722 po
->stats
.stats1
.tp_packets
++;
1723 skb
->dropcount
= atomic_read(&sk
->sk_drops
);
1724 __skb_queue_tail(&sk
->sk_receive_queue
, skb
);
1725 spin_unlock(&sk
->sk_receive_queue
.lock
);
1726 sk
->sk_data_ready(sk
, skb
->len
);
1730 spin_lock(&sk
->sk_receive_queue
.lock
);
1731 po
->stats
.stats1
.tp_drops
++;
1732 atomic_inc(&sk
->sk_drops
);
1733 spin_unlock(&sk
->sk_receive_queue
.lock
);
1736 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
1737 skb
->data
= skb_head
;
1745 static int tpacket_rcv(struct sk_buff
*skb
, struct net_device
*dev
,
1746 struct packet_type
*pt
, struct net_device
*orig_dev
)
1749 struct packet_sock
*po
;
1750 struct sockaddr_ll
*sll
;
1751 union tpacket_uhdr h
;
1752 u8
*skb_head
= skb
->data
;
1753 int skb_len
= skb
->len
;
1754 unsigned int snaplen
, res
;
1755 unsigned long status
= TP_STATUS_USER
;
1756 unsigned short macoff
, netoff
, hdrlen
;
1757 struct sk_buff
*copy_skb
= NULL
;
1761 if (skb
->pkt_type
== PACKET_LOOPBACK
)
1764 sk
= pt
->af_packet_priv
;
1767 if (!net_eq(dev_net(dev
), sock_net(sk
)))
1770 if (dev
->header_ops
) {
1771 if (sk
->sk_type
!= SOCK_DGRAM
)
1772 skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1773 else if (skb
->pkt_type
== PACKET_OUTGOING
) {
1774 /* Special case: outgoing packets have ll header at head */
1775 skb_pull(skb
, skb_network_offset(skb
));
1779 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
1780 status
|= TP_STATUS_CSUMNOTREADY
;
1784 res
= run_filter(skb
, sk
, snaplen
);
1786 goto drop_n_restore
;
1790 if (sk
->sk_type
== SOCK_DGRAM
) {
1791 macoff
= netoff
= TPACKET_ALIGN(po
->tp_hdrlen
) + 16 +
1794 unsigned int maclen
= skb_network_offset(skb
);
1795 netoff
= TPACKET_ALIGN(po
->tp_hdrlen
+
1796 (maclen
< 16 ? 16 : maclen
)) +
1798 macoff
= netoff
- maclen
;
1800 if (po
->tp_version
<= TPACKET_V2
) {
1801 if (macoff
+ snaplen
> po
->rx_ring
.frame_size
) {
1802 if (po
->copy_thresh
&&
1803 atomic_read(&sk
->sk_rmem_alloc
) < sk
->sk_rcvbuf
) {
1804 if (skb_shared(skb
)) {
1805 copy_skb
= skb_clone(skb
, GFP_ATOMIC
);
1807 copy_skb
= skb_get(skb
);
1808 skb_head
= skb
->data
;
1811 skb_set_owner_r(copy_skb
, sk
);
1813 snaplen
= po
->rx_ring
.frame_size
- macoff
;
1814 if ((int)snaplen
< 0)
1818 spin_lock(&sk
->sk_receive_queue
.lock
);
1819 h
.raw
= packet_current_rx_frame(po
, skb
,
1820 TP_STATUS_KERNEL
, (macoff
+snaplen
));
1823 if (po
->tp_version
<= TPACKET_V2
) {
1824 packet_increment_rx_head(po
, &po
->rx_ring
);
1826 * LOSING will be reported till you read the stats,
1827 * because it's COR - Clear On Read.
1828 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1831 if (po
->stats
.stats1
.tp_drops
)
1832 status
|= TP_STATUS_LOSING
;
1834 po
->stats
.stats1
.tp_packets
++;
1836 status
|= TP_STATUS_COPY
;
1837 __skb_queue_tail(&sk
->sk_receive_queue
, copy_skb
);
1839 spin_unlock(&sk
->sk_receive_queue
.lock
);
1841 skb_copy_bits(skb
, 0, h
.raw
+ macoff
, snaplen
);
1843 if (!(ts_status
= tpacket_get_timestamp(skb
, &ts
, po
->tp_tstamp
)))
1844 getnstimeofday(&ts
);
1846 status
|= ts_status
;
1848 switch (po
->tp_version
) {
1850 h
.h1
->tp_len
= skb
->len
;
1851 h
.h1
->tp_snaplen
= snaplen
;
1852 h
.h1
->tp_mac
= macoff
;
1853 h
.h1
->tp_net
= netoff
;
1854 h
.h1
->tp_sec
= ts
.tv_sec
;
1855 h
.h1
->tp_usec
= ts
.tv_nsec
/ NSEC_PER_USEC
;
1856 hdrlen
= sizeof(*h
.h1
);
1859 h
.h2
->tp_len
= skb
->len
;
1860 h
.h2
->tp_snaplen
= snaplen
;
1861 h
.h2
->tp_mac
= macoff
;
1862 h
.h2
->tp_net
= netoff
;
1863 h
.h2
->tp_sec
= ts
.tv_sec
;
1864 h
.h2
->tp_nsec
= ts
.tv_nsec
;
1865 if (vlan_tx_tag_present(skb
)) {
1866 h
.h2
->tp_vlan_tci
= vlan_tx_tag_get(skb
);
1867 status
|= TP_STATUS_VLAN_VALID
;
1869 h
.h2
->tp_vlan_tci
= 0;
1871 h
.h2
->tp_padding
= 0;
1872 hdrlen
= sizeof(*h
.h2
);
1875 /* tp_nxt_offset,vlan are already populated above.
1876 * So DONT clear those fields here
1878 h
.h3
->tp_status
|= status
;
1879 h
.h3
->tp_len
= skb
->len
;
1880 h
.h3
->tp_snaplen
= snaplen
;
1881 h
.h3
->tp_mac
= macoff
;
1882 h
.h3
->tp_net
= netoff
;
1883 h
.h3
->tp_sec
= ts
.tv_sec
;
1884 h
.h3
->tp_nsec
= ts
.tv_nsec
;
1885 hdrlen
= sizeof(*h
.h3
);
1891 sll
= h
.raw
+ TPACKET_ALIGN(hdrlen
);
1892 sll
->sll_halen
= dev_parse_header(skb
, sll
->sll_addr
);
1893 sll
->sll_family
= AF_PACKET
;
1894 sll
->sll_hatype
= dev
->type
;
1895 sll
->sll_protocol
= skb
->protocol
;
1896 sll
->sll_pkttype
= skb
->pkt_type
;
1897 if (unlikely(po
->origdev
))
1898 sll
->sll_ifindex
= orig_dev
->ifindex
;
1900 sll
->sll_ifindex
= dev
->ifindex
;
1903 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1907 if (po
->tp_version
<= TPACKET_V2
) {
1908 end
= (u8
*)PAGE_ALIGN((unsigned long)h
.raw
1909 + macoff
+ snaplen
);
1910 for (start
= h
.raw
; start
< end
; start
+= PAGE_SIZE
)
1911 flush_dcache_page(pgv_to_page(start
));
1916 if (po
->tp_version
<= TPACKET_V2
)
1917 __packet_set_status(po
, h
.raw
, status
);
1919 prb_clear_blk_fill_status(&po
->rx_ring
);
1921 sk
->sk_data_ready(sk
, 0);
1924 if (skb_head
!= skb
->data
&& skb_shared(skb
)) {
1925 skb
->data
= skb_head
;
1933 po
->stats
.stats1
.tp_drops
++;
1934 spin_unlock(&sk
->sk_receive_queue
.lock
);
1936 sk
->sk_data_ready(sk
, 0);
1937 kfree_skb(copy_skb
);
1938 goto drop_n_restore
;
1941 static void tpacket_destruct_skb(struct sk_buff
*skb
)
1943 struct packet_sock
*po
= pkt_sk(skb
->sk
);
1946 if (likely(po
->tx_ring
.pg_vec
)) {
1949 ph
= skb_shinfo(skb
)->destructor_arg
;
1950 BUG_ON(atomic_read(&po
->tx_ring
.pending
) == 0);
1951 atomic_dec(&po
->tx_ring
.pending
);
1953 ts
= __packet_set_timestamp(po
, ph
, skb
);
1954 __packet_set_status(po
, ph
, TP_STATUS_AVAILABLE
| ts
);
1960 static int tpacket_fill_skb(struct packet_sock
*po
, struct sk_buff
*skb
,
1961 void *frame
, struct net_device
*dev
, int size_max
,
1962 __be16 proto
, unsigned char *addr
, int hlen
)
1964 union tpacket_uhdr ph
;
1965 int to_write
, offset
, len
, tp_len
, nr_frags
, len_max
;
1966 struct socket
*sock
= po
->sk
.sk_socket
;
1973 skb
->protocol
= proto
;
1975 skb
->priority
= po
->sk
.sk_priority
;
1976 skb
->mark
= po
->sk
.sk_mark
;
1977 sock_tx_timestamp(&po
->sk
, &skb_shinfo(skb
)->tx_flags
);
1978 skb_shinfo(skb
)->destructor_arg
= ph
.raw
;
1980 switch (po
->tp_version
) {
1982 tp_len
= ph
.h2
->tp_len
;
1985 tp_len
= ph
.h1
->tp_len
;
1988 if (unlikely(tp_len
> size_max
)) {
1989 pr_err("packet size is too long (%d > %d)\n", tp_len
, size_max
);
1993 skb_reserve(skb
, hlen
);
1994 skb_reset_network_header(skb
);
1995 skb_probe_transport_header(skb
, 0);
1997 if (po
->tp_tx_has_off
) {
1998 int off_min
, off_max
, off
;
1999 off_min
= po
->tp_hdrlen
- sizeof(struct sockaddr_ll
);
2000 off_max
= po
->tx_ring
.frame_size
- tp_len
;
2001 if (sock
->type
== SOCK_DGRAM
) {
2002 switch (po
->tp_version
) {
2004 off
= ph
.h2
->tp_net
;
2007 off
= ph
.h1
->tp_net
;
2011 switch (po
->tp_version
) {
2013 off
= ph
.h2
->tp_mac
;
2016 off
= ph
.h1
->tp_mac
;
2020 if (unlikely((off
< off_min
) || (off_max
< off
)))
2022 data
= ph
.raw
+ off
;
2024 data
= ph
.raw
+ po
->tp_hdrlen
- sizeof(struct sockaddr_ll
);
2028 if (sock
->type
== SOCK_DGRAM
) {
2029 err
= dev_hard_header(skb
, dev
, ntohs(proto
), addr
,
2031 if (unlikely(err
< 0))
2033 } else if (dev
->hard_header_len
) {
2034 /* net device doesn't like empty head */
2035 if (unlikely(tp_len
<= dev
->hard_header_len
)) {
2036 pr_err("packet size is too short (%d < %d)\n",
2037 tp_len
, dev
->hard_header_len
);
2041 skb_push(skb
, dev
->hard_header_len
);
2042 err
= skb_store_bits(skb
, 0, data
,
2043 dev
->hard_header_len
);
2047 data
+= dev
->hard_header_len
;
2048 to_write
-= dev
->hard_header_len
;
2051 offset
= offset_in_page(data
);
2052 len_max
= PAGE_SIZE
- offset
;
2053 len
= ((to_write
> len_max
) ? len_max
: to_write
);
2055 skb
->data_len
= to_write
;
2056 skb
->len
+= to_write
;
2057 skb
->truesize
+= to_write
;
2058 atomic_add(to_write
, &po
->sk
.sk_wmem_alloc
);
2060 while (likely(to_write
)) {
2061 nr_frags
= skb_shinfo(skb
)->nr_frags
;
2063 if (unlikely(nr_frags
>= MAX_SKB_FRAGS
)) {
2064 pr_err("Packet exceed the number of skb frags(%lu)\n",
2069 page
= pgv_to_page(data
);
2071 flush_dcache_page(page
);
2073 skb_fill_page_desc(skb
, nr_frags
, page
, offset
, len
);
2076 len_max
= PAGE_SIZE
;
2077 len
= ((to_write
> len_max
) ? len_max
: to_write
);
2083 static int tpacket_snd(struct packet_sock
*po
, struct msghdr
*msg
)
2085 struct sk_buff
*skb
;
2086 struct net_device
*dev
;
2088 int err
, reserve
= 0;
2090 struct sockaddr_ll
*saddr
= (struct sockaddr_ll
*)msg
->msg_name
;
2091 int tp_len
, size_max
;
2092 unsigned char *addr
;
2094 int status
= TP_STATUS_AVAILABLE
;
2097 mutex_lock(&po
->pg_vec_lock
);
2099 if (likely(saddr
== NULL
)) {
2100 dev
= packet_cached_dev_get(po
);
2105 if (msg
->msg_namelen
< sizeof(struct sockaddr_ll
))
2107 if (msg
->msg_namelen
< (saddr
->sll_halen
2108 + offsetof(struct sockaddr_ll
,
2111 proto
= saddr
->sll_protocol
;
2112 addr
= saddr
->sll_addr
;
2113 dev
= dev_get_by_index(sock_net(&po
->sk
), saddr
->sll_ifindex
);
2117 if (unlikely(dev
== NULL
))
2120 if (unlikely(!(dev
->flags
& IFF_UP
)))
2123 reserve
= dev
->hard_header_len
;
2125 size_max
= po
->tx_ring
.frame_size
2126 - (po
->tp_hdrlen
- sizeof(struct sockaddr_ll
));
2128 if (size_max
> dev
->mtu
+ reserve
)
2129 size_max
= dev
->mtu
+ reserve
;
2132 ph
= packet_current_frame(po
, &po
->tx_ring
,
2133 TP_STATUS_SEND_REQUEST
);
2135 if (unlikely(ph
== NULL
)) {
2140 status
= TP_STATUS_SEND_REQUEST
;
2141 hlen
= LL_RESERVED_SPACE(dev
);
2142 tlen
= dev
->needed_tailroom
;
2143 skb
= sock_alloc_send_skb(&po
->sk
,
2144 hlen
+ tlen
+ sizeof(struct sockaddr_ll
),
2147 if (unlikely(skb
== NULL
))
2150 tp_len
= tpacket_fill_skb(po
, skb
, ph
, dev
, size_max
, proto
,
2153 if (unlikely(tp_len
< 0)) {
2155 __packet_set_status(po
, ph
,
2156 TP_STATUS_AVAILABLE
);
2157 packet_increment_head(&po
->tx_ring
);
2161 status
= TP_STATUS_WRONG_FORMAT
;
2167 skb
->destructor
= tpacket_destruct_skb
;
2168 __packet_set_status(po
, ph
, TP_STATUS_SENDING
);
2169 atomic_inc(&po
->tx_ring
.pending
);
2171 status
= TP_STATUS_SEND_REQUEST
;
2172 err
= dev_queue_xmit(skb
);
2173 if (unlikely(err
> 0)) {
2174 err
= net_xmit_errno(err
);
2175 if (err
&& __packet_get_status(po
, ph
) ==
2176 TP_STATUS_AVAILABLE
) {
2177 /* skb was destructed already */
2182 * skb was dropped but not destructed yet;
2183 * let's treat it like congestion or err < 0
2187 packet_increment_head(&po
->tx_ring
);
2189 } while (likely((ph
!= NULL
) ||
2190 ((!(msg
->msg_flags
& MSG_DONTWAIT
)) &&
2191 (atomic_read(&po
->tx_ring
.pending
))))
2198 __packet_set_status(po
, ph
, status
);
2203 mutex_unlock(&po
->pg_vec_lock
);
2207 static struct sk_buff
*packet_alloc_skb(struct sock
*sk
, size_t prepad
,
2208 size_t reserve
, size_t len
,
2209 size_t linear
, int noblock
,
2212 struct sk_buff
*skb
;
2214 /* Under a page? Don't bother with paged skb. */
2215 if (prepad
+ len
< PAGE_SIZE
|| !linear
)
2218 skb
= sock_alloc_send_pskb(sk
, prepad
+ linear
, len
- linear
, noblock
,
2223 skb_reserve(skb
, reserve
);
2224 skb_put(skb
, linear
);
2225 skb
->data_len
= len
- linear
;
2226 skb
->len
+= len
- linear
;
2231 static int packet_snd(struct socket
*sock
,
2232 struct msghdr
*msg
, size_t len
)
2234 struct sock
*sk
= sock
->sk
;
2235 struct sockaddr_ll
*saddr
= (struct sockaddr_ll
*)msg
->msg_name
;
2236 struct sk_buff
*skb
;
2237 struct net_device
*dev
;
2239 unsigned char *addr
;
2240 int err
, reserve
= 0;
2241 struct virtio_net_hdr vnet_hdr
= { 0 };
2244 struct packet_sock
*po
= pkt_sk(sk
);
2245 unsigned short gso_type
= 0;
2250 * Get and verify the address.
2253 if (likely(saddr
== NULL
)) {
2254 dev
= packet_cached_dev_get(po
);
2259 if (msg
->msg_namelen
< sizeof(struct sockaddr_ll
))
2261 if (msg
->msg_namelen
< (saddr
->sll_halen
+ offsetof(struct sockaddr_ll
, sll_addr
)))
2263 proto
= saddr
->sll_protocol
;
2264 addr
= saddr
->sll_addr
;
2265 dev
= dev_get_by_index(sock_net(sk
), saddr
->sll_ifindex
);
2269 if (unlikely(dev
== NULL
))
2272 if (unlikely(!(dev
->flags
& IFF_UP
)))
2275 if (sock
->type
== SOCK_RAW
)
2276 reserve
= dev
->hard_header_len
;
2277 if (po
->has_vnet_hdr
) {
2278 vnet_hdr_len
= sizeof(vnet_hdr
);
2281 if (len
< vnet_hdr_len
)
2284 len
-= vnet_hdr_len
;
2286 err
= memcpy_fromiovec((void *)&vnet_hdr
, msg
->msg_iov
,
2291 if ((vnet_hdr
.flags
& VIRTIO_NET_HDR_F_NEEDS_CSUM
) &&
2292 (vnet_hdr
.csum_start
+ vnet_hdr
.csum_offset
+ 2 >
2294 vnet_hdr
.hdr_len
= vnet_hdr
.csum_start
+
2295 vnet_hdr
.csum_offset
+ 2;
2298 if (vnet_hdr
.hdr_len
> len
)
2301 if (vnet_hdr
.gso_type
!= VIRTIO_NET_HDR_GSO_NONE
) {
2302 switch (vnet_hdr
.gso_type
& ~VIRTIO_NET_HDR_GSO_ECN
) {
2303 case VIRTIO_NET_HDR_GSO_TCPV4
:
2304 gso_type
= SKB_GSO_TCPV4
;
2306 case VIRTIO_NET_HDR_GSO_TCPV6
:
2307 gso_type
= SKB_GSO_TCPV6
;
2309 case VIRTIO_NET_HDR_GSO_UDP
:
2310 gso_type
= SKB_GSO_UDP
;
2316 if (vnet_hdr
.gso_type
& VIRTIO_NET_HDR_GSO_ECN
)
2317 gso_type
|= SKB_GSO_TCP_ECN
;
2319 if (vnet_hdr
.gso_size
== 0)
2325 if (unlikely(sock_flag(sk
, SOCK_NOFCS
))) {
2326 if (!netif_supports_nofcs(dev
)) {
2327 err
= -EPROTONOSUPPORT
;
2330 extra_len
= 4; /* We're doing our own CRC */
2334 if (!gso_type
&& (len
> dev
->mtu
+ reserve
+ VLAN_HLEN
+ extra_len
))
2338 hlen
= LL_RESERVED_SPACE(dev
);
2339 tlen
= dev
->needed_tailroom
;
2340 skb
= packet_alloc_skb(sk
, hlen
+ tlen
, hlen
, len
, vnet_hdr
.hdr_len
,
2341 msg
->msg_flags
& MSG_DONTWAIT
, &err
);
2345 skb_set_network_header(skb
, reserve
);
2348 if (sock
->type
== SOCK_DGRAM
&&
2349 (offset
= dev_hard_header(skb
, dev
, ntohs(proto
), addr
, NULL
, len
)) < 0)
2352 /* Returns -EFAULT on error */
2353 err
= skb_copy_datagram_from_iovec(skb
, offset
, msg
->msg_iov
, 0, len
);
2357 sock_tx_timestamp(sk
, &skb_shinfo(skb
)->tx_flags
);
2359 if (!gso_type
&& (len
> dev
->mtu
+ reserve
+ extra_len
)) {
2360 /* Earlier code assumed this would be a VLAN pkt,
2361 * double-check this now that we have the actual
2364 struct ethhdr
*ehdr
;
2365 skb_reset_mac_header(skb
);
2366 ehdr
= eth_hdr(skb
);
2367 if (ehdr
->h_proto
!= htons(ETH_P_8021Q
)) {
2373 skb
->protocol
= proto
;
2375 skb
->priority
= sk
->sk_priority
;
2376 skb
->mark
= sk
->sk_mark
;
2378 if (po
->has_vnet_hdr
) {
2379 if (vnet_hdr
.flags
& VIRTIO_NET_HDR_F_NEEDS_CSUM
) {
2380 if (!skb_partial_csum_set(skb
, vnet_hdr
.csum_start
,
2381 vnet_hdr
.csum_offset
)) {
2387 skb_shinfo(skb
)->gso_size
= vnet_hdr
.gso_size
;
2388 skb_shinfo(skb
)->gso_type
= gso_type
;
2390 /* Header must be checked, and gso_segs computed. */
2391 skb_shinfo(skb
)->gso_type
|= SKB_GSO_DODGY
;
2392 skb_shinfo(skb
)->gso_segs
= 0;
2394 len
+= vnet_hdr_len
;
2397 skb_probe_transport_header(skb
, reserve
);
2399 if (unlikely(extra_len
== 4))
2406 err
= dev_queue_xmit(skb
);
2407 if (err
> 0 && (err
= net_xmit_errno(err
)) != 0)
2423 static int packet_sendmsg(struct kiocb
*iocb
, struct socket
*sock
,
2424 struct msghdr
*msg
, size_t len
)
2426 struct sock
*sk
= sock
->sk
;
2427 struct packet_sock
*po
= pkt_sk(sk
);
2428 if (po
->tx_ring
.pg_vec
)
2429 return tpacket_snd(po
, msg
);
2431 return packet_snd(sock
, msg
, len
);
2435 * Close a PACKET socket. This is fairly simple. We immediately go
2436 * to 'closed' state and remove our protocol entry in the device list.
2439 static int packet_release(struct socket
*sock
)
2441 struct sock
*sk
= sock
->sk
;
2442 struct packet_sock
*po
;
2444 union tpacket_req_u req_u
;
2452 mutex_lock(&net
->packet
.sklist_lock
);
2453 sk_del_node_init_rcu(sk
);
2454 mutex_unlock(&net
->packet
.sklist_lock
);
2457 sock_prot_inuse_add(net
, sk
->sk_prot
, -1);
2460 spin_lock(&po
->bind_lock
);
2461 unregister_prot_hook(sk
, false);
2462 packet_cached_dev_reset(po
);
2464 if (po
->prot_hook
.dev
) {
2465 dev_put(po
->prot_hook
.dev
);
2466 po
->prot_hook
.dev
= NULL
;
2468 spin_unlock(&po
->bind_lock
);
2470 packet_flush_mclist(sk
);
2472 if (po
->rx_ring
.pg_vec
) {
2473 memset(&req_u
, 0, sizeof(req_u
));
2474 packet_set_ring(sk
, &req_u
, 1, 0);
2477 if (po
->tx_ring
.pg_vec
) {
2478 memset(&req_u
, 0, sizeof(req_u
));
2479 packet_set_ring(sk
, &req_u
, 1, 1);
2486 * Now the socket is dead. No more input will appear.
2493 skb_queue_purge(&sk
->sk_receive_queue
);
2494 sk_refcnt_debug_release(sk
);
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, true);

	po->prot_hook.type = protocol;
	if (po->prot_hook.dev)
		dev_put(po->prot_hook.dev);

	po->prot_hook.dev = dev;
	po->ifindex = dev ? dev->ifindex : 0;

	packet_cached_dev_assign(po, dev);

	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

	spin_unlock(&po->bind_lock);
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,

	struct sock *sk = sock->sk;
	struct net_device *dev;

	if (addr_len != sizeof(struct sockaddr))

	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);

	err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)

	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;

	if (addr_len < sizeof(struct sockaddr_ll))

	if (sll->sll_family != AF_PACKET)

	if (sll->sll_ifindex) {

		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);

	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
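/*
 * Illustrative userspace sketch (not part of this file): packet_bind()
 * expects a struct sockaddr_ll.  A zero sll_protocol keeps the protocol
 * given at socket() time (the "sll->sll_protocol ? : pkt_sk(sk)->num"
 * above), and a zero sll_ifindex binds to all devices.  "eth0" is an
 * arbitrary choice.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
 *		perror("bind");
 */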
static struct proto packet_proto = {
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,

	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */

	if (!ns_capable(net->user_ns, CAP_NET_RAW))

	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	sk->sk_family = PF_PACKET;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	po->prot_hook.type = proto;
	register_prot_hook(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	sock_prot_inuse_add(net, &packet_proto, 1);
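/*
 * Illustrative userspace sketch (not part of this file): creating any of
 * the three socket types handled by packet_create() requires CAP_NET_RAW
 * in the user namespace owning the socket's network namespace; the
 * protocol argument is a network-byte-order ethertype.
 *
 *	int raw  = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgrm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 * SOCK_RAW delivers frames with the link-layer header in place, while
 * SOCK_DGRAM delivers "cooked" packets with that header removed.
 */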
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)

	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int vnet_hdr_len = 0;

	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))

	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 */

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)

			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;

			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 */

		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			msg->msg_namelen = sizeof(struct sockaddr_pkt);

			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);

		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;

		aux.tp_net = skb_network_offset(skb);
		if (vlan_tx_tag_present(skb)) {
			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
			aux.tp_status |= TP_STATUS_VLAN_VALID;

			aux.tp_vlan_tci = 0;

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

	skb_free_datagram(sk, skb);
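/*
 * Illustrative userspace sketch (not part of this file): a blocking read
 * through the packet_recvmsg() path above.  recvfrom() fills a
 * struct sockaddr_ll describing the originating device and link-layer
 * address; the buffer size is arbitrary.
 *
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *
 * If PACKET_AUXDATA is enabled, recvmsg() additionally delivers a
 * struct tpacket_auxdata control message (see the put_cmsg() above).
 */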
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)

	struct net_device *dev;
	struct sock *sk = sock->sk;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));

	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);

	strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));

	*uaddr_len = sizeof(*uaddr);
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)

	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;

	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);

	sll->sll_hatype = dev->type;
	sll->sll_halen = dev->addr_len;
	memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);

	sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */

	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,

	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)

			return dev_mc_add(dev, i->addr);

			return dev_mc_del(dev, i->addr);

	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)

			return dev_uc_add(dev, i->addr);

			return dev_uc_del(dev, i->addr);

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)

	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)

	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;

	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);

	if (mreq->mr_alen > dev->addr_len)

	i = kmalloc(sizeof(*i), GFP_KERNEL);

	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {

			/* Free the new element ... */

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);

	i->next = po->mclist;

	err = packet_dev_mc(dev, i, 1);

		po->mclist = i->next;
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)

	struct packet_mclist *ml, **mlp;

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;

				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);

					packet_dev_mc(dev, ml, -1);

	return -EADDRNOTAVAIL;
static void packet_flush_mclist(struct sock *sk)

	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);

			packet_dev_mc(dev, ml, -1);
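/*
 * Illustrative userspace sketch (not part of this file): the membership
 * helpers above back the PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP
 * socket options.  Putting a device into promiscuous mode, for example
 * ("eth0" is an arbitrary choice):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 *
 * The reference is dropped again with PACKET_DROP_MEMBERSHIP, and any
 * remaining memberships are flushed by packet_flush_mclist() on close.
 */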
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)

	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:

		struct packet_mreq_max mreq;

		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))

		if (len > sizeof(mreq))

		if (copy_from_user(&mreq, optval, len))

		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))

		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);

			ret = packet_mc_drop(sk, &mreq);

	case PACKET_RX_RING:
	case PACKET_TX_RING:

		union tpacket_req_u req_u;

		switch (po->tp_version) {

			len = sizeof(req_u.req);

			len = sizeof(req_u.req3);

		if (pkt_sk(sk)->has_vnet_hdr)

		if (copy_from_user(&req_u.req, optval, len))

		return packet_set_ring(sk, &req_u, 0,
				       optname == PACKET_TX_RING);

	case PACKET_COPY_THRESH:

		if (optlen != sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		pkt_sk(sk)->copy_thresh = val;

	case PACKET_VERSION:

		if (optlen != sizeof(val))

		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)

		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_version = val;

	case PACKET_RESERVE:

		if (optlen != sizeof(val))

		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)

		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_reserve = val;

		if (optlen != sizeof(val))

		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)

		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_loss = !!val;

	case PACKET_AUXDATA:

		if (optlen < sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		po->auxdata = !!val;

	case PACKET_ORIGDEV:

		if (optlen < sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		po->origdev = !!val;

	case PACKET_VNET_HDR:

		if (sock->type != SOCK_RAW)

		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)

		if (optlen < sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		po->has_vnet_hdr = !!val;

	case PACKET_TIMESTAMP:

		if (optlen != sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_tstamp = val;

		if (optlen != sizeof(val))

		if (copy_from_user(&val, optval, sizeof(val)))

		return fanout_add(sk, val & 0xffff, val >> 16);

	case PACKET_TX_HAS_OFF:

		if (optlen != sizeof(val))

		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)

		if (copy_from_user(&val, optval, sizeof(val)))

		po->tp_tx_has_off = !!val;

		return -ENOPROTOOPT;
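/*
 * Illustrative userspace sketch (not part of this file): a typical option
 * sequence for a memory-mapped RX ring.  The sizes are arbitrary but
 * satisfy the checks in packet_set_ring(): block size a multiple of the
 * page size (4 KiB assumed here), frame size a multiple of
 * TPACKET_ALIGNMENT, and tp_frame_nr == frames-per-block * tp_block_nr.
 *
 *	int version = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *		   &version, sizeof(version));
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * PACKET_VERSION must be set before the ring exists; once rx_ring or
 * tx_ring has a pg_vec the option is refused.
 */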
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)

	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	union tpacket_stats_u st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))

	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;

			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;

	case PACKET_AUXDATA:

	case PACKET_ORIGDEV:

	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;

	case PACKET_VERSION:
		val = po->tp_version;

		if (len > sizeof(int))

		if (copy_from_user(&val, optval, len))

			val = sizeof(struct tpacket_hdr);

			val = sizeof(struct tpacket2_hdr);

			val = sizeof(struct tpacket3_hdr);

	case PACKET_RESERVE:
		val = po->tp_reserve;

	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;

			((u32)po->fanout->id |
			 ((u32)po->fanout->type << 16) |
			 ((u32)po->fanout->flags << 24)) :

	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;

		return -ENOPROTOOPT;

	if (put_user(len, optlen))

	if (copy_to_user(optval, data, len))
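/*
 * Illustrative userspace sketch (not part of this file): PACKET_STATISTICS
 * as handled above copies the counters out and then zeroes them, so each
 * call reports the deltas since the previous one; tp_packets already
 * includes tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 *
 * A TPACKET_V3 socket returns the larger struct tpacket_stats_v3 instead.
 */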
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)

	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		case NETDEV_UNREGISTER:

				packet_dev_mclist(dev, po->mclist, -1);

			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);

					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);

				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);

					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;

				spin_unlock(&po->bind_lock);

			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);

					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
static int packet_ioctl(struct socket *sock, unsigned int cmd,

	struct sock *sk = sock->sk;

		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);

		struct sk_buff *skb;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);

		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);

		return sock_get_timestamp(sk, (struct timeval __user *)arg);

		return sock_get_timestampns(sk, (struct timespec __user *)arg);

	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:

		return inet_dgram_ops.ioctl(sock, cmd, arg);

		return -ENOIOCTLCMD;
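/*
 * Illustrative userspace sketch (not part of this file), assuming the
 * usual meanings of these ioctls on packet sockets: SIOCINQ reports the
 * size of the frame at the head of the receive queue (the skb_peek()
 * above), SIOCOUTQ the bytes queued for transmit, and SIOCGSTAMP the
 * timestamp of the last received packet.
 *
 *	int pending;
 *	ioctl(fd, SIOCINQ, &pending);
 *
 *	struct timeval tv;
 *	ioctl(fd, SIOCGSTAMP, &tv);
 */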
static unsigned int packet_poll(struct file *file, struct socket *sock,

	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,

			mask |= POLLIN | POLLRDNORM;

	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;

	spin_unlock_bh(&sk->sk_write_queue.lock);
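/*
 * Illustrative userspace sketch (not part of this file): with an RX ring
 * mapped, poll() wakes up as soon as packet_poll() above finds a frame
 * that is no longer owned by the kernel, so the reader can walk the ring
 * without issuing recv() calls.
 *
 *	struct pollfd pfd = {
 *		.fd     = fd,
 *		.events = POLLIN | POLLRDNORM,
 *	};
 *	poll(&pfd, 1, -1);
 *
 * With a TX ring, POLLOUT/POLLWRNORM is reported while the current slot
 * is TP_STATUS_AVAILABLE.
 */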
/* Dirty? Well, I still did not learn better way to account
 */

static void packet_mm_open(struct vm_area_struct *vma)

	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

		atomic_inc(&pkt_sk(sk)->mapped);

static void packet_mm_close(struct vm_area_struct *vma)

	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

		atomic_dec(&pkt_sk(sk)->mapped);

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);

				free_pages((unsigned long)pg_vec[i].buffer,

			pg_vec[i].buffer = NULL;
static char *alloc_one_pg_vec_page(unsigned long order)

	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	buffer = vzalloc((1 << order) * PAGE_SIZE);

	/*
	 * vmalloc failed, lets dig into swap here
	 */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);

	/*
	 * complete and utter failure
	 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)

	unsigned int block_nr = req->tp_block_nr;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;

	free_pg_vec(pg_vec, order, block_nr);
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
			   int closing, int tx_ring)

	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;

	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	if (atomic_read(&po->mapped))

	if (atomic_read(&rb->pending))

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */

		if (unlikely(rb->pg_vec))

		switch (po->tp_version) {

			po->tp_hdrlen = TPACKET_HDRLEN;

			po->tp_hdrlen = TPACKET2_HDRLEN;

			po->tp_hdrlen = TPACKET3_HDRLEN;

		if (unlikely((int)req->tp_block_size <= 0))

		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))

		if (unlikely(req->tp_frame_size < po->tp_hdrlen +

		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))

		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=

		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))

		switch (po->tp_version) {

			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);

		if (unlikely(req->tp_frame_nr))

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;

		__unregister_prot_hook(sk, false);

	spin_unlock(&po->bind_lock);

	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {

		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);

		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));

	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);

		register_prot_hook(sk);

	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */

			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);

		free_pg_vec(pg_vec, order, req->tp_block_nr);
static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)

	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;

	mutex_lock(&po->pg_vec_lock);

	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {

			expected_size += rb->pg_vec_len

	if (expected_size == 0)

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)

		for (i = 0; i < rb->pg_vec_len; i++) {

			void *kaddr = rb->pg_vec[i].buffer;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;

	mutex_unlock(&po->pg_vec_lock);
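/*
 * Illustrative userspace sketch (not part of this file): mapping the ring
 * configured with PACKET_RX_RING (the struct tpacket_req "req" and the
 * struct pollfd "pfd" from the earlier sketches) and walking its frames,
 * assuming TPACKET_V2.  Frames never straddle a block, hence the
 * two-level index; handle() is a hypothetical placeholder.
 *
 *	size_t ring_sz = (size_t)req.tp_block_size * req.tp_block_nr;
 *	char *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	unsigned int fpb = req.tp_block_size / req.tp_frame_size;
 *
 *	for (unsigned int i = 0; ; i = (i + 1) % req.tp_frame_nr) {
 *		struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *			(ring + (i / fpb) * req.tp_block_size
 *			      + (i % fpb) * req.tp_frame_size);
 *		while (!(hdr->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);
 *		handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 *
 * Real code also needs a read barrier after checking tp_status.  When
 * both RX and TX rings exist they are mapped back to back by the loop
 * over rx_ring and tx_ring above.
 */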
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)

	struct net *net = seq_file_net(seq);

	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)

	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);

static void packet_seq_stop(struct seq_file *seq, void *v)

static int packet_seq_show(struct seq_file *seq, void *v)

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");

		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   atomic_read(&s->sk_refcnt),
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)

	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static int __net_init packet_net_init(struct net *net)

	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))

static void __net_exit packet_net_exit(struct net *net)

	remove_proc_entry("packet", net->proc_net);

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
static void __exit packet_exit(void)

	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);

static int __init packet_init(void)

	int rc = proto_register(&packet_proto, 0);

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);