/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.

On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
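/*
 * Illustrative user-space sketch (not part of this file; fd and buf are
 * hypothetical names): with SOCK_RAW the link-layer header is part of the
 * buffer handed to user space, while SOCK_DGRAM delivers the packet with
 * the ll header already pulled, matching the rules above.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	// On an Ethernet device buf starts with the 14-byte Ethernet header;
 *	// with SOCK_DGRAM the same recv() would start at the network header.
 */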
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
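/*
 * Worked example (hypothetical numbers): with tp_sizeof_priv = 13 the
 * private area is padded to ALIGN(13, V3_ALIGNMENT) = 16 bytes, so
 * BLK_PLUS_PRIV(13) = BLK_HDR_LEN + 16 and the first packet of a block
 * is stored at that offset (see BLOCK_O2FP in prb_open_block()).
 */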
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
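/*
 * Example of the macro above (hypothetical numbers): with knum_blocks = 8,
 * GET_NEXT_PRB_BLK_NUM() walks the blocks as 0, 1, ..., 7 and then wraps
 * back to 0, so the block ring is reused circularly once every block has
 * been dispatched.
 */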
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}
static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo + 1;
	return tmo;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}
static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}
/*
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
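/*
 * Back-of-the-envelope check of the numbers above: a 1MB block holds
 * 8 * 1024 * 1024 = 8,388,608 bits; at 1Gbps (10^9 bits/sec) that is
 * roughly 8.4 ms of line-rate traffic, which is where the "~8 ms to fill
 * a block" figure comes from (memcpy and per-packet overhead excluded).
 */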
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}
/*
 * 1) Close the block.
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;

		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}
/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
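/*
 * Illustrative user-space side of the hand-off described above (sketch,
 * not part of this file; ring, req and walk_packets() are hypothetical
 * names): a TPACKET_V3 reader returns a block to the kernel by clearing
 * its status word, which is what un-freezes the queue the next time
 * __packet_lookup_frame_in_block() looks at that block.
 *
 *	struct tpacket_block_desc *pbd = ring + blk_num * req.tp_block_size;
 *	if (pbd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		walk_packets(pbd);			// consume the block
 *		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
 *	}
 */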
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}
908 * If the next block is free then we will dispatch it
909 * and return a good offset.
910 * Else, we will freeze the queue.
911 * So, caller must check the return value.
913 static void *prb_dispatch_next_block(struct tpacket_kbdq_core
*pkc
,
914 struct packet_sock
*po
)
916 struct tpacket_block_desc
*pbd
;
920 /* 1. Get current block num */
921 pbd
= GET_CURR_PBLOCK_DESC_FROM_CORE(pkc
);
923 /* 2. If this block is currently in_use then freeze the queue */
924 if (TP_STATUS_USER
& BLOCK_STATUS(pbd
)) {
925 prb_freeze_queue(pkc
, po
);
931 * open this block and return the offset where the first packet
932 * needs to get stored.
934 prb_open_block(pkc
, pbd
);
935 return (void *)pkc
->nxt_offset
;
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);

	atomic_dec(&pkc->blk_fill_in_prog);
}
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr + TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;

	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}
static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;

	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num - 1;
	else
		prev = rb->prb_bdqc.knum_blocks - 1;
	return prev;
}
/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);

	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}
static void packet_increment_rx_head(struct packet_sock *po,
					     struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;

	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head + 1 : 0;
}
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}
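/*
 * Illustrative user-space sketch (not part of this file; fd and
 * fanout_group_id are hypothetical names): sockets join a fanout group
 * with the PACKET_FANOUT socket option; the low 16 bits carry the group
 * id and the high bits the mode/flags handled by fanout_add() below.
 *
 *	int val = fanout_group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 */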
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}
static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */
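/*
 * Illustrative user-space sketch (not part of this file; fd, frame and
 * frame_len are hypothetical names): the legacy SOCK_PACKET interface
 * addresses the frame with a sockaddr_pkt naming the device and protocol,
 * and the buffer must already be a complete link-layer frame.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */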
static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;

		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}
/*
 * This function does lazy skb cloning in the hope that most of the packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by the current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to its original state on exit,
 * we will not harm anyone.
 */
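/*
 * Illustrative user-space sketch (not part of this file; fd is a
 * hypothetical name): the filter consulted by run_filter() below is a
 * classic BPF program attached with SO_ATTACH_FILTER; a return value of 0
 * drops the packet, a non-zero value truncates it to that many bytes.
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 0x00000000 },	// ret #0 (drop everything)
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */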
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union tpacket_uhdr h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timespec ts;
	__u32 ts_status;

	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until the current aligned size without
	 * forcing userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);

		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0)
				snaplen = 0;
		}
	} else if (unlikely(macoff + snaplen >
			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
		u32 nval;

		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
			    snaplen, nval, macoff);
		snaplen = nval;
		if (unlikely((int)snaplen < 0)) {
			snaplen = 0;
			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto ring_is_full;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (po->stats.stats1.tp_drops)
			status |= TP_STATUS_LOSING;
	}
	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		getnstimeofday(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (skb_vlan_tag_present(skb)) {
			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
			h.h2->tp_vlan_tpid = 0;
		}
		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_next_offset and vlan are already populated above,
		 * so DONT clear those fields here.
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	if (po->tp_version <= TPACKET_V2) {
		u8 *start, *end;

		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
					macoff + snaplen);

		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
	smp_wmb();
#endif

	if (po->tp_version <= TPACKET_V2) {
		__packet_set_status(po, h.raw, status);
		sk->sk_data_ready(sk);
	} else {
		prb_clear_blk_fill_status(&po->rx_ring);
	}

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.stats1.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_shinfo(skb)->destructor_arg;
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}
static bool ll_header_truncated(const struct net_device *dev, int len)
{
	/* net device doesn't like empty head */
	if (unlikely(len <= dev->hard_header_len)) {
		net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
				     current->comm, len, dev->hard_header_len);
		return true;
	}

	return false;
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr, int hlen)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	if (!packet_use_direct_xmit(po))
		skb_probe_transport_header(skb, 0);
	if (unlikely(po->tp_tx_has_off)) {
		int off_min, off_max, off;

		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (sock->type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
		data = ph.raw + off;
	} else {
		data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				      NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		if (ll_header_truncated(dev, tp_len))
			return -EINVAL;

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen;

	mutex_lock(&po->pg_vec_lock);

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	reserve = dev->hard_header_len + VLAN_HLEN;
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);
		if (unlikely(ph == NULL)) {
			if (need_wait && need_resched())
				schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll),
				!need_wait, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
					  addr, hlen);
		if (tp_len > dev->mtu + dev->hard_header_len) {
			struct ethhdr *ehdr;
			/* Earlier code assumed this would be a VLAN pkt,
			 * double-check this now that we have the actual
			 * packet in hand.
			 */

			skb_reset_mac_header(skb);
			ehdr = eth_hdr(skb);
			if (ehdr->h_proto != htons(ETH_P_8021Q))
				tp_len = -EMSGSIZE;
		}
		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		packet_pick_tx_queue(dev, skb);

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		packet_inc_pending(&po->tx_ring);

		status = TP_STATUS_SEND_REQUEST;
		err = po->xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's a per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition, and luckily don't have to go that path
		 * anyway.
		 */
		 (need_wait && packet_read_pending(&po->tx_ring))));

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					size_t reserve, size_t len,
					size_t linear, int noblock,
					int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with a paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;
	int hlen, tlen;
	int extra_len = 0;
	ssize_t n;

	/*
	 *	Get and verify the address.
	 */

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = -EFAULT;
		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
		if (n != vnet_hdr_len)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
		     __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
		      __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
			vnet_hdr.hdr_len = __cpu_to_virtio16(false,
				 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
				__virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);

		err = -EINVAL;
		if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
			       __virtio16_to_cpu(false, vnet_hdr.hdr_len),
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM) {
		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (unlikely(offset < 0))
			goto out_free;
	} else {
		if (ll_header_truncated(dev, len))
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
	if (err)
		goto out_free;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_free;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	packet_pick_tx_queue(dev, skb);

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
			u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
			if (!skb_partial_csum_set(skb, s, o)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size =
			__virtio16_to_cpu(false, vnet_hdr.gso_size);
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	if (!packet_use_direct_xmit(po))
		skb_probe_transport_header(skb, reserve);
	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	err = po->xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}
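/*
 * Illustrative userspace sketch (not part of this file): with no
 * PACKET_TX_RING configured, a sendto() on an AF_PACKET socket ends up
 * in packet_snd() above; for SOCK_DGRAM sockets the kernel builds the
 * link-layer header from the sockaddr_ll.  The interface name and
 * destination MAC below are placeholders and error handling is trimmed.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int send_one_frame(const unsigned char dst_mac[ETH_ALEN],
 *			   const void *payload, size_t len)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *		struct sockaddr_ll sll;
 *
 *		memset(&sll, 0, sizeof(sll));
 *		sll.sll_family	 = AF_PACKET;
 *		sll.sll_protocol = htons(ETH_P_IP);
 *		sll.sll_ifindex	 = if_nametoindex("eth0");
 *		sll.sll_halen	 = ETH_ALEN;
 *		memcpy(sll.sll_addr, dst_mac, ETH_ALEN);
 *
 *		sendto(fd, payload, len, 0,
 *		       (struct sockaddr *)&sll, sizeof(sll));
 *		close(fd);
 *		return 0;
 *	}
 */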
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}

	fanout_release(sk);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	const struct net_device *dev_curr;
	__be16 proto_curr;
	bool need_rehook;

	if (po->fanout) {
		if (dev)
			dev_put(dev);

		return -EINVAL;
	}

	spin_lock(&po->bind_lock);

	proto_curr = po->prot_hook.type;
	dev_curr = po->prot_hook.dev;

	need_rehook = proto_curr != proto || dev_curr != dev;

	if (need_rehook) {
		unregister_prot_hook(sk, true);

		po->num = proto;
		po->prot_hook.type = proto;

		if (po->prot_hook.dev)
			dev_put(po->prot_hook.dev);

		po->prot_hook.dev = dev;

		po->ifindex = dev ? dev->ifindex : 0;
		packet_cached_dev_assign(po, dev);
	}

	if (proto == 0 || !need_rehook)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	return 0;
}
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev)
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);

out:
	return err;
}
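/*
 * Illustrative userspace sketch (not part of this file): creating a
 * PF_PACKET socket and binding it to a single interface goes through
 * packet_create() below and then packet_do_bind() via packet_bind()
 * above.  CAP_NET_RAW is required; the interface name is a placeholder
 * and error handling is trimmed.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/if_ether.h>
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int open_bound_socket(const char *ifname)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		struct sockaddr_ll sll;
 *
 *		memset(&sll, 0, sizeof(sll));
 *		sll.sll_family	 = AF_PACKET;
 *		sll.sll_protocol = htons(ETH_P_ALL);
 *		sll.sll_ifindex	 = if_nametoindex(ifname);
 *
 *		if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
 *			return -1;
 *		return fd;
 *	}
 */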
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	po->xmit = dev_queue_xmit;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len =
				__cpu_to_virtio16(false, skb_headlen(skb));
			vnet_hdr.gso_size =
				__cpu_to_virtio16(false, sinfo->gso_size);
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = __cpu_to_virtio16(false,
					  skb_checksum_start_offset(skb));
			vnet_hdr.csum_offset = __cpu_to_virtio16(false,
							 skb->csum_offset);
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);
	}

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (skb_vlan_tag_present(skb)) {
			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			aux.tp_vlan_tci = 0;
			aux.tp_vlan_tpid = 0;
		}
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
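/*
 * Illustrative userspace sketch (not part of this file): once
 * PACKET_AUXDATA is enabled, every recvmsg() carries a struct
 * tpacket_auxdata control message built by the code above.  "fd" is
 * assumed to be a bound PF_PACKET socket; error handling is trimmed.
 *
 *	#include <linux/if_packet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	void read_one(int fd)
 *	{
 *		char data[2048];
 *		char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov	= &iov,
 *			.msg_iovlen	= 1,
 *			.msg_control	= ctrl,
 *			.msg_controllen	= sizeof(ctrl),
 *		};
 *		struct cmsghdr *cmsg;
 *		int one = 1;
 *
 *		setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *		recvmsg(fd, &msg, 0);
 *
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *			if (cmsg->cmsg_level == SOL_PACKET &&
 *			    cmsg->cmsg_type == PACKET_AUXDATA) {
 *				struct tpacket_auxdata aux;
 *
 *				memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
 *				// aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci, ...
 *			}
 *		}
 *	}
 */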
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist_delete(struct net_device *dev,
				     struct packet_mclist **mlp)
{
	struct packet_mclist *ml;

	while ((ml = *mlp) != NULL) {
		if (ml->ifindex == dev->ifindex) {
			packet_dev_mc(dev, ml, -1);
			*mlp = ml->next;
			kfree(ml);
		} else
			mlp = &ml->next;
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			break;
		}
	}
	rtnl_unlock();
	return 0;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
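/*
 * Illustrative userspace sketch (not part of this file): the membership
 * helpers above are driven by PACKET_ADD_MEMBERSHIP/PACKET_DROP_MEMBERSHIP.
 * Enabling promiscuous mode this way is reference counted and, as
 * packet_flush_mclist() shows, undone automatically when the socket is
 * closed.  Error handling is trimmed.
 *
 *	#include <linux/if_packet.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int enable_promisc(int fd, const char *ifname)
 *	{
 *		struct packet_mreq mreq;
 *
 *		memset(&mreq, 0, sizeof(mreq));
 *		mreq.mr_ifindex = if_nametoindex(ifname);
 *		mreq.mr_type	= PACKET_MR_PROMISC;
 *		return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 */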
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
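/*
 * Illustrative userspace sketch (not part of this file): setting up a
 * TPACKET_V2 receive ring with the PACKET_VERSION and PACKET_RX_RING
 * options handled above.  The sizes are examples only; they satisfy the
 * checks in packet_set_ring(): tp_block_size is a multiple of the page
 * size, tp_frame_size is a multiple of TPACKET_ALIGNMENT, and
 * tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr.
 *
 *	#include <linux/if_packet.h>
 *	#include <sys/socket.h>
 *
 *	int setup_rx_ring(int fd)
 *	{
 *		int version = TPACKET_V2;
 *		struct tpacket_req req = {
 *			.tp_block_size = 1 << 16,	// 64 KiB per block
 *			.tp_block_nr   = 64,
 *			.tp_frame_size = 1 << 11,	// 2 KiB per frame
 *			.tp_frame_nr   = (1 << 16) / (1 << 11) * 64,
 *		};
 *
 *		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *			       &version, sizeof(version)) < 0)
 *			return -1;
 *		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *				  &req, sizeof(req));
 *	}
 */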
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
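/*
 * Illustrative userspace sketch (not part of this file): reading
 * PACKET_STATISTICS.  As the code above shows, the counters are zeroed
 * each time they are read, so each call returns the delta since the
 * previous one, and tp_packets already includes the dropped frames.
 *
 *	#include <linux/if_packet.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	void dump_stats(int fd)
 *	{
 *		struct tpacket_stats st;
 *		socklen_t len = sizeof(st);
 *
 *		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *			printf("packets %u, drops %u\n",
 *			       st.tp_packets, st.tp_drops);
 *	}
 */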
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
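/*
 * Illustrative userspace sketch (not part of this file): packet_poll()
 * above reports POLLIN|POLLRDNORM when the next ring frame belongs to
 * user space, so a reader can sleep in poll() instead of spinning on
 * tp_status.  "fd" is assumed to have an RX ring configured.
 *
 *	#include <poll.h>
 *
 *	int wait_for_frames(int fd)
 *	{
 *		struct pollfd pfd = {
 *			.fd	= fd,
 *			.events	= POLLIN | POLLRDNORM,
 *		};
 *
 *		return poll(&pfd, 1, -1);	// block until a frame is ready
 *	}
 */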
/* Dirty? Well, I still did not learn better way to account
 * ...
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (po->tp_version >= TPACKET_V3 &&
		    (int)(req->tp_block_size -
			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
		/* Transmit path is not supported. We checked
		 * it above but just being paranoid
		 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}


	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
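/*
 * Illustrative userspace sketch (not part of this file): mapping the
 * ring configured earlier and walking TPACKET_V2 frames.  The mapping
 * length must equal tp_block_size * tp_block_nr (packet_mmap() above
 * rejects anything else); a frame is handed back to the kernel by
 * writing TP_STATUS_KERNEL.  Sizes match the earlier setup sketch and
 * error handling is trimmed.
 *
 *	#include <linux/if_packet.h>
 *	#include <stdint.h>
 *	#include <sys/mman.h>
 *
 *	void rx_loop(int fd)
 *	{
 *		const unsigned int frame_size = 1 << 11, frame_nr = 2048;
 *		size_t map_len = (size_t)(1 << 16) * 64;
 *		uint8_t *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
 *				     MAP_SHARED, fd, 0);
 *		unsigned int i = 0;
 *
 *		for (;;) {
 *			struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
 *					(ring + (size_t)i * frame_size);
 *
 *			if (!(hdr->tp_status & TP_STATUS_USER))
 *				continue;	// or sleep in poll()
 *			// frame data starts at (uint8_t *)hdr + hdr->tp_mac
 *			hdr->tp_status = TP_STATUS_KERNEL;
 *			i = (i + 1) % frame_nr;
 *		}
 *	}
 */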
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);