 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/cacheflush.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#include <net/inet_common.h>
/*
   Assumptions:
   - If the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); other ones are silly.
   - The packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

   Incoming, dev->hard_header != NULL
     mac_header -> ll header

   Outgoing, dev->hard_header != NULL
     mac_header -> ll header

   Incoming, dev->hard_header == NULL
     mac_header -> UNKNOWN position. It is very likely that it points to the
		   ll header. PPP does this, which is wrong, because it
		   introduces asymmetry between the rx and tx paths.

   Outgoing, dev->hard_header == NULL
     mac_header -> data. The ll header is still not built!

   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.

   On transmit:

   dev->hard_header != NULL
     mac_header -> ll header

   dev->hard_header == NULL (the ll header is added by the device,
   we cannot control it)

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
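
/* Worked example: with V3_ALIGNMENT == 8, a requested per-block private area
 * of, say, 13 bytes is rounded up to 16, so BLK_PLUS_PRIV(13) == BLK_HDR_LEN
 * + 16, and the first packet of a TPACKET_V3 block then starts at that
 * aligned offset (see BLOCK_O2FP()).
 */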
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}
static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}
/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}
/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.   If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
	}
}
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		return 0;
	}
}
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));

	return ts_status;
}
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}
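
/* Worked example: with rb->frames_per_block == 4 and frame index 9,
 * pg_vec_pos == 9 / 4 == 2 and frame_offset == 9 % 4 == 1, so the frame lives
 * at rb->pg_vec[2].buffer + 1 * rb->frame_size.
 */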
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
					  int tx_ring,
					  struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}
static void prb_init_blk_timer(struct packet_sock *po,
			       struct tpacket_kbdq_core *pkc,
			       void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				   int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);

	/*
	 * If the link speed is so slow you don't really
	 * need to worry about perf anyways
	 */
	if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
		return DEFAULT_PRB_RETIRE_TOV;
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			    union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			  struct packet_ring_buffer *rb,
			  struct pgv *pg_vec,
			  union tpacket_req_u *req_u, int tx_ring)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po, tx_ring);
	prb_open_block(p1, pbd);
}
/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
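
/* Worked figure: a 1 MB block holds 8 * 2^20 bits, so on a 1 Gbit/s link it
 * takes roughly (8 * 2^20) / 10^9 s, i.e. about 8.4 ms, to fill - which is
 * where the "~8 ms" estimate above comes from.
 */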
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));
#endif
}
/*
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1,
			    struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
			   struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);
}
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			     struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
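
/* Worked example: with V3_ALIGNMENT == 8, a 61-byte packet consumes
 * TOTAL_PKT_LEN_INCL_ALIGN(61) == 64 bytes of block space, keeping
 * nxt_offset 8-byte aligned for the next tpacket3_hdr.
 */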
/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
	}
}
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}
static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}
static void *packet_current_rx_frame(struct packet_sock *po,
				     struct sk_buff *skb,
				     int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					   po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		return NULL;
	}
}
static void *prb_lookup_block(struct packet_sock *po,
			      struct packet_ring_buffer *rb,
			      unsigned int idx,
			      int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}
/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
				      struct packet_ring_buffer *rb,
				      int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}
static void packet_increment_rx_head(struct packet_sock *po,
				     struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
				   struct packet_ring_buffer *rb,
				   int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}
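
/* Note: the pending refcount is only allocated for the TX ring. It is bumped
 * in tpacket_snd() when a frame is handed to the driver, dropped again in
 * tpacket_destruct_skb(), and summed over all CPUs by packet_read_pending()
 * so the sender can wait for outstanding frames.
 */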
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	bool has_room;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
	else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
}
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	int cur, old;

	cur = atomic_read(&f->rr_cur);
	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
				     fanout_rr_next(f, num))) != cur)
		cur = old;
	return cur;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, unsigned int skip,
					  unsigned int num)
{
	unsigned int i, j;

	i = j = min_t(int, f->next[idx], num - 1);
	do {
		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
			if (i != j)
				f->next[idx] = i;
			return i;
		}
		if (++i == num)
			i = 0;
	} while (i != j);

	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}
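
/* Illustrative user-space sketch (assumes the standard PACKET_FANOUT
 * setsockopt layout: group id in the low 16 bits, mode/flags in the high 16):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int arg = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Packets arriving on the bound interface are then steered to one group
 * member by the demux helpers above.
 */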
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = f->num_members;
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
			if (!skb)
				return 0;
		}
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
		break;
	}

	po = pkt_sk(f->arr[idx]);
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
	    unlikely(!packet_rcv_has_room(po, skb))) {
		idx = fanout_demux_rollover(f, skb, idx, idx, num);
		po = pkt_sk(f->arr[idx]);
	}

	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EALREADY;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	return err;
}
static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
static unsigned int run_filter(const struct sk_buff *skb,
			       const struct sock *sk,
			       unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}

/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_drops++;
	atomic_inc(&sk->sk_drops);
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union tpacket_uhdr h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timespec ts;
	__u32 ts_status;

	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
	 * We may add members to them until current aligned size without forcing
	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
	 */
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned int maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}
	if (po->tp_version <= TPACKET_V2) {
		if (macoff + snaplen > po->rx_ring.frame_size) {
			if (po->copy_thresh &&
			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				if (skb_shared(skb)) {
					copy_skb = skb_clone(skb, GFP_ATOMIC);
				} else {
					copy_skb = skb_get(skb);
					skb_head = skb->data;
				}
				if (copy_skb)
					skb_set_owner_r(copy_skb, sk);
			}
			snaplen = po->rx_ring.frame_size - macoff;
			if ((int)snaplen < 0)
				snaplen = 0;
		}
	} else if (unlikely(macoff + snaplen >
			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
		u32 nval;

		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
			    snaplen, nval, macoff);
		snaplen = nval;
		if (unlikely((int)snaplen < 0)) {
			snaplen = 0;
			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
		}
	}
	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_rx_frame(po, skb,
					TP_STATUS_KERNEL, (macoff+snaplen));
	if (!h.raw)
		goto ring_is_full;
	if (po->tp_version <= TPACKET_V2) {
		packet_increment_rx_head(po, &po->rx_ring);
	/*
	 * LOSING will be reported till you read the stats,
	 * because it's COR - Clear On Read.
	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
	 * at packet level.
	 */
		if (po->stats.stats1.tp_drops)
			status |= TP_STATUS_LOSING;
	}
	po->stats.stats1.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		getnstimeofday(&ts);

	status |= ts_status;

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		if (skb_vlan_tag_present(skb)) {
			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			h.h2->tp_vlan_tci = 0;
			h.h2->tp_vlan_tpid = 0;
		}
		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
		hdrlen = sizeof(*h.h2);
		break;
	case TPACKET_V3:
		/* tp_nxt_offset and vlan are already populated above.
		 * So DON'T clear those fields here.
		 */
		h.h3->tp_status |= status;
		h.h3->tp_len = skb->len;
		h.h3->tp_snaplen = snaplen;
		h.h3->tp_mac = macoff;
		h.h3->tp_net = netoff;
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
		hdrlen = sizeof(*h.h3);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	if (po->tp_version <= TPACKET_V2) {
		u8 *start, *end;

		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
					macoff + snaplen);

		for (start = h.raw; start < end; start += PAGE_SIZE)
			flush_dcache_page(pgv_to_page(start));
	}
#endif

	if (po->tp_version <= TPACKET_V2) {
		__packet_set_status(po, h.raw, status);
		sk->sk_data_ready(sk);
	} else {
		prb_clear_blk_fill_status(&po->rx_ring);
	}

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.stats1.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);

	if (likely(po->tx_ring.pg_vec)) {
		void *ph;
		__u32 ts;

		ph = skb_shinfo(skb)->destructor_arg;
		packet_dec_pending(&po->tx_ring);

		ts = __packet_set_timestamp(po, ph, skb);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
	}

	sock_wfree(skb);
}
static bool ll_header_truncated(const struct net_device *dev, int len)
{
	/* net device doesn't like empty head */
	if (unlikely(len <= dev->hard_header_len)) {
		net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
				     current->comm, len, dev->hard_header_len);
		return true;
	}

	return false;
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr, int hlen)
{
	union tpacket_uhdr ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, hlen);
	skb_reset_network_header(skb);

	if (!packet_use_direct_xmit(po))
		skb_probe_transport_header(skb, 0);
	if (unlikely(po->tp_tx_has_off)) {
		int off_min, off_max, off;
		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
		off_max = po->tx_ring.frame_size - tp_len;
		if (sock->type == SOCK_DGRAM) {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_net;
				break;
			default:
				off = ph.h1->tp_net;
				break;
			}
		} else {
			switch (po->tp_version) {
			case TPACKET_V2:
				off = ph.h2->tp_mac;
				break;
			default:
				off = ph.h1->tp_mac;
				break;
			}
		}
		if (unlikely((off < off_min) || (off_max < off)))
			return -EINVAL;
		data = ph.raw + off;
	} else {
		data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	}
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		if (ll_header_truncated(dev, tp_len))
			return -EINVAL;

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		page = pgv_to_page(data);
		data += len;
		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb, nr_frags, page, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int err, reserve = 0;
	void *ph;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = TP_STATUS_AVAILABLE;
	int hlen, tlen;

	mutex_lock(&po->pg_vec_lock);

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	reserve = dev->hard_header_len + VLAN_HLEN;
	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
					  TP_STATUS_SEND_REQUEST);
		if (unlikely(ph == NULL)) {
			if (need_wait && need_resched())
				schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		hlen = LL_RESERVED_SPACE(dev);
		tlen = dev->needed_tailroom;
		skb = sock_alloc_send_skb(&po->sk,
				hlen + tlen + sizeof(struct sockaddr_ll),
				!need_wait, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
					  addr, hlen);
		if (tp_len > dev->mtu + dev->hard_header_len) {
			struct ethhdr *ehdr;
			/* Earlier code assumed this would be a VLAN pkt,
			 * double-check this now that we have the actual
			 * packet in hand.
			 */

			skb_reset_mac_header(skb);
			ehdr = eth_hdr(skb);
			if (ehdr->h_proto != htons(ETH_P_8021Q))
				tp_len = -EMSGSIZE;
		}
		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		packet_pick_tx_queue(dev, skb);

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		packet_inc_pending(&po->tx_ring);

		status = TP_STATUS_SEND_REQUEST;
		err = po->xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
		/* Note: packet_read_pending() might be slow if we have
		 * to call it as it's per_cpu variable, but in fast-path
		 * we already short-circuit the loop with the first
		 * condition, and luckily don't have to go that path
		 * anyway.
		 */
		 (need_wait && packet_read_pending(&po->tx_ring))));

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					size_t reserve, size_t len,
					size_t linear, int noblock,
					int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err, 0);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;
	int hlen, tlen;
	int extra_len = 0;
	ssize_t n;

	/*
	 *	Get and verify the address.
	 */

	if (likely(saddr == NULL)) {
		dev	= packet_cached_dev_get(po);
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
	}

	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out_unlock;
	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_unlock;

	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;
	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = -EFAULT;
		n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
		if (n != vnet_hdr_len)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
		     __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
		      __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
			vnet_hdr.hdr_len = __cpu_to_virtio16(false,
				 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
				__virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);

		err = -EINVAL;
		if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
		goto out_unlock;

	err = -ENOBUFS;
	hlen = LL_RESERVED_SPACE(dev);
	tlen = dev->needed_tailroom;
	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
			       __virtio16_to_cpu(false, vnet_hdr.hdr_len),
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM) {
		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (unlikely(offset < 0))
			goto out_free;
	} else {
		if (ll_header_truncated(dev, len))
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
	if (err)
		goto out_free;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_free;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	packet_pick_tx_queue(dev, skb);

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
			u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
			if (!skb_partial_csum_set(skb, s, o)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size =
			__virtio16_to_cpu(false, vnet_hdr.gso_size);
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	if (!packet_use_direct_xmit(po))
		skb_probe_transport_header(skb, reserve);
	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	err = po->xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}
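/* Illustrative note: sendmsg() on a packet socket is dispatched here. Once a
 * TX ring has been configured via PACKET_TX_RING, every send goes through
 * tpacket_snd() and the payload is taken from the ring frames rather than the
 * msghdr iovec; otherwise packet_snd() builds a fresh skb from the iovec.
 */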
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}

	fanout_release(sk);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}
/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
{
	struct packet_sock *po = pkt_sk(sk);
	const struct net_device *dev_curr;
	__be16 proto_curr;
	bool need_rehook;

	spin_lock(&po->bind_lock);

	proto_curr = po->prot_hook.type;
	dev_curr = po->prot_hook.dev;

	need_rehook = proto_curr != proto || dev_curr != dev;

	if (need_rehook) {
		unregister_prot_hook(sk, true);

		po->num = proto;
		po->prot_hook.type = proto;

		if (po->prot_hook.dev)
			dev_put(po->prot_hook.dev);

		po->prot_hook.dev = dev;

		po->ifindex = dev ? dev->ifindex : 0;
		packet_cached_dev_assign(po, dev);
	}

	if (proto == 0 || !need_rehook)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	return 0;
}
/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev)
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;

	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);

out:
	return err;
}
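/* Illustrative user-space sketch, assuming the usual uapi headers
 * (<netpacket/packet.h>, <net/if.h>, <netinet/if_ether.h>); error handling
 * omitted:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero sll_protocol keeps the protocol the socket was created with, and a
 * zero sll_ifindex leaves the socket listening on all interfaces.
 */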
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	po->xmit = dev_queue_xmit;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}
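/* Illustrative note: reaching this function requires CAP_NET_RAW; a typical
 * user-space call is
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 * SOCK_RAW delivers frames with the link-layer header in place, SOCK_DGRAM
 * has it removed, and the legacy SOCK_PACKET type selects packet_ops_spkt.
 */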
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = sock_recv_errqueue(sk, msg, len,
					 SOL_PACKET, PACKET_TX_TIMESTAMP);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len =
				__cpu_to_virtio16(false, skb_headlen(skb));
			vnet_hdr.gso_size =
				__cpu_to_virtio16(false, sinfo->gso_size);
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = __cpu_to_virtio16(false,
					  skb_checksum_start_offset(skb));
			vnet_hdr.csum_offset = __cpu_to_virtio16(false,
							 skb->csum_offset);
		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
		} /* else everything is zero */

		err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/* You lose any data beyond the buffer you gave. If it worries
	 * a user program they can ask the device for its MTU
	 * anyway.
	 */
	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name) {
		/* If the address length field is there to be filled
		 * in, we fill it in now.
		 */
		if (sock->type == SOCK_PACKET) {
			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
			msg->msg_namelen = sizeof(struct sockaddr_pkt);
		} else {
			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
			msg->msg_namelen = sll->sll_halen +
				offsetof(struct sockaddr_ll, sll_addr);
		}
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);
	}

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		if (skb_vlan_tag_present(skb)) {
			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
		} else {
			aux.tp_vlan_tci = 0;
			aux.tp_vlan_tpid = 0;
		}
		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
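/* Illustrative user-space sketch, assuming PACKET_AUXDATA was enabled via
 * setsockopt(): the metadata filled in above arrives as a control message,
 * e.g.
 *
 *	struct cmsghdr *cmsg;
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			/+ aux->tp_status, aux->tp_vlan_tci, ... +/
 *		}
 *	}
 */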
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
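/* Illustrative user-space sketch of the PACKET_RX_RING case above, assuming a
 * TPACKET_V2 ring and 4 KiB pages (error handling omitted):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * The block size must be a multiple of PAGE_SIZE, the frame size a multiple of
 * TPACKET_ALIGNMENT, and tp_frame_nr must equal
 * tp_block_nr * (tp_block_size / tp_frame_size); packet_set_ring() below
 * enforces all of this.
 */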
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num && !po->running)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
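/* Illustrative note: with a ring mapped, POLLIN is reported when the frame
 * just behind the kernel's head is no longer TP_STATUS_KERNEL (i.e. it has
 * been handed to user space and not yet consumed), and POLLOUT when the
 * current TX frame is TP_STATUS_AVAILABLE, so a ring consumer can drive
 * itself entirely from poll(2) plus the tp_status words in the frames.
 */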
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (po->tp_version >= TPACKET_V3 &&
		    (int)(req->tp_block_size -
			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
			break;
		default:
			break;
		}
	} else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
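/* Illustrative user-space sketch, assuming only a receive ring was configured
 * (with both rings, the mapping length must be the sum of the two and the RX
 * ring comes first):
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * packet_mmap() above rejects any length that does not match the configured
 * rings, and vm_pgoff must be zero.
 */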
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};
#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);