// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 *	Alan Cox	:	Numerous verify_area() problems
 *	Alan Cox	:	Connecting on a connecting socket
 *				now returns an error for tcp.
 *	Alan Cox	:	sock->protocol is set correctly.
 *				and is not sometimes left as 0.
 *	Alan Cox	:	connect handles icmp errors on a
 *				connect properly. Unfortunately there
 *				is a restart syscall nasty there. I
 *				can't match BSD without hacking the C
 *				library. Ideas urgently sought!
 *	Alan Cox	:	Disallow bind() to addresses that are
 *				not ours - especially broadcast ones!!
 *	Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *	Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *				instead they leave that for the DESTROY timer.
 *	Alan Cox	:	Clean up error flag in accept
 *	Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *				was buggy. Put a remove_sock() in the handler
 *				for memory when we hit 0. Also altered the timer
 *				code. The ACK stuff can wait and needs major
 *	Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *				and fixed timer/inet_bh race.
 *	Alan Cox	:	Added zapped flag for TCP
 *	Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *	Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *	Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *	Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *	Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *	Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *	C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *	Alan Cox	:	Fixed connect() taking signals I think.
 *	Alan Cox	:	SO_LINGER supported
 *	Alan Cox	:	Error reporting fixes
 *	Anonymous	:	inet_create tidied up (sk->reuse setting)
 *	Alan Cox	:	inet sockets don't set sk->type!
 *	Alan Cox	:	Split socket option code
 *	Alan Cox	:	Callbacks
 *	Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *	Alex		:	Removed restriction on inet fioctl
 *	Alan Cox	:	Splitting INET from NET core
 *	Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *	Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *	Alan Cox	:	Split IP from generic code
 *	Alan Cox	:	New kfree_skbmem()
 *	Alan Cox	:	Make SO_DEBUG superuser only.
 *	Alan Cox	:	Allow anyone to clear SO_DEBUG
 *	Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *	Alan Cox	:	Allocator for a socket is settable.
 *	Alan Cox	:	SO_ERROR includes soft errors.
 *	Alan Cox	:	Allow NULL arguments on some SO_ opts
 *	Alan Cox	:	Generic socket allocation to make hooks
 *				easier (suggested by Craig Metz).
 *	Michael Pall	:	SO_ERROR returns positive errno again
 *	Steve Whitehouse:	Added default destructor to free
 *				protocol private data.
 *	Steve Whitehouse:	Added various other default routines
 *				common to several socket families.
 *	Chris Evans	:	Call suser() check last on F_SETOWN
 *	Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *	Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *	Andi Kleen	:	Fix write_space callback
 *	Chris Evans	:	Security fixes - signedness again
 *	Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
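
/*
 * Example (illustrative sketch, not taken from an in-tree caller): a
 * protocol-level setsockopt() handler would typically gate a privileged,
 * per-netns option on sk_net_capable() rather than capable(), so the check
 * is made against the socket's own network namespace.  The function name
 * below is hypothetical:
 *
 *	static int demo_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		WRITE_ONCE(sk->sk_priority, val);
 *		return 0;
 *	}
 */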
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
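
/*
 * Example (illustrative sketch, not a copy of an in-tree user): a subsystem
 * doing memory-reclaim-related I/O over a kernel socket (swap over a network
 * block device, for instance) would typically mark the socket for the period
 * it is used for reclaim and clear the flag when that use ends.  The function
 * names are hypothetical:
 *
 *	static void demo_swap_enable(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);	// may dip into emergency reserves
 *	}
 *
 *	static void demo_swap_disable(struct socket *sock)
 *	{
 *		sk_clear_memalloc(sock->sk);	// reclaim reserves, obey rmem limits
 *	}
 */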
325 int __sk_backlog_rcv(struct sock
*sk
, struct sk_buff
*skb
)
328 unsigned int noreclaim_flag
;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk
, SOCK_MEMALLOC
));
333 noreclaim_flag
= memalloc_noreclaim_save();
334 ret
= INDIRECT_CALL_INET(sk
->sk_backlog_rcv
,
338 memalloc_noreclaim_restore(noreclaim_flag
);
342 EXPORT_SYMBOL(__sk_backlog_rcv
);
344 void sk_error_report(struct sock
*sk
)
346 sk
->sk_error_report(sk
);
348 switch (sk
->sk_family
) {
352 trace_inet_sk_error_report(sk
);
358 EXPORT_SYMBOL(sk_error_report
);
360 int sock_get_timeout(long timeo
, void *optval
, bool old_timeval
)
362 struct __kernel_sock_timeval tv
;
364 if (timeo
== MAX_SCHEDULE_TIMEOUT
) {
368 tv
.tv_sec
= timeo
/ HZ
;
369 tv
.tv_usec
= ((timeo
% HZ
) * USEC_PER_SEC
) / HZ
;
372 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
373 struct old_timeval32 tv32
= { tv
.tv_sec
, tv
.tv_usec
};
374 *(struct old_timeval32
*)optval
= tv32
;
379 struct __kernel_old_timeval old_tv
;
380 old_tv
.tv_sec
= tv
.tv_sec
;
381 old_tv
.tv_usec
= tv
.tv_usec
;
382 *(struct __kernel_old_timeval
*)optval
= old_tv
;
383 return sizeof(old_tv
);
386 *(struct __kernel_sock_timeval
*)optval
= tv
;
389 EXPORT_SYMBOL(sock_get_timeout
);
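
/*
 * Example (userspace view, illustrative): sock_get_timeout() backs
 * getsockopt(SO_RCVTIMEO)/getsockopt(SO_SNDTIMEO); the "_OLD" variants
 * correspond to the classic struct timeval layout:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *		printf("rcv timeout: %ld.%06ld s\n",
 *		       (long)tv.tv_sec, (long)tv.tv_usec);
 */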
391 int sock_copy_user_timeval(struct __kernel_sock_timeval
*tv
,
392 sockptr_t optval
, int optlen
, bool old_timeval
)
394 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
395 struct old_timeval32 tv32
;
397 if (optlen
< sizeof(tv32
))
400 if (copy_from_sockptr(&tv32
, optval
, sizeof(tv32
)))
402 tv
->tv_sec
= tv32
.tv_sec
;
403 tv
->tv_usec
= tv32
.tv_usec
;
404 } else if (old_timeval
) {
405 struct __kernel_old_timeval old_tv
;
407 if (optlen
< sizeof(old_tv
))
409 if (copy_from_sockptr(&old_tv
, optval
, sizeof(old_tv
)))
411 tv
->tv_sec
= old_tv
.tv_sec
;
412 tv
->tv_usec
= old_tv
.tv_usec
;
414 if (optlen
< sizeof(*tv
))
416 if (copy_from_sockptr(tv
, optval
, sizeof(*tv
)))
422 EXPORT_SYMBOL(sock_copy_user_timeval
);
424 static int sock_set_timeout(long *timeo_p
, sockptr_t optval
, int optlen
,
427 struct __kernel_sock_timeval tv
;
428 int err
= sock_copy_user_timeval(&tv
, optval
, optlen
, old_timeval
);
434 if (tv
.tv_usec
< 0 || tv
.tv_usec
>= USEC_PER_SEC
)
438 static int warned __read_mostly
;
440 WRITE_ONCE(*timeo_p
, 0);
441 if (warned
< 10 && net_ratelimit()) {
443 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 __func__
, current
->comm
, task_pid_nr(current
));
448 val
= MAX_SCHEDULE_TIMEOUT
;
449 if ((tv
.tv_sec
|| tv
.tv_usec
) &&
450 (tv
.tv_sec
< (MAX_SCHEDULE_TIMEOUT
/ HZ
- 1)))
451 val
= tv
.tv_sec
* HZ
+ DIV_ROUND_UP((unsigned long)tv
.tv_usec
,
453 WRITE_ONCE(*timeo_p
, val
);
457 static bool sock_needs_netstamp(const struct sock
*sk
)
459 switch (sk
->sk_family
) {
468 static void sock_disable_timestamp(struct sock
*sk
, unsigned long flags
)
470 if (sk
->sk_flags
& flags
) {
471 sk
->sk_flags
&= ~flags
;
472 if (sock_needs_netstamp(sk
) &&
473 !(sk
->sk_flags
& SK_FLAGS_TIMESTAMP
))
474 net_disable_timestamp();
479 int __sock_queue_rcv_skb(struct sock
*sk
, struct sk_buff
*skb
)
482 struct sk_buff_head
*list
= &sk
->sk_receive_queue
;
484 if (atomic_read(&sk
->sk_rmem_alloc
) >= READ_ONCE(sk
->sk_rcvbuf
)) {
485 atomic_inc(&sk
->sk_drops
);
486 trace_sock_rcvqueue_full(sk
, skb
);
490 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
)) {
491 atomic_inc(&sk
->sk_drops
);
496 skb_set_owner_r(skb
, sk
);
498 /* we escape from rcu protected region, make sure we dont leak
503 spin_lock_irqsave(&list
->lock
, flags
);
504 sock_skb_set_dropcount(sk
, skb
);
505 __skb_queue_tail(list
, skb
);
506 spin_unlock_irqrestore(&list
->lock
, flags
);
508 if (!sock_flag(sk
, SOCK_DEAD
))
509 sk
->sk_data_ready(sk
);
512 EXPORT_SYMBOL(__sock_queue_rcv_skb
);
514 int sock_queue_rcv_skb_reason(struct sock
*sk
, struct sk_buff
*skb
,
515 enum skb_drop_reason
*reason
)
517 enum skb_drop_reason drop_reason
;
520 err
= sk_filter(sk
, skb
);
522 drop_reason
= SKB_DROP_REASON_SOCKET_FILTER
;
525 err
= __sock_queue_rcv_skb(sk
, skb
);
528 drop_reason
= SKB_DROP_REASON_SOCKET_RCVBUFF
;
531 drop_reason
= SKB_DROP_REASON_PROTO_MEM
;
534 drop_reason
= SKB_NOT_DROPPED_YET
;
539 *reason
= drop_reason
;
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason
);
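
/*
 * Example (illustrative sketch): a protocol receive handler would typically
 * let sock_queue_rcv_skb_reason() classify the failure and feed the reason
 * to kfree_skb_reason() so that drop monitoring stays meaningful.  The
 * function name below is hypothetical:
 *
 *	static int demo_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */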
544 int __sk_receive_skb(struct sock
*sk
, struct sk_buff
*skb
,
545 const int nested
, unsigned int trim_cap
, bool refcounted
)
547 int rc
= NET_RX_SUCCESS
;
549 if (sk_filter_trim_cap(sk
, skb
, trim_cap
))
550 goto discard_and_relse
;
554 if (sk_rcvqueues_full(sk
, READ_ONCE(sk
->sk_rcvbuf
))) {
555 atomic_inc(&sk
->sk_drops
);
556 goto discard_and_relse
;
559 bh_lock_sock_nested(sk
);
562 if (!sock_owned_by_user(sk
)) {
564 * trylock + unlock semantics:
566 mutex_acquire(&sk
->sk_lock
.dep_map
, 0, 1, _RET_IP_
);
568 rc
= sk_backlog_rcv(sk
, skb
);
570 mutex_release(&sk
->sk_lock
.dep_map
, _RET_IP_
);
571 } else if (sk_add_backlog(sk
, skb
, READ_ONCE(sk
->sk_rcvbuf
))) {
573 atomic_inc(&sk
->sk_drops
);
574 goto discard_and_relse
;
586 EXPORT_SYMBOL(__sk_receive_skb
);
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ip6_dst_check(struct dst_entry
*,
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ipv4_dst_check(struct dst_entry
*,
592 struct dst_entry
*__sk_dst_check(struct sock
*sk
, u32 cookie
)
594 struct dst_entry
*dst
= __sk_dst_get(sk
);
596 if (dst
&& dst
->obsolete
&&
597 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
598 dst
, cookie
) == NULL
) {
599 sk_tx_queue_clear(sk
);
600 WRITE_ONCE(sk
->sk_dst_pending_confirm
, 0);
601 RCU_INIT_POINTER(sk
->sk_dst_cache
, NULL
);
608 EXPORT_SYMBOL(__sk_dst_check
);
610 struct dst_entry
*sk_dst_check(struct sock
*sk
, u32 cookie
)
612 struct dst_entry
*dst
= sk_dst_get(sk
);
614 if (dst
&& dst
->obsolete
&&
615 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
616 dst
, cookie
) == NULL
) {
624 EXPORT_SYMBOL(sk_dst_check
);
626 static int sock_bindtoindex_locked(struct sock
*sk
, int ifindex
)
628 int ret
= -ENOPROTOOPT
;
629 #ifdef CONFIG_NETDEVICES
630 struct net
*net
= sock_net(sk
);
634 if (sk
->sk_bound_dev_if
&& !ns_capable(net
->user_ns
, CAP_NET_RAW
))
641 /* Paired with all READ_ONCE() done locklessly. */
642 WRITE_ONCE(sk
->sk_bound_dev_if
, ifindex
);
644 if (sk
->sk_prot
->rehash
)
645 sk
->sk_prot
->rehash(sk
);
656 int sock_bindtoindex(struct sock
*sk
, int ifindex
, bool lock_sk
)
662 ret
= sock_bindtoindex_locked(sk
, ifindex
);
668 EXPORT_SYMBOL(sock_bindtoindex
);
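
/*
 * Example (userspace view, illustrative): SO_BINDTOIFINDEX ends up in
 * sock_bindtoindex_locked() above, just like SO_BINDTODEVICE does once the
 * interface name has been resolved to an ifindex.  "eth0" is a placeholder:
 *
 *	int ifindex = if_nametoindex("eth0");
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX,
 *		       &ifindex, sizeof(ifindex)) < 0)
 *		perror("setsockopt(SO_BINDTOIFINDEX)");
 */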
670 static int sock_setbindtodevice(struct sock
*sk
, sockptr_t optval
, int optlen
)
672 int ret
= -ENOPROTOOPT
;
673 #ifdef CONFIG_NETDEVICES
674 struct net
*net
= sock_net(sk
);
675 char devname
[IFNAMSIZ
];
682 /* Bind this socket to a particular device like "eth0",
683 * as specified in the passed interface name. If the
684 * name is "" or the option length is zero the socket
687 if (optlen
> IFNAMSIZ
- 1)
688 optlen
= IFNAMSIZ
- 1;
689 memset(devname
, 0, sizeof(devname
));
692 if (copy_from_sockptr(devname
, optval
, optlen
))
696 if (devname
[0] != '\0') {
697 struct net_device
*dev
;
700 dev
= dev_get_by_name_rcu(net
, devname
);
702 index
= dev
->ifindex
;
709 sockopt_lock_sock(sk
);
710 ret
= sock_bindtoindex_locked(sk
, index
);
711 sockopt_release_sock(sk
);
718 static int sock_getbindtodevice(struct sock
*sk
, sockptr_t optval
,
719 sockptr_t optlen
, int len
)
721 int ret
= -ENOPROTOOPT
;
722 #ifdef CONFIG_NETDEVICES
723 int bound_dev_if
= READ_ONCE(sk
->sk_bound_dev_if
);
724 struct net
*net
= sock_net(sk
);
725 char devname
[IFNAMSIZ
];
727 if (bound_dev_if
== 0) {
736 ret
= netdev_get_name(net
, devname
, bound_dev_if
);
740 len
= strlen(devname
) + 1;
743 if (copy_to_sockptr(optval
, devname
, len
))
748 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
759 bool sk_mc_loop(const struct sock
*sk
)
761 if (dev_recursion_level())
765 /* IPV6_ADDRFORM can change sk->sk_family under us. */
766 switch (READ_ONCE(sk
->sk_family
)) {
768 return inet_test_bit(MC_LOOP
, sk
);
769 #if IS_ENABLED(CONFIG_IPV6)
771 return inet6_test_bit(MC6_LOOP
, sk
);
777 EXPORT_SYMBOL(sk_mc_loop
);
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
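
/*
 * Example (illustrative sketch): kernel users typically call these helpers
 * directly when setting up an internal socket, instead of going through
 * sock_setsockopt() with user pointers.  The function name is hypothetical:
 *
 *	static void demo_setup_kernel_sock(struct socket *sock)
 *	{
 *		sock_set_reuseaddr(sock->sk);
 *		sock_no_linger(sock->sk);
 *		sock_set_sndtimeo(sock->sk, 5);	// 5 second send timeout
 *	}
 */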
821 static void __sock_set_timestamps(struct sock
*sk
, bool val
, bool new, bool ns
)
823 sock_valbool_flag(sk
, SOCK_RCVTSTAMP
, val
);
824 sock_valbool_flag(sk
, SOCK_RCVTSTAMPNS
, val
&& ns
);
826 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, new);
827 sock_enable_timestamp(sk
, SOCK_TIMESTAMP
);
831 void sock_enable_timestamps(struct sock
*sk
)
834 __sock_set_timestamps(sk
, true, false, true);
837 EXPORT_SYMBOL(sock_enable_timestamps
);
839 void sock_set_timestamp(struct sock
*sk
, int optname
, bool valbool
)
842 case SO_TIMESTAMP_OLD
:
843 __sock_set_timestamps(sk
, valbool
, false, false);
845 case SO_TIMESTAMP_NEW
:
846 __sock_set_timestamps(sk
, valbool
, true, false);
848 case SO_TIMESTAMPNS_OLD
:
849 __sock_set_timestamps(sk
, valbool
, false, true);
851 case SO_TIMESTAMPNS_NEW
:
852 __sock_set_timestamps(sk
, valbool
, true, true);
857 static int sock_timestamping_bind_phc(struct sock
*sk
, int phc_index
)
859 struct net
*net
= sock_net(sk
);
860 struct net_device
*dev
= NULL
;
865 if (sk
->sk_bound_dev_if
)
866 dev
= dev_get_by_index(net
, sk
->sk_bound_dev_if
);
869 pr_err("%s: sock not bind to device\n", __func__
);
873 num
= ethtool_get_phc_vclocks(dev
, &vclock_index
);
876 for (i
= 0; i
< num
; i
++) {
877 if (*(vclock_index
+ i
) == phc_index
) {
889 WRITE_ONCE(sk
->sk_bind_phc
, phc_index
);
894 int sock_set_timestamping(struct sock
*sk
, int optname
,
895 struct so_timestamping timestamping
)
897 int val
= timestamping
.flags
;
900 if (val
& ~SOF_TIMESTAMPING_MASK
)
903 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
&&
904 !(val
& SOF_TIMESTAMPING_OPT_ID
))
907 if (val
& SOF_TIMESTAMPING_OPT_ID
&&
908 !(sk
->sk_tsflags
& SOF_TIMESTAMPING_OPT_ID
)) {
910 if ((1 << sk
->sk_state
) &
911 (TCPF_CLOSE
| TCPF_LISTEN
))
913 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
)
914 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->write_seq
);
916 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->snd_una
);
918 atomic_set(&sk
->sk_tskey
, 0);
922 if (val
& SOF_TIMESTAMPING_OPT_STATS
&&
923 !(val
& SOF_TIMESTAMPING_OPT_TSONLY
))
926 if (val
& SOF_TIMESTAMPING_BIND_PHC
) {
927 ret
= sock_timestamping_bind_phc(sk
, timestamping
.bind_phc
);
932 WRITE_ONCE(sk
->sk_tsflags
, val
);
933 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, optname
== SO_TIMESTAMPING_NEW
);
935 if (val
& SOF_TIMESTAMPING_RX_SOFTWARE
)
936 sock_enable_timestamp(sk
,
937 SOCK_TIMESTAMPING_RX_SOFTWARE
);
939 sock_disable_timestamp(sk
,
940 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE
));
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
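
/*
 * Example (userspace view, illustrative): SO_KEEPALIVE is the uapi
 * counterpart of sock_set_keepalive() and takes a plain boolean value:
 *
 *	int one = 1;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)) < 0)
 *		perror("setsockopt(SO_KEEPALIVE)");
 */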
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.  Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
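
/*
 * Example (userspace view, illustrative): because of the doubling described
 * above, the value read back with getsockopt() is twice the requested one,
 * subject to sysctl_rmem_max and SOCK_MIN_RCVBUF clamping:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	// eff is typically 131072 here, i.e. 2 * req
 */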
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
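
/*
 * Example (userspace view, illustrative): SO_MARK requires CAP_NET_RAW or
 * CAP_NET_ADMIN (see the SO_MARK handling in sk_setsockopt() below).  The
 * mark value itself is policy-defined; 0x2a is just a placeholder:
 *
 *	unsigned int mark = 0x2a;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
 *		perror("setsockopt(SO_MARK)");
 */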
999 static void sock_release_reserved_memory(struct sock
*sk
, int bytes
)
1001 /* Round down bytes to multiple of pages */
1002 bytes
= round_down(bytes
, PAGE_SIZE
);
1004 WARN_ON(bytes
> sk
->sk_reserved_mem
);
1005 WRITE_ONCE(sk
->sk_reserved_mem
, sk
->sk_reserved_mem
- bytes
);
1009 static int sock_reserve_memory(struct sock
*sk
, int bytes
)
1015 if (!mem_cgroup_sockets_enabled
|| !sk
->sk_memcg
|| !sk_has_account(sk
))
1021 pages
= sk_mem_pages(bytes
);
1023 /* pre-charge to memcg */
1024 charged
= mem_cgroup_charge_skmem(sk
->sk_memcg
, pages
,
1025 GFP_KERNEL
| __GFP_RETRY_MAYFAIL
);
1029 /* pre-charge to forward_alloc */
1030 sk_memory_allocated_add(sk
, pages
);
1031 allocated
= sk_memory_allocated(sk
);
1032 /* If the system goes into memory pressure with this
1033 * precharge, give up and return error.
1035 if (allocated
> sk_prot_mem_limits(sk
, 1)) {
1036 sk_memory_allocated_sub(sk
, pages
);
1037 mem_cgroup_uncharge_skmem(sk
->sk_memcg
, pages
);
1040 sk_forward_alloc_add(sk
, pages
<< PAGE_SHIFT
);
1042 WRITE_ONCE(sk
->sk_reserved_mem
,
1043 sk
->sk_reserved_mem
+ (pages
<< PAGE_SHIFT
));
1048 #ifdef CONFIG_PAGE_POOL
1050 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1051 * in 1 syscall. The limit exists to limit the amount of memory the kernel
1052 * allocates to copy these tokens, and to prevent looping over the frags for
1055 #define MAX_DONTNEED_TOKENS 128
1056 #define MAX_DONTNEED_FRAGS 1024
1058 static noinline_for_stack
int
1059 sock_devmem_dontneed(struct sock
*sk
, sockptr_t optval
, unsigned int optlen
)
1061 unsigned int num_tokens
, i
, j
, k
, netmem_num
= 0;
1062 struct dmabuf_token
*tokens
;
1063 int ret
= 0, num_frags
= 0;
1064 netmem_ref netmems
[16];
1069 if (optlen
% sizeof(*tokens
) ||
1070 optlen
> sizeof(*tokens
) * MAX_DONTNEED_TOKENS
)
1073 num_tokens
= optlen
/ sizeof(*tokens
);
1074 tokens
= kvmalloc_array(num_tokens
, sizeof(*tokens
), GFP_KERNEL
);
1078 if (copy_from_sockptr(tokens
, optval
, optlen
)) {
1083 xa_lock_bh(&sk
->sk_user_frags
);
1084 for (i
= 0; i
< num_tokens
; i
++) {
1085 for (j
= 0; j
< tokens
[i
].token_count
; j
++) {
1086 if (++num_frags
> MAX_DONTNEED_FRAGS
)
1087 goto frag_limit_reached
;
1089 netmem_ref netmem
= (__force netmem_ref
)__xa_erase(
1090 &sk
->sk_user_frags
, tokens
[i
].token_start
+ j
);
1092 if (!netmem
|| WARN_ON_ONCE(!netmem_is_net_iov(netmem
)))
1095 netmems
[netmem_num
++] = netmem
;
1096 if (netmem_num
== ARRAY_SIZE(netmems
)) {
1097 xa_unlock_bh(&sk
->sk_user_frags
);
1098 for (k
= 0; k
< netmem_num
; k
++)
1099 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1101 xa_lock_bh(&sk
->sk_user_frags
);
1108 xa_unlock_bh(&sk
->sk_user_frags
);
1109 for (k
= 0; k
< netmem_num
; k
++)
1110 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1117 void sockopt_lock_sock(struct sock
*sk
)
1119 /* When current->bpf_ctx is set, the setsockopt is called from
1120 * a bpf prog. bpf has ensured the sk lock has been
1121 * acquired before calling setsockopt().
1123 if (has_current_bpf_ctx())
1128 EXPORT_SYMBOL(sockopt_lock_sock
);
1130 void sockopt_release_sock(struct sock
*sk
)
1132 if (has_current_bpf_ctx())
1137 EXPORT_SYMBOL(sockopt_release_sock
);
1139 bool sockopt_ns_capable(struct user_namespace
*ns
, int cap
)
1141 return has_current_bpf_ctx() || ns_capable(ns
, cap
);
1143 EXPORT_SYMBOL(sockopt_ns_capable
);
1145 bool sockopt_capable(int cap
)
1147 return has_current_bpf_ctx() || capable(cap
);
1149 EXPORT_SYMBOL(sockopt_capable
);
1151 static int sockopt_validate_clockid(__kernel_clockid_t value
)
1154 case CLOCK_REALTIME
:
1155 case CLOCK_MONOTONIC
:
1163 * This is meant for all protocols to use and covers goings on
1164 * at the socket level. Everything here is generic.
1167 int sk_setsockopt(struct sock
*sk
, int level
, int optname
,
1168 sockptr_t optval
, unsigned int optlen
)
1170 struct so_timestamping timestamping
;
1171 struct socket
*sock
= sk
->sk_socket
;
1172 struct sock_txtime sk_txtime
;
1179 * Options without arguments
1182 if (optname
== SO_BINDTODEVICE
)
1183 return sock_setbindtodevice(sk
, optval
, optlen
);
1185 if (optlen
< sizeof(int))
1188 if (copy_from_sockptr(&val
, optval
, sizeof(val
)))
1191 valbool
= val
? 1 : 0;
1193 /* handle options which do not require locking the socket. */
1196 if ((val
>= 0 && val
<= 6) ||
1197 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) ||
1198 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1199 sock_set_priority(sk
, val
);
1204 assign_bit(SOCK_PASSSEC
, &sock
->flags
, valbool
);
1207 assign_bit(SOCK_PASSCRED
, &sock
->flags
, valbool
);
1210 assign_bit(SOCK_PASSPIDFD
, &sock
->flags
, valbool
);
1216 return -ENOPROTOOPT
;
1217 #ifdef CONFIG_NET_RX_BUSY_POLL
1221 WRITE_ONCE(sk
->sk_ll_usec
, val
);
1223 case SO_PREFER_BUSY_POLL
:
1224 if (valbool
&& !sockopt_capable(CAP_NET_ADMIN
))
1226 WRITE_ONCE(sk
->sk_prefer_busy_poll
, valbool
);
1228 case SO_BUSY_POLL_BUDGET
:
1229 if (val
> READ_ONCE(sk
->sk_busy_poll_budget
) &&
1230 !sockopt_capable(CAP_NET_ADMIN
))
1232 if (val
< 0 || val
> U16_MAX
)
1234 WRITE_ONCE(sk
->sk_busy_poll_budget
, val
);
1237 case SO_MAX_PACING_RATE
:
1239 unsigned long ulval
= (val
== ~0U) ? ~0UL : (unsigned int)val
;
1240 unsigned long pacing_rate
;
1242 if (sizeof(ulval
) != sizeof(val
) &&
1243 optlen
>= sizeof(ulval
) &&
1244 copy_from_sockptr(&ulval
, optval
, sizeof(ulval
))) {
1248 cmpxchg(&sk
->sk_pacing_status
,
1251 /* Pairs with READ_ONCE() from sk_getsockopt() */
1252 WRITE_ONCE(sk
->sk_max_pacing_rate
, ulval
);
1253 pacing_rate
= READ_ONCE(sk
->sk_pacing_rate
);
1254 if (ulval
< pacing_rate
)
1255 WRITE_ONCE(sk
->sk_pacing_rate
, ulval
);
1259 if (val
< -1 || val
> 1)
1261 if ((u8
)val
== SOCK_TXREHASH_DEFAULT
)
1262 val
= READ_ONCE(sock_net(sk
)->core
.sysctl_txrehash
);
1263 /* Paired with READ_ONCE() in tcp_rtx_synack()
1264 * and sk_getsockopt().
1266 WRITE_ONCE(sk
->sk_txrehash
, (u8
)val
);
1270 int (*set_peek_off
)(struct sock
*sk
, int val
);
1272 set_peek_off
= READ_ONCE(sock
->ops
)->set_peek_off
;
1274 ret
= set_peek_off(sk
, val
);
1279 #ifdef CONFIG_PAGE_POOL
1280 case SO_DEVMEM_DONTNEED
:
1281 return sock_devmem_dontneed(sk
, optval
, optlen
);
1285 sockopt_lock_sock(sk
);
1289 if (val
&& !sockopt_capable(CAP_NET_ADMIN
))
1292 sock_valbool_flag(sk
, SOCK_DBG
, valbool
);
1295 sk
->sk_reuse
= (valbool
? SK_CAN_REUSE
: SK_NO_REUSE
);
1298 sk
->sk_reuseport
= valbool
;
1301 sock_valbool_flag(sk
, SOCK_LOCALROUTE
, valbool
);
1305 sock_valbool_flag(sk
, SOCK_BROADCAST
, valbool
);
1308 /* Don't error on this BSD doesn't and if you think
1309 * about it this is right. Otherwise apps have to
1310 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1311 * are treated in BSD as hints
1313 val
= min_t(u32
, val
, READ_ONCE(sysctl_wmem_max
));
1315 /* Ensure val * 2 fits into an int, to prevent max_t()
1316 * from treating it as a negative value.
1318 val
= min_t(int, val
, INT_MAX
/ 2);
1319 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
1320 WRITE_ONCE(sk
->sk_sndbuf
,
1321 max_t(int, val
* 2, SOCK_MIN_SNDBUF
));
1322 /* Wake up sending tasks if we upped the value. */
1323 sk
->sk_write_space(sk
);
1326 case SO_SNDBUFFORCE
:
1327 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1332 /* No negative values (to prevent underflow, as val will be
1340 /* Don't error on this BSD doesn't and if you think
1341 * about it this is right. Otherwise apps have to
1342 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1343 * are treated in BSD as hints
1345 __sock_set_rcvbuf(sk
, min_t(u32
, val
, READ_ONCE(sysctl_rmem_max
)));
1348 case SO_RCVBUFFORCE
:
1349 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1354 /* No negative values (to prevent underflow, as val will be
1357 __sock_set_rcvbuf(sk
, max(val
, 0));
1361 if (sk
->sk_prot
->keepalive
)
1362 sk
->sk_prot
->keepalive(sk
, valbool
);
1363 sock_valbool_flag(sk
, SOCK_KEEPOPEN
, valbool
);
1367 sock_valbool_flag(sk
, SOCK_URGINLINE
, valbool
);
1371 sk
->sk_no_check_tx
= valbool
;
1375 if (optlen
< sizeof(ling
)) {
1376 ret
= -EINVAL
; /* 1003.1g */
1379 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
))) {
1383 if (!ling
.l_onoff
) {
1384 sock_reset_flag(sk
, SOCK_LINGER
);
1386 unsigned long t_sec
= ling
.l_linger
;
1388 if (t_sec
>= MAX_SCHEDULE_TIMEOUT
/ HZ
)
1389 WRITE_ONCE(sk
->sk_lingertime
, MAX_SCHEDULE_TIMEOUT
);
1391 WRITE_ONCE(sk
->sk_lingertime
, t_sec
* HZ
);
1392 sock_set_flag(sk
, SOCK_LINGER
);
1399 case SO_TIMESTAMP_OLD
:
1400 case SO_TIMESTAMP_NEW
:
1401 case SO_TIMESTAMPNS_OLD
:
1402 case SO_TIMESTAMPNS_NEW
:
1403 sock_set_timestamp(sk
, optname
, valbool
);
1406 case SO_TIMESTAMPING_NEW
:
1407 case SO_TIMESTAMPING_OLD
:
1408 if (optlen
== sizeof(timestamping
)) {
1409 if (copy_from_sockptr(×tamping
, optval
,
1410 sizeof(timestamping
))) {
1415 memset(×tamping
, 0, sizeof(timestamping
));
1416 timestamping
.flags
= val
;
1418 ret
= sock_set_timestamping(sk
, optname
, timestamping
);
1423 int (*set_rcvlowat
)(struct sock
*sk
, int val
) = NULL
;
1428 set_rcvlowat
= READ_ONCE(sock
->ops
)->set_rcvlowat
;
1430 ret
= set_rcvlowat(sk
, val
);
1432 WRITE_ONCE(sk
->sk_rcvlowat
, val
? : 1);
1435 case SO_RCVTIMEO_OLD
:
1436 case SO_RCVTIMEO_NEW
:
1437 ret
= sock_set_timeout(&sk
->sk_rcvtimeo
, optval
,
1438 optlen
, optname
== SO_RCVTIMEO_OLD
);
1441 case SO_SNDTIMEO_OLD
:
1442 case SO_SNDTIMEO_NEW
:
1443 ret
= sock_set_timeout(&sk
->sk_sndtimeo
, optval
,
1444 optlen
, optname
== SO_SNDTIMEO_OLD
);
1447 case SO_ATTACH_FILTER
: {
1448 struct sock_fprog fprog
;
1450 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1452 ret
= sk_attach_filter(&fprog
, sk
);
1457 if (optlen
== sizeof(u32
)) {
1461 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1464 ret
= sk_attach_bpf(ufd
, sk
);
1468 case SO_ATTACH_REUSEPORT_CBPF
: {
1469 struct sock_fprog fprog
;
1471 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1473 ret
= sk_reuseport_attach_filter(&fprog
, sk
);
1476 case SO_ATTACH_REUSEPORT_EBPF
:
1478 if (optlen
== sizeof(u32
)) {
1482 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1485 ret
= sk_reuseport_attach_bpf(ufd
, sk
);
1489 case SO_DETACH_REUSEPORT_BPF
:
1490 ret
= reuseport_detach_prog(sk
);
1493 case SO_DETACH_FILTER
:
1494 ret
= sk_detach_filter(sk
);
1497 case SO_LOCK_FILTER
:
1498 if (sock_flag(sk
, SOCK_FILTER_LOCKED
) && !valbool
)
1501 sock_valbool_flag(sk
, SOCK_FILTER_LOCKED
, valbool
);
1505 if (!sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) &&
1506 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1511 __sock_set_mark(sk
, val
);
1514 sock_valbool_flag(sk
, SOCK_RCVMARK
, valbool
);
1518 sock_valbool_flag(sk
, SOCK_RXQ_OVFL
, valbool
);
1521 case SO_WIFI_STATUS
:
1522 sock_valbool_flag(sk
, SOCK_WIFI_STATUS
, valbool
);
1526 sock_valbool_flag(sk
, SOCK_NOFCS
, valbool
);
1529 case SO_SELECT_ERR_QUEUE
:
1530 sock_valbool_flag(sk
, SOCK_SELECT_ERR_QUEUE
, valbool
);
1534 case SO_INCOMING_CPU
:
1535 reuseport_update_incoming_cpu(sk
, val
);
1540 dst_negative_advice(sk
);
1544 if (sk
->sk_family
== PF_INET
|| sk
->sk_family
== PF_INET6
) {
1545 if (!(sk_is_tcp(sk
) ||
1546 (sk
->sk_type
== SOCK_DGRAM
&&
1547 sk
->sk_protocol
== IPPROTO_UDP
)))
1549 } else if (sk
->sk_family
!= PF_RDS
) {
1553 if (val
< 0 || val
> 1)
1556 sock_valbool_flag(sk
, SOCK_ZEROCOPY
, valbool
);
1561 if (optlen
!= sizeof(struct sock_txtime
)) {
1564 } else if (copy_from_sockptr(&sk_txtime
, optval
,
1565 sizeof(struct sock_txtime
))) {
1568 } else if (sk_txtime
.flags
& ~SOF_TXTIME_FLAGS_MASK
) {
1572 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1573 * scheduler has enough safe guards.
1575 if (sk_txtime
.clockid
!= CLOCK_MONOTONIC
&&
1576 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1581 ret
= sockopt_validate_clockid(sk_txtime
.clockid
);
1585 sock_valbool_flag(sk
, SOCK_TXTIME
, true);
1586 sk
->sk_clockid
= sk_txtime
.clockid
;
1587 sk
->sk_txtime_deadline_mode
=
1588 !!(sk_txtime
.flags
& SOF_TXTIME_DEADLINE_MODE
);
1589 sk
->sk_txtime_report_errors
=
1590 !!(sk_txtime
.flags
& SOF_TXTIME_REPORT_ERRORS
);
1593 case SO_BINDTOIFINDEX
:
1594 ret
= sock_bindtoindex_locked(sk
, val
);
1598 if (val
& ~SOCK_BUF_LOCK_MASK
) {
1602 sk
->sk_userlocks
= val
| (sk
->sk_userlocks
&
1603 ~SOCK_BUF_LOCK_MASK
);
1606 case SO_RESERVE_MEM
:
1615 delta
= val
- sk
->sk_reserved_mem
;
1617 sock_release_reserved_memory(sk
, -delta
);
1619 ret
= sock_reserve_memory(sk
, delta
);
1627 sockopt_release_sock(sk
);
1631 int sock_setsockopt(struct socket
*sock
, int level
, int optname
,
1632 sockptr_t optval
, unsigned int optlen
)
1634 return sk_setsockopt(sock
->sk
, level
, optname
,
1637 EXPORT_SYMBOL(sock_setsockopt
);
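
/*
 * Example (userspace view, illustrative): SO_LINGER, handled in
 * sk_setsockopt() above, takes a struct linger rather than a plain int:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };	// 5 seconds
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
 *		perror("setsockopt(SO_LINGER)");
 */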
1639 static const struct cred
*sk_get_peer_cred(struct sock
*sk
)
1641 const struct cred
*cred
;
1643 spin_lock(&sk
->sk_peer_lock
);
1644 cred
= get_cred(sk
->sk_peer_cred
);
1645 spin_unlock(&sk
->sk_peer_lock
);
1650 static void cred_to_ucred(struct pid
*pid
, const struct cred
*cred
,
1651 struct ucred
*ucred
)
1653 ucred
->pid
= pid_vnr(pid
);
1654 ucred
->uid
= ucred
->gid
= -1;
1656 struct user_namespace
*current_ns
= current_user_ns();
1658 ucred
->uid
= from_kuid_munged(current_ns
, cred
->euid
);
1659 ucred
->gid
= from_kgid_munged(current_ns
, cred
->egid
);
1663 static int groups_to_user(sockptr_t dst
, const struct group_info
*src
)
1665 struct user_namespace
*user_ns
= current_user_ns();
1668 for (i
= 0; i
< src
->ngroups
; i
++) {
1669 gid_t gid
= from_kgid_munged(user_ns
, src
->gid
[i
]);
1671 if (copy_to_sockptr_offset(dst
, i
* sizeof(gid
), &gid
, sizeof(gid
)))
1678 int sk_getsockopt(struct sock
*sk
, int level
, int optname
,
1679 sockptr_t optval
, sockptr_t optlen
)
1681 struct socket
*sock
= sk
->sk_socket
;
1686 unsigned long ulval
;
1688 struct old_timeval32 tm32
;
1689 struct __kernel_old_timeval tm
;
1690 struct __kernel_sock_timeval stm
;
1691 struct sock_txtime txtime
;
1692 struct so_timestamping timestamping
;
1695 int lv
= sizeof(int);
1698 if (copy_from_sockptr(&len
, optlen
, sizeof(int)))
1703 memset(&v
, 0, sizeof(v
));
1707 v
.val
= sock_flag(sk
, SOCK_DBG
);
1711 v
.val
= sock_flag(sk
, SOCK_LOCALROUTE
);
1715 v
.val
= sock_flag(sk
, SOCK_BROADCAST
);
1719 v
.val
= READ_ONCE(sk
->sk_sndbuf
);
1723 v
.val
= READ_ONCE(sk
->sk_rcvbuf
);
1727 v
.val
= sk
->sk_reuse
;
1731 v
.val
= sk
->sk_reuseport
;
1735 v
.val
= sock_flag(sk
, SOCK_KEEPOPEN
);
1739 v
.val
= sk
->sk_type
;
1743 v
.val
= sk
->sk_protocol
;
1747 v
.val
= sk
->sk_family
;
1751 v
.val
= -sock_error(sk
);
1753 v
.val
= xchg(&sk
->sk_err_soft
, 0);
1757 v
.val
= sock_flag(sk
, SOCK_URGINLINE
);
1761 v
.val
= sk
->sk_no_check_tx
;
1765 v
.val
= READ_ONCE(sk
->sk_priority
);
1769 lv
= sizeof(v
.ling
);
1770 v
.ling
.l_onoff
= sock_flag(sk
, SOCK_LINGER
);
1771 v
.ling
.l_linger
= READ_ONCE(sk
->sk_lingertime
) / HZ
;
1777 case SO_TIMESTAMP_OLD
:
1778 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) &&
1779 !sock_flag(sk
, SOCK_TSTAMP_NEW
) &&
1780 !sock_flag(sk
, SOCK_RCVTSTAMPNS
);
1783 case SO_TIMESTAMPNS_OLD
:
1784 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && !sock_flag(sk
, SOCK_TSTAMP_NEW
);
1787 case SO_TIMESTAMP_NEW
:
1788 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1791 case SO_TIMESTAMPNS_NEW
:
1792 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1795 case SO_TIMESTAMPING_OLD
:
1796 case SO_TIMESTAMPING_NEW
:
1797 lv
= sizeof(v
.timestamping
);
1798 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1799 * returning the flags when they were set through the same option.
1800 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1802 if (optname
== SO_TIMESTAMPING_OLD
|| sock_flag(sk
, SOCK_TSTAMP_NEW
)) {
1803 v
.timestamping
.flags
= READ_ONCE(sk
->sk_tsflags
);
1804 v
.timestamping
.bind_phc
= READ_ONCE(sk
->sk_bind_phc
);
1808 case SO_RCVTIMEO_OLD
:
1809 case SO_RCVTIMEO_NEW
:
1810 lv
= sock_get_timeout(READ_ONCE(sk
->sk_rcvtimeo
), &v
,
1811 SO_RCVTIMEO_OLD
== optname
);
1814 case SO_SNDTIMEO_OLD
:
1815 case SO_SNDTIMEO_NEW
:
1816 lv
= sock_get_timeout(READ_ONCE(sk
->sk_sndtimeo
), &v
,
1817 SO_SNDTIMEO_OLD
== optname
);
1821 v
.val
= READ_ONCE(sk
->sk_rcvlowat
);
1829 v
.val
= !!test_bit(SOCK_PASSCRED
, &sock
->flags
);
1833 v
.val
= !!test_bit(SOCK_PASSPIDFD
, &sock
->flags
);
1838 struct ucred peercred
;
1839 if (len
> sizeof(peercred
))
1840 len
= sizeof(peercred
);
1842 spin_lock(&sk
->sk_peer_lock
);
1843 cred_to_ucred(sk
->sk_peer_pid
, sk
->sk_peer_cred
, &peercred
);
1844 spin_unlock(&sk
->sk_peer_lock
);
1846 if (copy_to_sockptr(optval
, &peercred
, len
))
1853 struct pid
*peer_pid
;
1854 struct file
*pidfd_file
= NULL
;
1857 if (len
> sizeof(pidfd
))
1858 len
= sizeof(pidfd
);
1860 spin_lock(&sk
->sk_peer_lock
);
1861 peer_pid
= get_pid(sk
->sk_peer_pid
);
1862 spin_unlock(&sk
->sk_peer_lock
);
1867 pidfd
= pidfd_prepare(peer_pid
, 0, &pidfd_file
);
1872 if (copy_to_sockptr(optval
, &pidfd
, len
) ||
1873 copy_to_sockptr(optlen
, &len
, sizeof(int))) {
1874 put_unused_fd(pidfd
);
1880 fd_install(pidfd
, pidfd_file
);
1886 const struct cred
*cred
;
1889 cred
= sk_get_peer_cred(sk
);
1893 n
= cred
->group_info
->ngroups
;
1894 if (len
< n
* sizeof(gid_t
)) {
1895 len
= n
* sizeof(gid_t
);
1897 return copy_to_sockptr(optlen
, &len
, sizeof(int)) ? -EFAULT
: -ERANGE
;
1899 len
= n
* sizeof(gid_t
);
1901 ret
= groups_to_user(optval
, cred
->group_info
);
1910 struct sockaddr_storage address
;
1912 lv
= READ_ONCE(sock
->ops
)->getname(sock
, (struct sockaddr
*)&address
, 2);
1917 if (copy_to_sockptr(optval
, &address
, len
))
1922 /* Dubious BSD thing... Probably nobody even uses it, but
1923 * the UNIX standard wants it for whatever reason... -DaveM
1926 v
.val
= sk
->sk_state
== TCP_LISTEN
;
1930 v
.val
= !!test_bit(SOCK_PASSSEC
, &sock
->flags
);
1934 return security_socket_getpeersec_stream(sock
,
1935 optval
, optlen
, len
);
1938 v
.val
= READ_ONCE(sk
->sk_mark
);
1942 v
.val
= sock_flag(sk
, SOCK_RCVMARK
);
1946 v
.val
= sock_flag(sk
, SOCK_RXQ_OVFL
);
1949 case SO_WIFI_STATUS
:
1950 v
.val
= sock_flag(sk
, SOCK_WIFI_STATUS
);
1954 if (!READ_ONCE(sock
->ops
)->set_peek_off
)
1957 v
.val
= READ_ONCE(sk
->sk_peek_off
);
1960 v
.val
= sock_flag(sk
, SOCK_NOFCS
);
1963 case SO_BINDTODEVICE
:
1964 return sock_getbindtodevice(sk
, optval
, optlen
, len
);
1967 len
= sk_get_filter(sk
, optval
, len
);
1973 case SO_LOCK_FILTER
:
1974 v
.val
= sock_flag(sk
, SOCK_FILTER_LOCKED
);
1977 case SO_BPF_EXTENSIONS
:
1978 v
.val
= bpf_tell_extensions();
1981 case SO_SELECT_ERR_QUEUE
:
1982 v
.val
= sock_flag(sk
, SOCK_SELECT_ERR_QUEUE
);
1985 #ifdef CONFIG_NET_RX_BUSY_POLL
1987 v
.val
= READ_ONCE(sk
->sk_ll_usec
);
1989 case SO_PREFER_BUSY_POLL
:
1990 v
.val
= READ_ONCE(sk
->sk_prefer_busy_poll
);
1994 case SO_MAX_PACING_RATE
:
1995 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1996 if (sizeof(v
.ulval
) != sizeof(v
.val
) && len
>= sizeof(v
.ulval
)) {
1997 lv
= sizeof(v
.ulval
);
1998 v
.ulval
= READ_ONCE(sk
->sk_max_pacing_rate
);
2001 v
.val
= min_t(unsigned long, ~0U,
2002 READ_ONCE(sk
->sk_max_pacing_rate
));
2006 case SO_INCOMING_CPU
:
2007 v
.val
= READ_ONCE(sk
->sk_incoming_cpu
);
2012 u32 meminfo
[SK_MEMINFO_VARS
];
2014 sk_get_meminfo(sk
, meminfo
);
2016 len
= min_t(unsigned int, len
, sizeof(meminfo
));
2017 if (copy_to_sockptr(optval
, &meminfo
, len
))
2023 #ifdef CONFIG_NET_RX_BUSY_POLL
2024 case SO_INCOMING_NAPI_ID
:
2025 v
.val
= READ_ONCE(sk
->sk_napi_id
);
2027 /* aggregate non-NAPI IDs down to 0 */
2028 if (v
.val
< MIN_NAPI_ID
)
2038 v
.val64
= sock_gen_cookie(sk
);
2042 v
.val
= sock_flag(sk
, SOCK_ZEROCOPY
);
2046 lv
= sizeof(v
.txtime
);
2047 v
.txtime
.clockid
= sk
->sk_clockid
;
2048 v
.txtime
.flags
|= sk
->sk_txtime_deadline_mode
?
2049 SOF_TXTIME_DEADLINE_MODE
: 0;
2050 v
.txtime
.flags
|= sk
->sk_txtime_report_errors
?
2051 SOF_TXTIME_REPORT_ERRORS
: 0;
2054 case SO_BINDTOIFINDEX
:
2055 v
.val
= READ_ONCE(sk
->sk_bound_dev_if
);
2058 case SO_NETNS_COOKIE
:
2062 v
.val64
= sock_net(sk
)->net_cookie
;
2066 v
.val
= sk
->sk_userlocks
& SOCK_BUF_LOCK_MASK
;
2069 case SO_RESERVE_MEM
:
2070 v
.val
= READ_ONCE(sk
->sk_reserved_mem
);
2074 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2075 v
.val
= READ_ONCE(sk
->sk_txrehash
);
2079 /* We implement the SO_SNDLOWAT etc to not be settable
2082 return -ENOPROTOOPT
;
2087 if (copy_to_sockptr(optval
, &v
, len
))
2090 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
2096 * Initialize an sk_lock.
2098 * (We also register the sk_lock with the lock validator.)
2100 static inline void sock_lock_init(struct sock
*sk
)
2102 if (sk
->sk_kern_sock
)
2103 sock_lock_init_class_and_name(
2105 af_family_kern_slock_key_strings
[sk
->sk_family
],
2106 af_family_kern_slock_keys
+ sk
->sk_family
,
2107 af_family_kern_key_strings
[sk
->sk_family
],
2108 af_family_kern_keys
+ sk
->sk_family
);
2110 sock_lock_init_class_and_name(
2112 af_family_slock_key_strings
[sk
->sk_family
],
2113 af_family_slock_keys
+ sk
->sk_family
,
2114 af_family_key_strings
[sk
->sk_family
],
2115 af_family_keys
+ sk
->sk_family
);
2119 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2120 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2121 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2123 static void sock_copy(struct sock
*nsk
, const struct sock
*osk
)
2125 const struct proto
*prot
= READ_ONCE(osk
->sk_prot
);
2126 #ifdef CONFIG_SECURITY_NETWORK
2127 void *sptr
= nsk
->sk_security
;
2130 /* If we move sk_tx_queue_mapping out of the private section,
2131 * we must check if sk_tx_queue_clear() is called after
2132 * sock_copy() in sk_clone_lock().
2134 BUILD_BUG_ON(offsetof(struct sock
, sk_tx_queue_mapping
) <
2135 offsetof(struct sock
, sk_dontcopy_begin
) ||
2136 offsetof(struct sock
, sk_tx_queue_mapping
) >=
2137 offsetof(struct sock
, sk_dontcopy_end
));
2139 memcpy(nsk
, osk
, offsetof(struct sock
, sk_dontcopy_begin
));
2141 unsafe_memcpy(&nsk
->sk_dontcopy_end
, &osk
->sk_dontcopy_end
,
2142 prot
->obj_size
- offsetof(struct sock
, sk_dontcopy_end
),
2143 /* alloc is larger than struct, see sk_prot_alloc() */);
2145 #ifdef CONFIG_SECURITY_NETWORK
2146 nsk
->sk_security
= sptr
;
2147 security_sk_clone(osk
, nsk
);
2151 static struct sock
*sk_prot_alloc(struct proto
*prot
, gfp_t priority
,
2155 struct kmem_cache
*slab
;
2159 sk
= kmem_cache_alloc(slab
, priority
& ~__GFP_ZERO
);
2162 if (want_init_on_alloc(priority
))
2163 sk_prot_clear_nulls(sk
, prot
->obj_size
);
2165 sk
= kmalloc(prot
->obj_size
, priority
);
2168 if (security_sk_alloc(sk
, family
, priority
))
2171 if (!try_module_get(prot
->owner
))
2178 security_sk_free(sk
);
2181 kmem_cache_free(slab
, sk
);
2187 static void sk_prot_free(struct proto
*prot
, struct sock
*sk
)
2189 struct kmem_cache
*slab
;
2190 struct module
*owner
;
2192 owner
= prot
->owner
;
2195 cgroup_sk_free(&sk
->sk_cgrp_data
);
2196 mem_cgroup_sk_free(sk
);
2197 security_sk_free(sk
);
2199 kmem_cache_free(slab
, sk
);
2206 * sk_alloc - All socket objects are allocated here
2207 * @net: the applicable net namespace
2208 * @family: protocol family
2209 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2210 * @prot: struct proto associated with this new sock instance
2211 * @kern: is this to be a kernel socket?
2213 struct sock
*sk_alloc(struct net
*net
, int family
, gfp_t priority
,
2214 struct proto
*prot
, int kern
)
2218 sk
= sk_prot_alloc(prot
, priority
| __GFP_ZERO
, family
);
2220 sk
->sk_family
= family
;
2222 * See comment in struct sock definition to understand
2223 * why we need sk_prot_creator -acme
2225 sk
->sk_prot
= sk
->sk_prot_creator
= prot
;
2226 sk
->sk_kern_sock
= kern
;
2228 sk
->sk_net_refcnt
= kern
? 0 : 1;
2229 if (likely(sk
->sk_net_refcnt
)) {
2230 get_net_track(net
, &sk
->ns_tracker
, priority
);
2231 sock_inuse_add(net
, 1);
2233 __netns_tracker_alloc(net
, &sk
->ns_tracker
,
2237 sock_net_set(sk
, net
);
2238 refcount_set(&sk
->sk_wmem_alloc
, 1);
2240 mem_cgroup_sk_alloc(sk
);
2241 cgroup_sk_alloc(&sk
->sk_cgrp_data
);
2242 sock_update_classid(&sk
->sk_cgrp_data
);
2243 sock_update_netprioidx(&sk
->sk_cgrp_data
);
2244 sk_tx_queue_clear(sk
);
2249 EXPORT_SYMBOL(sk_alloc
);
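
/*
 * Example (illustrative sketch): a protocol's ->create() handler is the
 * typical caller of sk_alloc(), followed by sock_init_data().  The names
 * below are hypothetical and error handling is abbreviated:
 *
 *	static int demo_create(struct net *net, struct socket *sock,
 *			       int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &demo_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */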
2251 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2252 * grace period. This is the case for UDP sockets and TCP listeners.
2254 static void __sk_destruct(struct rcu_head
*head
)
2256 struct sock
*sk
= container_of(head
, struct sock
, sk_rcu
);
2257 struct sk_filter
*filter
;
2259 if (sk
->sk_destruct
)
2260 sk
->sk_destruct(sk
);
2262 filter
= rcu_dereference_check(sk
->sk_filter
,
2263 refcount_read(&sk
->sk_wmem_alloc
) == 0);
2265 sk_filter_uncharge(sk
, filter
);
2266 RCU_INIT_POINTER(sk
->sk_filter
, NULL
);
2269 sock_disable_timestamp(sk
, SK_FLAGS_TIMESTAMP
);
2271 #ifdef CONFIG_BPF_SYSCALL
2272 bpf_sk_storage_free(sk
);
2275 if (atomic_read(&sk
->sk_omem_alloc
))
2276 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2277 __func__
, atomic_read(&sk
->sk_omem_alloc
));
2279 if (sk
->sk_frag
.page
) {
2280 put_page(sk
->sk_frag
.page
);
2281 sk
->sk_frag
.page
= NULL
;
2284 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2285 put_cred(sk
->sk_peer_cred
);
2286 put_pid(sk
->sk_peer_pid
);
2288 if (likely(sk
->sk_net_refcnt
))
2289 put_net_track(sock_net(sk
), &sk
->ns_tracker
);
2291 __netns_tracker_free(sock_net(sk
), &sk
->ns_tracker
, false);
2293 sk_prot_free(sk
->sk_prot_creator
, sk
);
2296 void sk_destruct(struct sock
*sk
)
2298 bool use_call_rcu
= sock_flag(sk
, SOCK_RCU_FREE
);
2300 if (rcu_access_pointer(sk
->sk_reuseport_cb
)) {
2301 reuseport_detach_sock(sk
);
2302 use_call_rcu
= true;
2306 call_rcu(&sk
->sk_rcu
, __sk_destruct
);
2308 __sk_destruct(&sk
->sk_rcu
);
2311 static void __sk_free(struct sock
*sk
)
2313 if (likely(sk
->sk_net_refcnt
))
2314 sock_inuse_add(sock_net(sk
), -1);
2316 if (unlikely(sk
->sk_net_refcnt
&& sock_diag_has_destroy_listeners(sk
)))
2317 sock_diag_broadcast_destroy(sk
);
2322 void sk_free(struct sock
*sk
)
2325 * We subtract one from sk_wmem_alloc and can know if
2326 * some packets are still in some tx queue.
2327 * If not null, sock_wfree() will call __sk_free(sk) later
2329 if (refcount_dec_and_test(&sk
->sk_wmem_alloc
))
2332 EXPORT_SYMBOL(sk_free
);
2334 static void sk_init_common(struct sock
*sk
)
2336 skb_queue_head_init(&sk
->sk_receive_queue
);
2337 skb_queue_head_init(&sk
->sk_write_queue
);
2338 skb_queue_head_init(&sk
->sk_error_queue
);
2340 rwlock_init(&sk
->sk_callback_lock
);
2341 lockdep_set_class_and_name(&sk
->sk_receive_queue
.lock
,
2342 af_rlock_keys
+ sk
->sk_family
,
2343 af_family_rlock_key_strings
[sk
->sk_family
]);
2344 lockdep_set_class_and_name(&sk
->sk_write_queue
.lock
,
2345 af_wlock_keys
+ sk
->sk_family
,
2346 af_family_wlock_key_strings
[sk
->sk_family
]);
2347 lockdep_set_class_and_name(&sk
->sk_error_queue
.lock
,
2348 af_elock_keys
+ sk
->sk_family
,
2349 af_family_elock_key_strings
[sk
->sk_family
]);
2350 if (sk
->sk_kern_sock
)
2351 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2352 af_kern_callback_keys
+ sk
->sk_family
,
2353 af_family_kern_clock_key_strings
[sk
->sk_family
]);
2355 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2356 af_callback_keys
+ sk
->sk_family
,
2357 af_family_clock_key_strings
[sk
->sk_family
]);
2361 * sk_clone_lock - clone a socket, and lock its clone
2362 * @sk: the socket to clone
2363 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2365 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2367 struct sock
*sk_clone_lock(const struct sock
*sk
, const gfp_t priority
)
2369 struct proto
*prot
= READ_ONCE(sk
->sk_prot
);
2370 struct sk_filter
*filter
;
2371 bool is_charged
= true;
2374 newsk
= sk_prot_alloc(prot
, priority
, sk
->sk_family
);
2378 sock_copy(newsk
, sk
);
2380 newsk
->sk_prot_creator
= prot
;
2383 if (likely(newsk
->sk_net_refcnt
)) {
2384 get_net_track(sock_net(newsk
), &newsk
->ns_tracker
, priority
);
2385 sock_inuse_add(sock_net(newsk
), 1);
2387 /* Kernel sockets are not elevating the struct net refcount.
2388 * Instead, use a tracker to more easily detect if a layer
2389 * is not properly dismantling its kernel sockets at netns
2392 __netns_tracker_alloc(sock_net(newsk
), &newsk
->ns_tracker
,
2395 sk_node_init(&newsk
->sk_node
);
2396 sock_lock_init(newsk
);
2397 bh_lock_sock(newsk
);
2398 newsk
->sk_backlog
.head
= newsk
->sk_backlog
.tail
= NULL
;
2399 newsk
->sk_backlog
.len
= 0;
2401 atomic_set(&newsk
->sk_rmem_alloc
, 0);
2403 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2404 refcount_set(&newsk
->sk_wmem_alloc
, 1);
2406 atomic_set(&newsk
->sk_omem_alloc
, 0);
2407 sk_init_common(newsk
);
2409 newsk
->sk_dst_cache
= NULL
;
2410 newsk
->sk_dst_pending_confirm
= 0;
2411 newsk
->sk_wmem_queued
= 0;
2412 newsk
->sk_forward_alloc
= 0;
2413 newsk
->sk_reserved_mem
= 0;
2414 atomic_set(&newsk
->sk_drops
, 0);
2415 newsk
->sk_send_head
= NULL
;
2416 newsk
->sk_userlocks
= sk
->sk_userlocks
& ~SOCK_BINDPORT_LOCK
;
2417 atomic_set(&newsk
->sk_zckey
, 0);
2419 sock_reset_flag(newsk
, SOCK_DONE
);
2421 /* sk->sk_memcg will be populated at accept() time */
2422 newsk
->sk_memcg
= NULL
;
2424 cgroup_sk_clone(&newsk
->sk_cgrp_data
);
2427 filter
= rcu_dereference(sk
->sk_filter
);
2429 /* though it's an empty new sock, the charging may fail
2430 * if sysctl_optmem_max was changed between creation of
2431 * original socket and cloning
2433 is_charged
= sk_filter_charge(newsk
, filter
);
2434 RCU_INIT_POINTER(newsk
->sk_filter
, filter
);
2437 if (unlikely(!is_charged
|| xfrm_sk_clone_policy(newsk
, sk
))) {
2438 /* We need to make sure that we don't uncharge the new
2439 * socket if we couldn't charge it in the first place
2440 * as otherwise we uncharge the parent's filter.
2443 RCU_INIT_POINTER(newsk
->sk_filter
, NULL
);
2444 sk_free_unlock_clone(newsk
);
2448 RCU_INIT_POINTER(newsk
->sk_reuseport_cb
, NULL
);
2450 if (bpf_sk_storage_clone(sk
, newsk
)) {
2451 sk_free_unlock_clone(newsk
);
2456 /* Clear sk_user_data if parent had the pointer tagged
2457 * as not suitable for copying when cloning.
2459 if (sk_user_data_is_nocopy(newsk
))
2460 newsk
->sk_user_data
= NULL
;
2463 newsk
->sk_err_soft
= 0;
2464 newsk
->sk_priority
= 0;
2465 newsk
->sk_incoming_cpu
= raw_smp_processor_id();
2467 /* Before updating sk_refcnt, we must commit prior changes to memory
2468 * (Documentation/RCU/rculist_nulls.rst for details)
2471 refcount_set(&newsk
->sk_refcnt
, 2);
2473 sk_set_socket(newsk
, NULL
);
2474 sk_tx_queue_clear(newsk
);
2475 RCU_INIT_POINTER(newsk
->sk_wq
, NULL
);
2477 if (newsk
->sk_prot
->sockets_allocated
)
2478 sk_sockets_allocated_inc(newsk
);
2480 if (sock_needs_netstamp(sk
) && newsk
->sk_flags
& SK_FLAGS_TIMESTAMP
)
2481 net_enable_timestamp();
2485 EXPORT_SYMBOL_GPL(sk_clone_lock
);
2487 void sk_free_unlock_clone(struct sock
*sk
)
2489 /* It is still raw copy of parent, so invalidate
2490 * destructor and make plain sk_free() */
2491 sk
->sk_destruct
= NULL
;
2495 EXPORT_SYMBOL_GPL(sk_free_unlock_clone
);
2497 static u32
sk_dst_gso_max_size(struct sock
*sk
, struct dst_entry
*dst
)
2499 bool is_ipv6
= false;
2502 #if IS_ENABLED(CONFIG_IPV6)
2503 is_ipv6
= (sk
->sk_family
== AF_INET6
&&
2504 !ipv6_addr_v4mapped(&sk
->sk_v6_rcv_saddr
));
2506 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2507 max_size
= is_ipv6
? READ_ONCE(dst
->dev
->gso_max_size
) :
2508 READ_ONCE(dst
->dev
->gso_ipv4_max_size
);
2509 if (max_size
> GSO_LEGACY_MAX_SIZE
&& !sk_is_tcp(sk
))
2510 max_size
= GSO_LEGACY_MAX_SIZE
;
2512 return max_size
- (MAX_TCP_HEADER
+ 1);
2515 void sk_setup_caps(struct sock
*sk
, struct dst_entry
*dst
)
2519 sk
->sk_route_caps
= dst
->dev
->features
;
2521 sk
->sk_route_caps
|= NETIF_F_GSO
;
2522 if (sk
->sk_route_caps
& NETIF_F_GSO
)
2523 sk
->sk_route_caps
|= NETIF_F_GSO_SOFTWARE
;
2524 if (unlikely(sk
->sk_gso_disabled
))
2525 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2526 if (sk_can_gso(sk
)) {
2527 if (dst
->header_len
&& !xfrm_dst_offload_ok(dst
)) {
2528 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2530 sk
->sk_route_caps
|= NETIF_F_SG
| NETIF_F_HW_CSUM
;
2531 sk
->sk_gso_max_size
= sk_dst_gso_max_size(sk
, dst
);
2532 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2533 max_segs
= max_t(u32
, READ_ONCE(dst
->dev
->gso_max_segs
), 1);
2536 sk
->sk_gso_max_segs
= max_segs
;
2537 sk_dst_set(sk
, dst
);
2539 EXPORT_SYMBOL_GPL(sk_setup_caps
);
/*
 * Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	if (unlikely(!sk_fullsock(sk)))
		return skb_set_owner_edemux(skb, sk);

	skb->sk = sk;
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
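/* Illustrative pairing (not part of the original file): a protocol that
 * charges an skb to the socket's write budget with skb_set_owner_w()
 * relies on sock_wfree() running when the skb is finally freed to return
 * the truesize to sk_wmem_alloc and wake writers. A minimal transmit
 * helper under that assumption might look like:
 *
 *	skb = alloc_skb(len, sk->sk_allocation);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);	// charges skb->truesize to sk_wmem_alloc
 *	... build headers and payload ...
 *	dev_queue_xmit(skb);		// sock_wfree() uncharges at kfree_skb() time
 */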
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb_is_decrypted(skb))
		return false;

	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
		inet_reqsk(sk)->rsk_listener = NULL;
		reqsk_free(inet_reqsk(sk));
		return;
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long __sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force ||
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

	if ((unsigned int)size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
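/* Usage sketch (illustrative, not part of the original file): option
 * memory obtained with sock_kmalloc() is bounded by optmem_max and must
 * be released with sock_kfree_s() (or sock_kzfree_s() for sensitive data)
 * using the same size, so sk_omem_alloc stays balanced:
 *
 *	struct ip_options_rcu *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt) + optlen);
 */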
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
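/* Illustrative caller (not part of the original file): datagram protocols
 * usually reach this helper asking for linear data only (data_len == 0,
 * max_page_order == 0), blocking according to MSG_DONTWAIT:
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + tlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;	// -EAGAIN, -EPIPE or the pending sock_error()
 */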
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	case SCM_TS_OPT_ID:
		tsflags = READ_ONCE(sk->sk_tsflags);
		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
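/* Illustrative sendmsg-path usage (not part of the original file): the
 * caller seeds a sockcm_cookie from the socket defaults and then lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages before
 * building packets:
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);	// per-socket mark/tsflags defaults
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */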
static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
				     tcp_leave_memory_pressure, sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
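/* Illustrative use (not part of the original file): a sendmsg
 * implementation that copies user data into the per-socket page_frag
 * typically loops like this, treating a refill failure as memory
 * pressure (the helper has already moderated the send buffer):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */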
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
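/* Illustrative recvmsg-path usage (not part of the original file):
 * sk_wait_data() is called with the socket lock held; it releases and
 * re-acquires the lock via sk_wait_event(), so the caller re-checks the
 * receive queue afterwards:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */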
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT stand when usage exceeds the global or memcg
	 * hard limit, or else a DoS attack could be mounted by spawning
	 * lots of sockets whose usage stays under the minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones below average usage
		 * to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
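/* Worked example (illustrative, not part of the original file): callers
 * account in whole pages. Charging a 3000 byte skb on a 4K PAGE_SIZE
 * system gives sk_mem_pages(3000) == 1, so sk_forward_alloc is bumped by
 * 1 << PAGE_SHIFT == 4096 bytes up front and rolled back only if
 * __sk_mem_raise_allocated() refuses the page. A typical receive-side
 * check built on top of this (a sketch, not a prescribed pattern):
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;	// ends up in __sk_mem_schedule(..., SK_MEM_RECV)
 */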
/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_global_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}

/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= PAGE_SHIFT;
	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		struct socket_wq *wq = rcu_dereference(sk->sk_wq);

		/* rely on refcount_sub from sock_wfree() */
		smp_mb__after_atomic();
		if (wq && waitqueue_active(&wq->wait))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
}

static void sock_def_destruct(struct sock *sk)
{
}
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(sk->sk_socket->file))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id	=	0;
	sk->sk_ll_usec	=	READ_ONCE(sysctl_net_busy_read);
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	kuid_t uid = sock ?
		SOCK_INODE(sock)->i_uid :
		make_kuid(sock_net(sk)->user_ns, 0);

	sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
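/* Canonical locking pattern (illustrative, not part of the original
 * file): process context takes the "mutex-like" owner bit with
 * lock_sock() and drops it with release_sock(), which also flushes any
 * packets that softirq context parked on the backlog in the meantime:
 *
 *	lock_sock(sk);
 *	... modify socket state, call sk->sk_prot operations ...
 *	release_sock(sk);	// runs __release_sock() and release_cb
 */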
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers from
		 * proceeding, so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what is the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one: detach it from networking.
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
#ifdef CONFIG_PROC_FS
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;
	return 0;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

#endif
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
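/* Registration sketch (illustrative, not part of the original file): a
 * protocol module registers its struct proto once at init time and
 * unregisters it on exit; with alloc_slab == 1 a kmem_cache sized by
 * .obj_size is created for its sockets. "foo_proto" and "struct foo_sock"
 * are hypothetical names used only for this example:
 *
 *	static struct proto foo_proto = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_proto, 1);
 *	...
 *	proto_unregister(&foo_proto);
 */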
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
*seq
, struct proto
*proto
)
4157 seq_printf(seq
, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4158 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4161 sock_prot_inuse_get(seq_file_net(seq
), proto
),
4162 sock_prot_memory_allocated(proto
),
4163 sock_prot_memory_pressure(proto
),
4165 proto
->slab
== NULL
? "no" : "yes",
4166 module_name(proto
->owner
),
4167 proto_method_implemented(proto
->close
),
4168 proto_method_implemented(proto
->connect
),
4169 proto_method_implemented(proto
->disconnect
),
4170 proto_method_implemented(proto
->accept
),
4171 proto_method_implemented(proto
->ioctl
),
4172 proto_method_implemented(proto
->init
),
4173 proto_method_implemented(proto
->destroy
),
4174 proto_method_implemented(proto
->shutdown
),
4175 proto_method_implemented(proto
->setsockopt
),
4176 proto_method_implemented(proto
->getsockopt
),
4177 proto_method_implemented(proto
->sendmsg
),
4178 proto_method_implemented(proto
->recvmsg
),
4179 proto_method_implemented(proto
->bind
),
4180 proto_method_implemented(proto
->backlog_rcv
),
4181 proto_method_implemented(proto
->hash
),
4182 proto_method_implemented(proto
->unhash
),
4183 proto_method_implemented(proto
->get_port
),
4184 proto_method_implemented(proto
->enter_memory_pressure
));
4187 static int proto_seq_show(struct seq_file
*seq
, void *v
)
4189 if (v
== &proto_list
)
4190 seq_printf(seq
, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4199 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4201 proto_seq_printf(seq
, list_entry(v
, struct proto
, node
));
4205 static const struct seq_operations proto_seq_ops
= {
4206 .start
= proto_seq_start
,
4207 .next
= proto_seq_next
,
4208 .stop
= proto_seq_stop
,
4209 .show
= proto_seq_show
,
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
/* Copy 'size' bytes from userspace and return 'size' back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);
/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace as input argument.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}
/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl), and copies back the result to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If ioctl was processed, returns its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);