// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
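
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * protocol-level permission check might combine the helpers above as
 * follows (the function name and the exact policy are assumptions made
 * purely for illustration):
 *
 *	static int example_may_tune_socket(struct sock *sk)
 *	{
 *		// CAP_NET_ADMIN in the initial user namespace ...
 *		if (sk_capable(sk, CAP_NET_ADMIN))
 *			return 0;
 *		// ... or CAP_NET_ADMIN over the socket's own netns.
 *		if (sk_net_capable(sk, CAP_NET_ADMIN))
 *			return 0;
 *		return -EPERM;
 *	}
 */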
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF" ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
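
/*
 * Illustrative sketch, not part of the original file: a kernel user whose
 * socket services memory reclaim (for example a network swap/block
 * transport) would typically flag its socket as below; the function name
 * is hypothetical.
 *
 *	static void example_mark_reclaim_socket(struct sock *sk)
 *	{
 *		// Let this socket dip into emergency reserves so reclaim
 *		// traffic cannot deadlock on memory it is trying to free.
 *		sk_set_memalloc(sk);
 *	}
 *
 * and call sk_clear_memalloc(sk) once the socket no longer backs reclaim.
 */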
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
				 tcp_v6_do_rcv,
				 tcp_v4_do_rcv,
				 sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
void sk_error_report(struct sock *sk)
{
	sk->sk_error_report(sk);

	switch (sk->sk_family) {
	case AF_INET:
		fallthrough;
	case AF_INET6:
		trace_inet_sk_error_report(sk);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(sk_error_report);
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}
EXPORT_SYMBOL(sock_get_timeout);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv->tv_sec = old_tv.tv_sec;
		tv->tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	val = MAX_SCHEDULE_TIMEOUT;
	if ((tv.tv_sec || tv.tv_usec) &&
	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
						    USEC_PER_SEC / HZ);
	WRITE_ONCE(*timeo_p, val);
	return 0;
}
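
/*
 * Worked example, not part of the original file (assuming HZ == 1000):
 * a userspace timeout of 1.5 seconds,
 *
 *	struct timeval tv = { .tv_sec = 1, .tv_usec = 500000 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * is stored as 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 * = 1000 + 500 = 1500 jiffies, while an all-zero timeval leaves the
 * timeout at MAX_SCHEDULE_TIMEOUT, i.e. "block forever".
 */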
static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
			      enum skb_drop_reason *reason)
{
	enum skb_drop_reason drop_reason;
	int err;

	err = sk_filter(sk, skb);
	if (err) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto out;
	}
	err = __sock_queue_rcv_skb(sk, skb);
	switch (err) {
	case -ENOMEM:
		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		break;
	case -ENOBUFS:
		drop_reason = SKB_DROP_REASON_PROTO_MEM;
		break;
	default:
		drop_reason = SKB_NOT_DROPPED_YET;
		break;
	}
out:
	if (reason)
		*reason = drop_reason;
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
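
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * protocol receive handler would typically feed packets through the
 * helper above and report the returned drop reason, e.g.:
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */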
551 int __sk_receive_skb(struct sock
*sk
, struct sk_buff
*skb
,
552 const int nested
, unsigned int trim_cap
, bool refcounted
)
554 int rc
= NET_RX_SUCCESS
;
556 if (sk_filter_trim_cap(sk
, skb
, trim_cap
))
557 goto discard_and_relse
;
561 if (sk_rcvqueues_full(sk
, READ_ONCE(sk
->sk_rcvbuf
))) {
562 atomic_inc(&sk
->sk_drops
);
563 goto discard_and_relse
;
566 bh_lock_sock_nested(sk
);
569 if (!sock_owned_by_user(sk
)) {
571 * trylock + unlock semantics:
573 mutex_acquire(&sk
->sk_lock
.dep_map
, 0, 1, _RET_IP_
);
575 rc
= sk_backlog_rcv(sk
, skb
);
577 mutex_release(&sk
->sk_lock
.dep_map
, _RET_IP_
);
578 } else if (sk_add_backlog(sk
, skb
, READ_ONCE(sk
->sk_rcvbuf
))) {
580 atomic_inc(&sk
->sk_drops
);
581 goto discard_and_relse
;
593 EXPORT_SYMBOL(__sk_receive_skb
);
595 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ip6_dst_check(struct dst_entry
*,
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ipv4_dst_check(struct dst_entry
*,
599 struct dst_entry
*__sk_dst_check(struct sock
*sk
, u32 cookie
)
601 struct dst_entry
*dst
= __sk_dst_get(sk
);
603 if (dst
&& dst
->obsolete
&&
604 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
605 dst
, cookie
) == NULL
) {
606 sk_tx_queue_clear(sk
);
607 WRITE_ONCE(sk
->sk_dst_pending_confirm
, 0);
608 RCU_INIT_POINTER(sk
->sk_dst_cache
, NULL
);
615 EXPORT_SYMBOL(__sk_dst_check
);
617 struct dst_entry
*sk_dst_check(struct sock
*sk
, u32 cookie
)
619 struct dst_entry
*dst
= sk_dst_get(sk
);
621 if (dst
&& dst
->obsolete
&&
622 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
623 dst
, cookie
) == NULL
) {
631 EXPORT_SYMBOL(sk_dst_check
);
633 static int sock_bindtoindex_locked(struct sock
*sk
, int ifindex
)
635 int ret
= -ENOPROTOOPT
;
636 #ifdef CONFIG_NETDEVICES
637 struct net
*net
= sock_net(sk
);
641 if (sk
->sk_bound_dev_if
&& !ns_capable(net
->user_ns
, CAP_NET_RAW
))
648 /* Paired with all READ_ONCE() done locklessly. */
649 WRITE_ONCE(sk
->sk_bound_dev_if
, ifindex
);
651 if (sk
->sk_prot
->rehash
)
652 sk
->sk_prot
->rehash(sk
);
663 int sock_bindtoindex(struct sock
*sk
, int ifindex
, bool lock_sk
)
669 ret
= sock_bindtoindex_locked(sk
, ifindex
);
675 EXPORT_SYMBOL(sock_bindtoindex
);
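
/*
 * Illustrative sketch, not part of the original file: an in-kernel user
 * that already knows the ifindex of the device it wants (here assumed to
 * be in 'dev') can bind its socket with the helper above:
 *
 *	err = sock_bindtoindex(sk, dev ? dev->ifindex : 0, true);
 *
 * Passing lock_sk == true makes the helper take and release the socket
 * lock itself; an ifindex of 0 removes the binding.
 */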
677 static int sock_setbindtodevice(struct sock
*sk
, sockptr_t optval
, int optlen
)
679 int ret
= -ENOPROTOOPT
;
680 #ifdef CONFIG_NETDEVICES
681 struct net
*net
= sock_net(sk
);
682 char devname
[IFNAMSIZ
];
689 /* Bind this socket to a particular device like "eth0",
690 * as specified in the passed interface name. If the
691 * name is "" or the option length is zero the socket
694 if (optlen
> IFNAMSIZ
- 1)
695 optlen
= IFNAMSIZ
- 1;
696 memset(devname
, 0, sizeof(devname
));
699 if (copy_from_sockptr(devname
, optval
, optlen
))
703 if (devname
[0] != '\0') {
704 struct net_device
*dev
;
707 dev
= dev_get_by_name_rcu(net
, devname
);
709 index
= dev
->ifindex
;
716 sockopt_lock_sock(sk
);
717 ret
= sock_bindtoindex_locked(sk
, index
);
718 sockopt_release_sock(sk
);
725 static int sock_getbindtodevice(struct sock
*sk
, sockptr_t optval
,
726 sockptr_t optlen
, int len
)
728 int ret
= -ENOPROTOOPT
;
729 #ifdef CONFIG_NETDEVICES
730 int bound_dev_if
= READ_ONCE(sk
->sk_bound_dev_if
);
731 struct net
*net
= sock_net(sk
);
732 char devname
[IFNAMSIZ
];
734 if (bound_dev_if
== 0) {
743 ret
= netdev_get_name(net
, devname
, bound_dev_if
);
747 len
= strlen(devname
) + 1;
750 if (copy_to_sockptr(optval
, devname
, len
))
755 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
bool sk_mc_loop(const struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	/* IPV6_ADDRFORM can change sk->sk_family under us. */
	switch (READ_ONCE(sk->sk_family)) {
	case AF_INET:
		return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_test_bit(MC6_LOOP, sk);
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
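
/*
 * Illustrative sketch, not part of the original file: the small setters
 * above exist so in-kernel socket users do not have to go through
 * setsockopt(). A hypothetical kernel client created via
 * sock_create_kern() might configure itself like this:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);		// 5 second send timeout
 */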
828 static void __sock_set_timestamps(struct sock
*sk
, bool val
, bool new, bool ns
)
830 sock_valbool_flag(sk
, SOCK_RCVTSTAMP
, val
);
831 sock_valbool_flag(sk
, SOCK_RCVTSTAMPNS
, val
&& ns
);
833 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, new);
834 sock_enable_timestamp(sk
, SOCK_TIMESTAMP
);
838 void sock_enable_timestamps(struct sock
*sk
)
841 __sock_set_timestamps(sk
, true, false, true);
844 EXPORT_SYMBOL(sock_enable_timestamps
);
846 void sock_set_timestamp(struct sock
*sk
, int optname
, bool valbool
)
849 case SO_TIMESTAMP_OLD
:
850 __sock_set_timestamps(sk
, valbool
, false, false);
852 case SO_TIMESTAMP_NEW
:
853 __sock_set_timestamps(sk
, valbool
, true, false);
855 case SO_TIMESTAMPNS_OLD
:
856 __sock_set_timestamps(sk
, valbool
, false, true);
858 case SO_TIMESTAMPNS_NEW
:
859 __sock_set_timestamps(sk
, valbool
, true, true);
864 static int sock_timestamping_bind_phc(struct sock
*sk
, int phc_index
)
866 struct net
*net
= sock_net(sk
);
867 struct net_device
*dev
= NULL
;
872 if (sk
->sk_bound_dev_if
)
873 dev
= dev_get_by_index(net
, sk
->sk_bound_dev_if
);
		pr_err("%s: socket is not bound to a device\n", __func__);
880 num
= ethtool_get_phc_vclocks(dev
, &vclock_index
);
883 for (i
= 0; i
< num
; i
++) {
884 if (*(vclock_index
+ i
) == phc_index
) {
896 WRITE_ONCE(sk
->sk_bind_phc
, phc_index
);
901 int sock_set_timestamping(struct sock
*sk
, int optname
,
902 struct so_timestamping timestamping
)
904 int val
= timestamping
.flags
;
907 if (val
& ~SOF_TIMESTAMPING_MASK
)
910 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
&&
911 !(val
& SOF_TIMESTAMPING_OPT_ID
))
914 if (val
& SOF_TIMESTAMPING_OPT_ID
&&
915 !(sk
->sk_tsflags
& SOF_TIMESTAMPING_OPT_ID
)) {
917 if ((1 << sk
->sk_state
) &
918 (TCPF_CLOSE
| TCPF_LISTEN
))
920 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
)
921 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->write_seq
);
923 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->snd_una
);
925 atomic_set(&sk
->sk_tskey
, 0);
929 if (val
& SOF_TIMESTAMPING_OPT_STATS
&&
930 !(val
& SOF_TIMESTAMPING_OPT_TSONLY
))
933 if (val
& SOF_TIMESTAMPING_BIND_PHC
) {
934 ret
= sock_timestamping_bind_phc(sk
, timestamping
.bind_phc
);
939 WRITE_ONCE(sk
->sk_tsflags
, val
);
940 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, optname
== SO_TIMESTAMPING_NEW
);
942 if (val
& SOF_TIMESTAMPING_RX_SOFTWARE
)
943 sock_enable_timestamp(sk
,
944 SOCK_TIMESTAMPING_RX_SOFTWARE
);
946 sock_disable_timestamp(sk
,
947 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE
));
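
/*
 * Illustrative userspace sketch, not part of the original file: the
 * struct so_timestamping form of SO_TIMESTAMPING handled above can bind
 * the socket to a PHC vclock (the vclock index 0 below is an assumption;
 * the socket must already be bound to the corresponding device):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 0,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */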
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead. Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
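
/*
 * Worked example, not part of the original file: because of the doubling
 * above, a userspace request such as
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * stores 131072 in sk->sk_rcvbuf (assuming net.core.rmem_max allows it),
 * and a subsequent getsockopt(SO_RCVBUF) reports that doubled value.
 */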
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
1006 static void sock_release_reserved_memory(struct sock
*sk
, int bytes
)
1008 /* Round down bytes to multiple of pages */
1009 bytes
= round_down(bytes
, PAGE_SIZE
);
1011 WARN_ON(bytes
> sk
->sk_reserved_mem
);
1012 WRITE_ONCE(sk
->sk_reserved_mem
, sk
->sk_reserved_mem
- bytes
);
1016 static int sock_reserve_memory(struct sock
*sk
, int bytes
)
1022 if (!mem_cgroup_sockets_enabled
|| !sk
->sk_memcg
|| !sk_has_account(sk
))
1028 pages
= sk_mem_pages(bytes
);
1030 /* pre-charge to memcg */
1031 charged
= mem_cgroup_charge_skmem(sk
->sk_memcg
, pages
,
1032 GFP_KERNEL
| __GFP_RETRY_MAYFAIL
);
1036 /* pre-charge to forward_alloc */
1037 sk_memory_allocated_add(sk
, pages
);
1038 allocated
= sk_memory_allocated(sk
);
1039 /* If the system goes into memory pressure with this
1040 * precharge, give up and return error.
1042 if (allocated
> sk_prot_mem_limits(sk
, 1)) {
1043 sk_memory_allocated_sub(sk
, pages
);
1044 mem_cgroup_uncharge_skmem(sk
->sk_memcg
, pages
);
1047 sk_forward_alloc_add(sk
, pages
<< PAGE_SHIFT
);
1049 WRITE_ONCE(sk
->sk_reserved_mem
,
1050 sk
->sk_reserved_mem
+ (pages
<< PAGE_SHIFT
));
1055 #ifdef CONFIG_PAGE_POOL
1057 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1058 * in 1 syscall. The limit exists to limit the amount of memory the kernel
1059 * allocates to copy these tokens, and to prevent looping over the frags for
1062 #define MAX_DONTNEED_TOKENS 128
1063 #define MAX_DONTNEED_FRAGS 1024
1065 static noinline_for_stack
int
1066 sock_devmem_dontneed(struct sock
*sk
, sockptr_t optval
, unsigned int optlen
)
1068 unsigned int num_tokens
, i
, j
, k
, netmem_num
= 0;
1069 struct dmabuf_token
*tokens
;
1070 int ret
= 0, num_frags
= 0;
1071 netmem_ref netmems
[16];
1076 if (optlen
% sizeof(*tokens
) ||
1077 optlen
> sizeof(*tokens
) * MAX_DONTNEED_TOKENS
)
1080 num_tokens
= optlen
/ sizeof(*tokens
);
1081 tokens
= kvmalloc_array(num_tokens
, sizeof(*tokens
), GFP_KERNEL
);
1085 if (copy_from_sockptr(tokens
, optval
, optlen
)) {
1090 xa_lock_bh(&sk
->sk_user_frags
);
1091 for (i
= 0; i
< num_tokens
; i
++) {
1092 for (j
= 0; j
< tokens
[i
].token_count
; j
++) {
1093 if (++num_frags
> MAX_DONTNEED_FRAGS
)
1094 goto frag_limit_reached
;
1096 netmem_ref netmem
= (__force netmem_ref
)__xa_erase(
1097 &sk
->sk_user_frags
, tokens
[i
].token_start
+ j
);
1099 if (!netmem
|| WARN_ON_ONCE(!netmem_is_net_iov(netmem
)))
1102 netmems
[netmem_num
++] = netmem
;
1103 if (netmem_num
== ARRAY_SIZE(netmems
)) {
1104 xa_unlock_bh(&sk
->sk_user_frags
);
1105 for (k
= 0; k
< netmem_num
; k
++)
1106 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1108 xa_lock_bh(&sk
->sk_user_frags
);
1115 xa_unlock_bh(&sk
->sk_user_frags
);
1116 for (k
= 0; k
< netmem_num
; k
++)
1117 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1124 void sockopt_lock_sock(struct sock
*sk
)
1126 /* When current->bpf_ctx is set, the setsockopt is called from
1127 * a bpf prog. bpf has ensured the sk lock has been
1128 * acquired before calling setsockopt().
1130 if (has_current_bpf_ctx())
1135 EXPORT_SYMBOL(sockopt_lock_sock
);
1137 void sockopt_release_sock(struct sock
*sk
)
1139 if (has_current_bpf_ctx())
1144 EXPORT_SYMBOL(sockopt_release_sock
);
1146 bool sockopt_ns_capable(struct user_namespace
*ns
, int cap
)
1148 return has_current_bpf_ctx() || ns_capable(ns
, cap
);
1150 EXPORT_SYMBOL(sockopt_ns_capable
);
1152 bool sockopt_capable(int cap
)
1154 return has_current_bpf_ctx() || capable(cap
);
1156 EXPORT_SYMBOL(sockopt_capable
);
1158 static int sockopt_validate_clockid(__kernel_clockid_t value
)
1161 case CLOCK_REALTIME
:
1162 case CLOCK_MONOTONIC
:
1170 * This is meant for all protocols to use and covers goings on
1171 * at the socket level. Everything here is generic.
1174 int sk_setsockopt(struct sock
*sk
, int level
, int optname
,
1175 sockptr_t optval
, unsigned int optlen
)
1177 struct so_timestamping timestamping
;
1178 struct socket
*sock
= sk
->sk_socket
;
1179 struct sock_txtime sk_txtime
;
1186 * Options without arguments
1189 if (optname
== SO_BINDTODEVICE
)
1190 return sock_setbindtodevice(sk
, optval
, optlen
);
1192 if (optlen
< sizeof(int))
1195 if (copy_from_sockptr(&val
, optval
, sizeof(val
)))
1198 valbool
= val
? 1 : 0;
1200 /* handle options which do not require locking the socket. */
1203 if (sk_set_prio_allowed(sk
, val
)) {
1204 sock_set_priority(sk
, val
);
1209 assign_bit(SOCK_PASSSEC
, &sock
->flags
, valbool
);
1212 assign_bit(SOCK_PASSCRED
, &sock
->flags
, valbool
);
1215 assign_bit(SOCK_PASSPIDFD
, &sock
->flags
, valbool
);
1221 return -ENOPROTOOPT
;
1222 #ifdef CONFIG_NET_RX_BUSY_POLL
1226 WRITE_ONCE(sk
->sk_ll_usec
, val
);
1228 case SO_PREFER_BUSY_POLL
:
1229 if (valbool
&& !sockopt_capable(CAP_NET_ADMIN
))
1231 WRITE_ONCE(sk
->sk_prefer_busy_poll
, valbool
);
1233 case SO_BUSY_POLL_BUDGET
:
1234 if (val
> READ_ONCE(sk
->sk_busy_poll_budget
) &&
1235 !sockopt_capable(CAP_NET_ADMIN
))
1237 if (val
< 0 || val
> U16_MAX
)
1239 WRITE_ONCE(sk
->sk_busy_poll_budget
, val
);
1242 case SO_MAX_PACING_RATE
:
1244 unsigned long ulval
= (val
== ~0U) ? ~0UL : (unsigned int)val
;
1245 unsigned long pacing_rate
;
1247 if (sizeof(ulval
) != sizeof(val
) &&
1248 optlen
>= sizeof(ulval
) &&
1249 copy_from_sockptr(&ulval
, optval
, sizeof(ulval
))) {
1253 cmpxchg(&sk
->sk_pacing_status
,
1256 /* Pairs with READ_ONCE() from sk_getsockopt() */
1257 WRITE_ONCE(sk
->sk_max_pacing_rate
, ulval
);
1258 pacing_rate
= READ_ONCE(sk
->sk_pacing_rate
);
1259 if (ulval
< pacing_rate
)
1260 WRITE_ONCE(sk
->sk_pacing_rate
, ulval
);
1264 if (val
< -1 || val
> 1)
1266 if ((u8
)val
== SOCK_TXREHASH_DEFAULT
)
1267 val
= READ_ONCE(sock_net(sk
)->core
.sysctl_txrehash
);
1268 /* Paired with READ_ONCE() in tcp_rtx_synack()
1269 * and sk_getsockopt().
1271 WRITE_ONCE(sk
->sk_txrehash
, (u8
)val
);
1275 int (*set_peek_off
)(struct sock
*sk
, int val
);
1277 set_peek_off
= READ_ONCE(sock
->ops
)->set_peek_off
;
1279 ret
= set_peek_off(sk
, val
);
1284 #ifdef CONFIG_PAGE_POOL
1285 case SO_DEVMEM_DONTNEED
:
1286 return sock_devmem_dontneed(sk
, optval
, optlen
);
1290 sockopt_lock_sock(sk
);
1294 if (val
&& !sockopt_capable(CAP_NET_ADMIN
))
1297 sock_valbool_flag(sk
, SOCK_DBG
, valbool
);
1300 sk
->sk_reuse
= (valbool
? SK_CAN_REUSE
: SK_NO_REUSE
);
1303 if (valbool
&& !sk_is_inet(sk
))
1306 sk
->sk_reuseport
= valbool
;
1309 sock_valbool_flag(sk
, SOCK_LOCALROUTE
, valbool
);
1313 sock_valbool_flag(sk
, SOCK_BROADCAST
, valbool
);
		/* Don't return an error on this; BSD doesn't either, and if
		 * you think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF are
		 * treated in BSD as hints.
		 */
1321 val
= min_t(u32
, val
, READ_ONCE(sysctl_wmem_max
));
1323 /* Ensure val * 2 fits into an int, to prevent max_t()
1324 * from treating it as a negative value.
1326 val
= min_t(int, val
, INT_MAX
/ 2);
1327 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
1328 WRITE_ONCE(sk
->sk_sndbuf
,
1329 max_t(int, val
* 2, SOCK_MIN_SNDBUF
));
1330 /* Wake up sending tasks if we upped the value. */
1331 sk
->sk_write_space(sk
);
1334 case SO_SNDBUFFORCE
:
1335 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1340 /* No negative values (to prevent underflow, as val will be
		/* Don't return an error on this; BSD doesn't either, and if
		 * you think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF are
		 * treated in BSD as hints.
		 */
1353 __sock_set_rcvbuf(sk
, min_t(u32
, val
, READ_ONCE(sysctl_rmem_max
)));
1356 case SO_RCVBUFFORCE
:
1357 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1362 /* No negative values (to prevent underflow, as val will be
1365 __sock_set_rcvbuf(sk
, max(val
, 0));
1369 if (sk
->sk_prot
->keepalive
)
1370 sk
->sk_prot
->keepalive(sk
, valbool
);
1371 sock_valbool_flag(sk
, SOCK_KEEPOPEN
, valbool
);
1375 sock_valbool_flag(sk
, SOCK_URGINLINE
, valbool
);
1379 sk
->sk_no_check_tx
= valbool
;
1383 if (optlen
< sizeof(ling
)) {
1384 ret
= -EINVAL
; /* 1003.1g */
1387 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
))) {
1391 if (!ling
.l_onoff
) {
1392 sock_reset_flag(sk
, SOCK_LINGER
);
1394 unsigned long t_sec
= ling
.l_linger
;
1396 if (t_sec
>= MAX_SCHEDULE_TIMEOUT
/ HZ
)
1397 WRITE_ONCE(sk
->sk_lingertime
, MAX_SCHEDULE_TIMEOUT
);
1399 WRITE_ONCE(sk
->sk_lingertime
, t_sec
* HZ
);
1400 sock_set_flag(sk
, SOCK_LINGER
);
1407 case SO_TIMESTAMP_OLD
:
1408 case SO_TIMESTAMP_NEW
:
1409 case SO_TIMESTAMPNS_OLD
:
1410 case SO_TIMESTAMPNS_NEW
:
1411 sock_set_timestamp(sk
, optname
, valbool
);
1414 case SO_TIMESTAMPING_NEW
:
1415 case SO_TIMESTAMPING_OLD
:
1416 if (optlen
== sizeof(timestamping
)) {
1417 if (copy_from_sockptr(×tamping
, optval
,
1418 sizeof(timestamping
))) {
1423 memset(×tamping
, 0, sizeof(timestamping
));
1424 timestamping
.flags
= val
;
1426 ret
= sock_set_timestamping(sk
, optname
, timestamping
);
1431 int (*set_rcvlowat
)(struct sock
*sk
, int val
) = NULL
;
1436 set_rcvlowat
= READ_ONCE(sock
->ops
)->set_rcvlowat
;
1438 ret
= set_rcvlowat(sk
, val
);
1440 WRITE_ONCE(sk
->sk_rcvlowat
, val
? : 1);
1443 case SO_RCVTIMEO_OLD
:
1444 case SO_RCVTIMEO_NEW
:
1445 ret
= sock_set_timeout(&sk
->sk_rcvtimeo
, optval
,
1446 optlen
, optname
== SO_RCVTIMEO_OLD
);
1449 case SO_SNDTIMEO_OLD
:
1450 case SO_SNDTIMEO_NEW
:
1451 ret
= sock_set_timeout(&sk
->sk_sndtimeo
, optval
,
1452 optlen
, optname
== SO_SNDTIMEO_OLD
);
1455 case SO_ATTACH_FILTER
: {
1456 struct sock_fprog fprog
;
1458 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1460 ret
= sk_attach_filter(&fprog
, sk
);
1465 if (optlen
== sizeof(u32
)) {
1469 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1472 ret
= sk_attach_bpf(ufd
, sk
);
1476 case SO_ATTACH_REUSEPORT_CBPF
: {
1477 struct sock_fprog fprog
;
1479 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1481 ret
= sk_reuseport_attach_filter(&fprog
, sk
);
1484 case SO_ATTACH_REUSEPORT_EBPF
:
1486 if (optlen
== sizeof(u32
)) {
1490 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1493 ret
= sk_reuseport_attach_bpf(ufd
, sk
);
1497 case SO_DETACH_REUSEPORT_BPF
:
1498 ret
= reuseport_detach_prog(sk
);
1501 case SO_DETACH_FILTER
:
1502 ret
= sk_detach_filter(sk
);
1505 case SO_LOCK_FILTER
:
1506 if (sock_flag(sk
, SOCK_FILTER_LOCKED
) && !valbool
)
1509 sock_valbool_flag(sk
, SOCK_FILTER_LOCKED
, valbool
);
1513 if (!sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) &&
1514 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1519 __sock_set_mark(sk
, val
);
1522 sock_valbool_flag(sk
, SOCK_RCVMARK
, valbool
);
1525 case SO_RCVPRIORITY
:
1526 sock_valbool_flag(sk
, SOCK_RCVPRIORITY
, valbool
);
1530 sock_valbool_flag(sk
, SOCK_RXQ_OVFL
, valbool
);
1533 case SO_WIFI_STATUS
:
1534 sock_valbool_flag(sk
, SOCK_WIFI_STATUS
, valbool
);
1538 sock_valbool_flag(sk
, SOCK_NOFCS
, valbool
);
1541 case SO_SELECT_ERR_QUEUE
:
1542 sock_valbool_flag(sk
, SOCK_SELECT_ERR_QUEUE
, valbool
);
1546 case SO_INCOMING_CPU
:
1547 reuseport_update_incoming_cpu(sk
, val
);
1552 dst_negative_advice(sk
);
1556 if (sk
->sk_family
== PF_INET
|| sk
->sk_family
== PF_INET6
) {
1557 if (!(sk_is_tcp(sk
) ||
1558 (sk
->sk_type
== SOCK_DGRAM
&&
1559 sk
->sk_protocol
== IPPROTO_UDP
)))
1561 } else if (sk
->sk_family
!= PF_RDS
) {
1565 if (val
< 0 || val
> 1)
1568 sock_valbool_flag(sk
, SOCK_ZEROCOPY
, valbool
);
1573 if (optlen
!= sizeof(struct sock_txtime
)) {
1576 } else if (copy_from_sockptr(&sk_txtime
, optval
,
1577 sizeof(struct sock_txtime
))) {
1580 } else if (sk_txtime
.flags
& ~SOF_TXTIME_FLAGS_MASK
) {
1584 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1585 * scheduler has enough safe guards.
1587 if (sk_txtime
.clockid
!= CLOCK_MONOTONIC
&&
1588 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1593 ret
= sockopt_validate_clockid(sk_txtime
.clockid
);
1597 sock_valbool_flag(sk
, SOCK_TXTIME
, true);
1598 sk
->sk_clockid
= sk_txtime
.clockid
;
1599 sk
->sk_txtime_deadline_mode
=
1600 !!(sk_txtime
.flags
& SOF_TXTIME_DEADLINE_MODE
);
1601 sk
->sk_txtime_report_errors
=
1602 !!(sk_txtime
.flags
& SOF_TXTIME_REPORT_ERRORS
);
1605 case SO_BINDTOIFINDEX
:
1606 ret
= sock_bindtoindex_locked(sk
, val
);
1610 if (val
& ~SOCK_BUF_LOCK_MASK
) {
1614 sk
->sk_userlocks
= val
| (sk
->sk_userlocks
&
1615 ~SOCK_BUF_LOCK_MASK
);
1618 case SO_RESERVE_MEM
:
1627 delta
= val
- sk
->sk_reserved_mem
;
1629 sock_release_reserved_memory(sk
, -delta
);
1631 ret
= sock_reserve_memory(sk
, delta
);
1639 sockopt_release_sock(sk
);
1643 int sock_setsockopt(struct socket
*sock
, int level
, int optname
,
1644 sockptr_t optval
, unsigned int optlen
)
1646 return sk_setsockopt(sock
->sk
, level
, optname
,
1649 EXPORT_SYMBOL(sock_setsockopt
);
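
/*
 * Illustrative userspace sketch, not part of the original file: the
 * SO_LINGER case handled in sk_setsockopt() above corresponds to
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 10 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *
 * where l_linger is given in seconds and is converted to jiffies, capped
 * at MAX_SCHEDULE_TIMEOUT; l_onoff == 0 simply clears SOCK_LINGER.
 */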
1651 static const struct cred
*sk_get_peer_cred(struct sock
*sk
)
1653 const struct cred
*cred
;
1655 spin_lock(&sk
->sk_peer_lock
);
1656 cred
= get_cred(sk
->sk_peer_cred
);
1657 spin_unlock(&sk
->sk_peer_lock
);
1662 static void cred_to_ucred(struct pid
*pid
, const struct cred
*cred
,
1663 struct ucred
*ucred
)
1665 ucred
->pid
= pid_vnr(pid
);
1666 ucred
->uid
= ucred
->gid
= -1;
1668 struct user_namespace
*current_ns
= current_user_ns();
1670 ucred
->uid
= from_kuid_munged(current_ns
, cred
->euid
);
1671 ucred
->gid
= from_kgid_munged(current_ns
, cred
->egid
);
1675 static int groups_to_user(sockptr_t dst
, const struct group_info
*src
)
1677 struct user_namespace
*user_ns
= current_user_ns();
1680 for (i
= 0; i
< src
->ngroups
; i
++) {
1681 gid_t gid
= from_kgid_munged(user_ns
, src
->gid
[i
]);
1683 if (copy_to_sockptr_offset(dst
, i
* sizeof(gid
), &gid
, sizeof(gid
)))
1690 int sk_getsockopt(struct sock
*sk
, int level
, int optname
,
1691 sockptr_t optval
, sockptr_t optlen
)
1693 struct socket
*sock
= sk
->sk_socket
;
1698 unsigned long ulval
;
1700 struct old_timeval32 tm32
;
1701 struct __kernel_old_timeval tm
;
1702 struct __kernel_sock_timeval stm
;
1703 struct sock_txtime txtime
;
1704 struct so_timestamping timestamping
;
1707 int lv
= sizeof(int);
1710 if (copy_from_sockptr(&len
, optlen
, sizeof(int)))
1715 memset(&v
, 0, sizeof(v
));
1719 v
.val
= sock_flag(sk
, SOCK_DBG
);
1723 v
.val
= sock_flag(sk
, SOCK_LOCALROUTE
);
1727 v
.val
= sock_flag(sk
, SOCK_BROADCAST
);
1731 v
.val
= READ_ONCE(sk
->sk_sndbuf
);
1735 v
.val
= READ_ONCE(sk
->sk_rcvbuf
);
1739 v
.val
= sk
->sk_reuse
;
1743 v
.val
= sk
->sk_reuseport
;
1747 v
.val
= sock_flag(sk
, SOCK_KEEPOPEN
);
1751 v
.val
= sk
->sk_type
;
1755 v
.val
= sk
->sk_protocol
;
1759 v
.val
= sk
->sk_family
;
1763 v
.val
= -sock_error(sk
);
1765 v
.val
= xchg(&sk
->sk_err_soft
, 0);
1769 v
.val
= sock_flag(sk
, SOCK_URGINLINE
);
1773 v
.val
= sk
->sk_no_check_tx
;
1777 v
.val
= READ_ONCE(sk
->sk_priority
);
1781 lv
= sizeof(v
.ling
);
1782 v
.ling
.l_onoff
= sock_flag(sk
, SOCK_LINGER
);
1783 v
.ling
.l_linger
= READ_ONCE(sk
->sk_lingertime
) / HZ
;
1789 case SO_TIMESTAMP_OLD
:
1790 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) &&
1791 !sock_flag(sk
, SOCK_TSTAMP_NEW
) &&
1792 !sock_flag(sk
, SOCK_RCVTSTAMPNS
);
1795 case SO_TIMESTAMPNS_OLD
:
1796 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && !sock_flag(sk
, SOCK_TSTAMP_NEW
);
1799 case SO_TIMESTAMP_NEW
:
1800 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1803 case SO_TIMESTAMPNS_NEW
:
1804 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1807 case SO_TIMESTAMPING_OLD
:
1808 case SO_TIMESTAMPING_NEW
:
1809 lv
= sizeof(v
.timestamping
);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
		 */
1814 if (optname
== SO_TIMESTAMPING_OLD
|| sock_flag(sk
, SOCK_TSTAMP_NEW
)) {
1815 v
.timestamping
.flags
= READ_ONCE(sk
->sk_tsflags
);
1816 v
.timestamping
.bind_phc
= READ_ONCE(sk
->sk_bind_phc
);
1820 case SO_RCVTIMEO_OLD
:
1821 case SO_RCVTIMEO_NEW
:
1822 lv
= sock_get_timeout(READ_ONCE(sk
->sk_rcvtimeo
), &v
,
1823 SO_RCVTIMEO_OLD
== optname
);
1826 case SO_SNDTIMEO_OLD
:
1827 case SO_SNDTIMEO_NEW
:
1828 lv
= sock_get_timeout(READ_ONCE(sk
->sk_sndtimeo
), &v
,
1829 SO_SNDTIMEO_OLD
== optname
);
1833 v
.val
= READ_ONCE(sk
->sk_rcvlowat
);
1841 v
.val
= !!test_bit(SOCK_PASSCRED
, &sock
->flags
);
1845 v
.val
= !!test_bit(SOCK_PASSPIDFD
, &sock
->flags
);
1850 struct ucred peercred
;
1851 if (len
> sizeof(peercred
))
1852 len
= sizeof(peercred
);
1854 spin_lock(&sk
->sk_peer_lock
);
1855 cred_to_ucred(sk
->sk_peer_pid
, sk
->sk_peer_cred
, &peercred
);
1856 spin_unlock(&sk
->sk_peer_lock
);
1858 if (copy_to_sockptr(optval
, &peercred
, len
))
1865 struct pid
*peer_pid
;
1866 struct file
*pidfd_file
= NULL
;
1869 if (len
> sizeof(pidfd
))
1870 len
= sizeof(pidfd
);
1872 spin_lock(&sk
->sk_peer_lock
);
1873 peer_pid
= get_pid(sk
->sk_peer_pid
);
1874 spin_unlock(&sk
->sk_peer_lock
);
1879 pidfd
= pidfd_prepare(peer_pid
, 0, &pidfd_file
);
1884 if (copy_to_sockptr(optval
, &pidfd
, len
) ||
1885 copy_to_sockptr(optlen
, &len
, sizeof(int))) {
1886 put_unused_fd(pidfd
);
1892 fd_install(pidfd
, pidfd_file
);
1898 const struct cred
*cred
;
1901 cred
= sk_get_peer_cred(sk
);
1905 n
= cred
->group_info
->ngroups
;
1906 if (len
< n
* sizeof(gid_t
)) {
1907 len
= n
* sizeof(gid_t
);
1909 return copy_to_sockptr(optlen
, &len
, sizeof(int)) ? -EFAULT
: -ERANGE
;
1911 len
= n
* sizeof(gid_t
);
1913 ret
= groups_to_user(optval
, cred
->group_info
);
1922 struct sockaddr_storage address
;
1924 lv
= READ_ONCE(sock
->ops
)->getname(sock
, (struct sockaddr
*)&address
, 2);
1929 if (copy_to_sockptr(optval
, &address
, len
))
1934 /* Dubious BSD thing... Probably nobody even uses it, but
1935 * the UNIX standard wants it for whatever reason... -DaveM
1938 v
.val
= sk
->sk_state
== TCP_LISTEN
;
1942 v
.val
= !!test_bit(SOCK_PASSSEC
, &sock
->flags
);
1946 return security_socket_getpeersec_stream(sock
,
1947 optval
, optlen
, len
);
1950 v
.val
= READ_ONCE(sk
->sk_mark
);
1954 v
.val
= sock_flag(sk
, SOCK_RCVMARK
);
1957 case SO_RCVPRIORITY
:
1958 v
.val
= sock_flag(sk
, SOCK_RCVPRIORITY
);
1962 v
.val
= sock_flag(sk
, SOCK_RXQ_OVFL
);
1965 case SO_WIFI_STATUS
:
1966 v
.val
= sock_flag(sk
, SOCK_WIFI_STATUS
);
1970 if (!READ_ONCE(sock
->ops
)->set_peek_off
)
1973 v
.val
= READ_ONCE(sk
->sk_peek_off
);
1976 v
.val
= sock_flag(sk
, SOCK_NOFCS
);
1979 case SO_BINDTODEVICE
:
1980 return sock_getbindtodevice(sk
, optval
, optlen
, len
);
1983 len
= sk_get_filter(sk
, optval
, len
);
1989 case SO_LOCK_FILTER
:
1990 v
.val
= sock_flag(sk
, SOCK_FILTER_LOCKED
);
1993 case SO_BPF_EXTENSIONS
:
1994 v
.val
= bpf_tell_extensions();
1997 case SO_SELECT_ERR_QUEUE
:
1998 v
.val
= sock_flag(sk
, SOCK_SELECT_ERR_QUEUE
);
2001 #ifdef CONFIG_NET_RX_BUSY_POLL
2003 v
.val
= READ_ONCE(sk
->sk_ll_usec
);
2005 case SO_PREFER_BUSY_POLL
:
2006 v
.val
= READ_ONCE(sk
->sk_prefer_busy_poll
);
2010 case SO_MAX_PACING_RATE
:
2011 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2012 if (sizeof(v
.ulval
) != sizeof(v
.val
) && len
>= sizeof(v
.ulval
)) {
2013 lv
= sizeof(v
.ulval
);
2014 v
.ulval
= READ_ONCE(sk
->sk_max_pacing_rate
);
2017 v
.val
= min_t(unsigned long, ~0U,
2018 READ_ONCE(sk
->sk_max_pacing_rate
));
2022 case SO_INCOMING_CPU
:
2023 v
.val
= READ_ONCE(sk
->sk_incoming_cpu
);
2028 u32 meminfo
[SK_MEMINFO_VARS
];
2030 sk_get_meminfo(sk
, meminfo
);
2032 len
= min_t(unsigned int, len
, sizeof(meminfo
));
2033 if (copy_to_sockptr(optval
, &meminfo
, len
))
2039 #ifdef CONFIG_NET_RX_BUSY_POLL
2040 case SO_INCOMING_NAPI_ID
:
2041 v
.val
= READ_ONCE(sk
->sk_napi_id
);
2043 /* aggregate non-NAPI IDs down to 0 */
2044 if (v
.val
< MIN_NAPI_ID
)
2054 v
.val64
= sock_gen_cookie(sk
);
2058 v
.val
= sock_flag(sk
, SOCK_ZEROCOPY
);
2062 lv
= sizeof(v
.txtime
);
2063 v
.txtime
.clockid
= sk
->sk_clockid
;
2064 v
.txtime
.flags
|= sk
->sk_txtime_deadline_mode
?
2065 SOF_TXTIME_DEADLINE_MODE
: 0;
2066 v
.txtime
.flags
|= sk
->sk_txtime_report_errors
?
2067 SOF_TXTIME_REPORT_ERRORS
: 0;
2070 case SO_BINDTOIFINDEX
:
2071 v
.val
= READ_ONCE(sk
->sk_bound_dev_if
);
2074 case SO_NETNS_COOKIE
:
2078 v
.val64
= sock_net(sk
)->net_cookie
;
2082 v
.val
= sk
->sk_userlocks
& SOCK_BUF_LOCK_MASK
;
2085 case SO_RESERVE_MEM
:
2086 v
.val
= READ_ONCE(sk
->sk_reserved_mem
);
2090 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2091 v
.val
= READ_ONCE(sk
->sk_txrehash
);
2095 /* We implement the SO_SNDLOWAT etc to not be settable
2098 return -ENOPROTOOPT
;
2103 if (copy_to_sockptr(optval
, &v
, len
))
2106 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
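
/*
 * Illustrative userspace sketch, not part of the original file: the
 * SO_MEMINFO case of sk_getsockopt() above exports the same counters as
 * sock_diag, e.g.:
 *
 *	__u32 meminfo[SK_MEMINFO_VARS];
 *	socklen_t len = sizeof(meminfo);
 *
 *	if (!getsockopt(fd, SOL_SOCKET, SO_MEMINFO, meminfo, &len))
 *		printf("rmem %u rcvbuf %u\n",
 *		       meminfo[SK_MEMINFO_RMEM_ALLOC],
 *		       meminfo[SK_MEMINFO_RCVBUF]);
 */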
2112 * Initialize an sk_lock.
2114 * (We also register the sk_lock with the lock validator.)
2116 static inline void sock_lock_init(struct sock
*sk
)
2118 if (sk
->sk_kern_sock
)
2119 sock_lock_init_class_and_name(
2121 af_family_kern_slock_key_strings
[sk
->sk_family
],
2122 af_family_kern_slock_keys
+ sk
->sk_family
,
2123 af_family_kern_key_strings
[sk
->sk_family
],
2124 af_family_kern_keys
+ sk
->sk_family
);
2126 sock_lock_init_class_and_name(
2128 af_family_slock_key_strings
[sk
->sk_family
],
2129 af_family_slock_keys
+ sk
->sk_family
,
2130 af_family_key_strings
[sk
->sk_family
],
2131 af_family_keys
+ sk
->sk_family
);
2135 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2136 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2137 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2139 static void sock_copy(struct sock
*nsk
, const struct sock
*osk
)
2141 const struct proto
*prot
= READ_ONCE(osk
->sk_prot
);
2142 #ifdef CONFIG_SECURITY_NETWORK
2143 void *sptr
= nsk
->sk_security
;
2146 /* If we move sk_tx_queue_mapping out of the private section,
2147 * we must check if sk_tx_queue_clear() is called after
2148 * sock_copy() in sk_clone_lock().
2150 BUILD_BUG_ON(offsetof(struct sock
, sk_tx_queue_mapping
) <
2151 offsetof(struct sock
, sk_dontcopy_begin
) ||
2152 offsetof(struct sock
, sk_tx_queue_mapping
) >=
2153 offsetof(struct sock
, sk_dontcopy_end
));
2155 memcpy(nsk
, osk
, offsetof(struct sock
, sk_dontcopy_begin
));
2157 unsafe_memcpy(&nsk
->sk_dontcopy_end
, &osk
->sk_dontcopy_end
,
2158 prot
->obj_size
- offsetof(struct sock
, sk_dontcopy_end
),
2159 /* alloc is larger than struct, see sk_prot_alloc() */);
2161 #ifdef CONFIG_SECURITY_NETWORK
2162 nsk
->sk_security
= sptr
;
2163 security_sk_clone(osk
, nsk
);
2167 static struct sock
*sk_prot_alloc(struct proto
*prot
, gfp_t priority
,
2171 struct kmem_cache
*slab
;
2175 sk
= kmem_cache_alloc(slab
, priority
& ~__GFP_ZERO
);
2178 if (want_init_on_alloc(priority
))
2179 sk_prot_clear_nulls(sk
, prot
->obj_size
);
2181 sk
= kmalloc(prot
->obj_size
, priority
);
2184 if (security_sk_alloc(sk
, family
, priority
))
2187 if (!try_module_get(prot
->owner
))
2194 security_sk_free(sk
);
2197 kmem_cache_free(slab
, sk
);
2203 static void sk_prot_free(struct proto
*prot
, struct sock
*sk
)
2205 struct kmem_cache
*slab
;
2206 struct module
*owner
;
2208 owner
= prot
->owner
;
2211 cgroup_sk_free(&sk
->sk_cgrp_data
);
2212 mem_cgroup_sk_free(sk
);
2213 security_sk_free(sk
);
2215 kmem_cache_free(slab
, sk
);
2222 * sk_alloc - All socket objects are allocated here
2223 * @net: the applicable net namespace
2224 * @family: protocol family
2225 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2226 * @prot: struct proto associated with this new sock instance
2227 * @kern: is this to be a kernel socket?
2229 struct sock
*sk_alloc(struct net
*net
, int family
, gfp_t priority
,
2230 struct proto
*prot
, int kern
)
2234 sk
= sk_prot_alloc(prot
, priority
| __GFP_ZERO
, family
);
2236 sk
->sk_family
= family
;
2238 * See comment in struct sock definition to understand
2239 * why we need sk_prot_creator -acme
2241 sk
->sk_prot
= sk
->sk_prot_creator
= prot
;
2242 sk
->sk_kern_sock
= kern
;
2244 sk
->sk_net_refcnt
= kern
? 0 : 1;
2245 if (likely(sk
->sk_net_refcnt
)) {
2246 get_net_track(net
, &sk
->ns_tracker
, priority
);
2247 sock_inuse_add(net
, 1);
2249 __netns_tracker_alloc(net
, &sk
->ns_tracker
,
2253 sock_net_set(sk
, net
);
2254 refcount_set(&sk
->sk_wmem_alloc
, 1);
2256 mem_cgroup_sk_alloc(sk
);
2257 cgroup_sk_alloc(&sk
->sk_cgrp_data
);
2258 sock_update_classid(&sk
->sk_cgrp_data
);
2259 sock_update_netprioidx(&sk
->sk_cgrp_data
);
2260 sk_tx_queue_clear(sk
);
2265 EXPORT_SYMBOL(sk_alloc
);
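
/*
 * Illustrative sketch, not part of the original file: a protocol family's
 * create() handler typically pairs sk_alloc() with sock_init_data();
 * PF_EXAMPLE and example_proto below are hypothetical placeholders.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */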
2267 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2268 * grace period. This is the case for UDP sockets and TCP listeners.
2270 static void __sk_destruct(struct rcu_head
*head
)
2272 struct sock
*sk
= container_of(head
, struct sock
, sk_rcu
);
2273 struct sk_filter
*filter
;
2275 if (sk
->sk_destruct
)
2276 sk
->sk_destruct(sk
);
2278 filter
= rcu_dereference_check(sk
->sk_filter
,
2279 refcount_read(&sk
->sk_wmem_alloc
) == 0);
2281 sk_filter_uncharge(sk
, filter
);
2282 RCU_INIT_POINTER(sk
->sk_filter
, NULL
);
2285 sock_disable_timestamp(sk
, SK_FLAGS_TIMESTAMP
);
2287 #ifdef CONFIG_BPF_SYSCALL
2288 bpf_sk_storage_free(sk
);
2291 if (atomic_read(&sk
->sk_omem_alloc
))
2292 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2293 __func__
, atomic_read(&sk
->sk_omem_alloc
));
2295 if (sk
->sk_frag
.page
) {
2296 put_page(sk
->sk_frag
.page
);
2297 sk
->sk_frag
.page
= NULL
;
2300 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2301 put_cred(sk
->sk_peer_cred
);
2302 put_pid(sk
->sk_peer_pid
);
2304 if (likely(sk
->sk_net_refcnt
))
2305 put_net_track(sock_net(sk
), &sk
->ns_tracker
);
2307 __netns_tracker_free(sock_net(sk
), &sk
->ns_tracker
, false);
2309 sk_prot_free(sk
->sk_prot_creator
, sk
);
2312 void sk_destruct(struct sock
*sk
)
2314 bool use_call_rcu
= sock_flag(sk
, SOCK_RCU_FREE
);
2316 if (rcu_access_pointer(sk
->sk_reuseport_cb
)) {
2317 reuseport_detach_sock(sk
);
2318 use_call_rcu
= true;
2322 call_rcu(&sk
->sk_rcu
, __sk_destruct
);
2324 __sk_destruct(&sk
->sk_rcu
);
2327 static void __sk_free(struct sock
*sk
)
2329 if (likely(sk
->sk_net_refcnt
))
2330 sock_inuse_add(sock_net(sk
), -1);
2332 if (unlikely(sk
->sk_net_refcnt
&& sock_diag_has_destroy_listeners(sk
)))
2333 sock_diag_broadcast_destroy(sk
);
2338 void sk_free(struct sock
*sk
)
2341 * We subtract one from sk_wmem_alloc and can know if
2342 * some packets are still in some tx queue.
2343 * If not null, sock_wfree() will call __sk_free(sk) later
2345 if (refcount_dec_and_test(&sk
->sk_wmem_alloc
))
2348 EXPORT_SYMBOL(sk_free
);
2350 static void sk_init_common(struct sock
*sk
)
2352 skb_queue_head_init(&sk
->sk_receive_queue
);
2353 skb_queue_head_init(&sk
->sk_write_queue
);
2354 skb_queue_head_init(&sk
->sk_error_queue
);
2356 rwlock_init(&sk
->sk_callback_lock
);
2357 lockdep_set_class_and_name(&sk
->sk_receive_queue
.lock
,
2358 af_rlock_keys
+ sk
->sk_family
,
2359 af_family_rlock_key_strings
[sk
->sk_family
]);
2360 lockdep_set_class_and_name(&sk
->sk_write_queue
.lock
,
2361 af_wlock_keys
+ sk
->sk_family
,
2362 af_family_wlock_key_strings
[sk
->sk_family
]);
2363 lockdep_set_class_and_name(&sk
->sk_error_queue
.lock
,
2364 af_elock_keys
+ sk
->sk_family
,
2365 af_family_elock_key_strings
[sk
->sk_family
]);
2366 if (sk
->sk_kern_sock
)
2367 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2368 af_kern_callback_keys
+ sk
->sk_family
,
2369 af_family_kern_clock_key_strings
[sk
->sk_family
]);
2371 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2372 af_callback_keys
+ sk
->sk_family
,
2373 af_family_clock_key_strings
[sk
->sk_family
]);
2377 * sk_clone_lock - clone a socket, and lock its clone
2378 * @sk: the socket to clone
2379 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2381 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2383 struct sock
*sk_clone_lock(const struct sock
*sk
, const gfp_t priority
)
2385 struct proto
*prot
= READ_ONCE(sk
->sk_prot
);
2386 struct sk_filter
*filter
;
2387 bool is_charged
= true;
2390 newsk
= sk_prot_alloc(prot
, priority
, sk
->sk_family
);
2394 sock_copy(newsk
, sk
);
2396 newsk
->sk_prot_creator
= prot
;
2399 if (likely(newsk
->sk_net_refcnt
)) {
2400 get_net_track(sock_net(newsk
), &newsk
->ns_tracker
, priority
);
2401 sock_inuse_add(sock_net(newsk
), 1);
2403 /* Kernel sockets are not elevating the struct net refcount.
2404 * Instead, use a tracker to more easily detect if a layer
2405 * is not properly dismantling its kernel sockets at netns
2408 __netns_tracker_alloc(sock_net(newsk
), &newsk
->ns_tracker
,
2411 sk_node_init(&newsk
->sk_node
);
2412 sock_lock_init(newsk
);
2413 bh_lock_sock(newsk
);
2414 newsk
->sk_backlog
.head
= newsk
->sk_backlog
.tail
= NULL
;
2415 newsk
->sk_backlog
.len
= 0;
2417 atomic_set(&newsk
->sk_rmem_alloc
, 0);
2419 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2420 refcount_set(&newsk
->sk_wmem_alloc
, 1);
2422 atomic_set(&newsk
->sk_omem_alloc
, 0);
2423 sk_init_common(newsk
);
2425 newsk
->sk_dst_cache
= NULL
;
2426 newsk
->sk_dst_pending_confirm
= 0;
2427 newsk
->sk_wmem_queued
= 0;
2428 newsk
->sk_forward_alloc
= 0;
2429 newsk
->sk_reserved_mem
= 0;
2430 atomic_set(&newsk
->sk_drops
, 0);
2431 newsk
->sk_send_head
= NULL
;
2432 newsk
->sk_userlocks
= sk
->sk_userlocks
& ~SOCK_BINDPORT_LOCK
;
2433 atomic_set(&newsk
->sk_zckey
, 0);
2435 sock_reset_flag(newsk
, SOCK_DONE
);
2437 /* sk->sk_memcg will be populated at accept() time */
2438 newsk
->sk_memcg
= NULL
;
2440 cgroup_sk_clone(&newsk
->sk_cgrp_data
);
2443 filter
= rcu_dereference(sk
->sk_filter
);
2445 /* though it's an empty new sock, the charging may fail
2446 * if sysctl_optmem_max was changed between creation of
2447 * original socket and cloning
2449 is_charged
= sk_filter_charge(newsk
, filter
);
2450 RCU_INIT_POINTER(newsk
->sk_filter
, filter
);
2453 if (unlikely(!is_charged
|| xfrm_sk_clone_policy(newsk
, sk
))) {
2454 /* We need to make sure that we don't uncharge the new
2455 * socket if we couldn't charge it in the first place
2456 * as otherwise we uncharge the parent's filter.
2459 RCU_INIT_POINTER(newsk
->sk_filter
, NULL
);
2460 sk_free_unlock_clone(newsk
);
2464 RCU_INIT_POINTER(newsk
->sk_reuseport_cb
, NULL
);
2466 if (bpf_sk_storage_clone(sk
, newsk
)) {
2467 sk_free_unlock_clone(newsk
);
2472 /* Clear sk_user_data if parent had the pointer tagged
2473 * as not suitable for copying when cloning.
2475 if (sk_user_data_is_nocopy(newsk
))
2476 newsk
->sk_user_data
= NULL
;
2479 newsk
->sk_err_soft
= 0;
2480 newsk
->sk_priority
= 0;
2481 newsk
->sk_incoming_cpu
= raw_smp_processor_id();
2483 /* Before updating sk_refcnt, we must commit prior changes to memory
2484 * (Documentation/RCU/rculist_nulls.rst for details)
2487 refcount_set(&newsk
->sk_refcnt
, 2);
2489 sk_set_socket(newsk
, NULL
);
2490 sk_tx_queue_clear(newsk
);
2491 RCU_INIT_POINTER(newsk
->sk_wq
, NULL
);
2493 if (newsk
->sk_prot
->sockets_allocated
)
2494 sk_sockets_allocated_inc(newsk
);
2496 if (sock_needs_netstamp(sk
) && newsk
->sk_flags
& SK_FLAGS_TIMESTAMP
)
2497 net_enable_timestamp();
2501 EXPORT_SYMBOL_GPL(sk_clone_lock
);
2503 void sk_free_unlock_clone(struct sock
*sk
)
2505 /* It is still raw copy of parent, so invalidate
2506 * destructor and make plain sk_free() */
2507 sk
->sk_destruct
= NULL
;
2511 EXPORT_SYMBOL_GPL(sk_free_unlock_clone
);
2513 static u32
sk_dst_gso_max_size(struct sock
*sk
, struct dst_entry
*dst
)
2515 bool is_ipv6
= false;
2518 #if IS_ENABLED(CONFIG_IPV6)
2519 is_ipv6
= (sk
->sk_family
== AF_INET6
&&
2520 !ipv6_addr_v4mapped(&sk
->sk_v6_rcv_saddr
));
2522 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2523 max_size
= is_ipv6
? READ_ONCE(dst
->dev
->gso_max_size
) :
2524 READ_ONCE(dst
->dev
->gso_ipv4_max_size
);
2525 if (max_size
> GSO_LEGACY_MAX_SIZE
&& !sk_is_tcp(sk
))
2526 max_size
= GSO_LEGACY_MAX_SIZE
;
2528 return max_size
- (MAX_TCP_HEADER
+ 1);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk->sk_route_caps = dst->dev->features;
	sk->sk_route_caps |= NETIF_F_GSO;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (unlikely(sk->sk_gso_disabled))
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
	sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
 * Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);

	if (unlikely(!sk_fullsock(sk)))
		return skb_set_owner_edemux(skb, sk);

	skb->sk = sk;
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
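
/*
 * Usage sketch (hypothetical, for illustration only): a protocol's TX path
 * typically pairs alloc_skb() with skb_set_owner_w() so that the write
 * buffer charge taken here is released later by sock_wfree(). The helper
 * name example_alloc_tx_skb() is not a kernel API.
 */
#if 0
static struct sk_buff *example_alloc_tx_skb(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (!skb)
		return NULL;
	/* accounts skb->truesize in sk->sk_wmem_alloc; sock_wfree() undoes it */
	skb_set_owner_w(skb, sk);
	return skb;
}
#endif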
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb_is_decrypted(skb))
		return false;

	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);
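
/*
 * Receive-side usage sketch (hypothetical): skb_set_owner_r() is the
 * counterpart of skb_set_owner_w(); it charges skb->truesize to
 * sk_rmem_alloc and installs sock_rfree() as the destructor. The helper
 * name example_queue_rx_skb() is illustrative only.
 */
#if 0
static int example_queue_rx_skb(struct sock *sk, struct sk_buff *skb)
{
	/* refuse the skb if it would overflow the receive buffer */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
	    (unsigned int)READ_ONCE(sk->sk_rcvbuf))
		return -ENOMEM;

	/* charge skb->truesize to sk_rmem_alloc; sock_rfree() uncharges */
	skb_set_owner_r(skb, sk);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}
#endif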
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
		inet_reqsk(sk)->rsk_listener = NULL;
		reqsk_free(inet_reqsk(sk));
		return;
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long __sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force ||
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

	if ((unsigned int)size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
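
/*
 * Usage sketch (hypothetical): option memory is always allocated and freed
 * with matching sizes so sk_omem_alloc stays balanced; sock_kzfree_s() is
 * preferred when the buffer held sensitive data. example_set_secret_opt()
 * is an illustrative name, not an existing handler.
 */
#if 0
static int example_set_secret_opt(struct sock *sk, sockptr_t optval, int optlen)
{
	void *buf;

	buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
	if (!buf)
		return -ENOBUFS;
	if (copy_from_sockptr(buf, optval, optlen)) {
		sock_kzfree_s(sk, buf, optlen);	/* zero before freeing */
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kzfree_s(sk, buf, optlen);
	return 0;
}
#endif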
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
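
/*
 * Usage sketch (hypothetical): a datagram protocol would typically ask for
 * a linear skb with headroom for its headers and no paged data. The
 * MAX_HEADER headroom and the example_dgram_alloc() name are illustrative
 * assumptions, not taken from any existing caller.
 */
#if 0
static struct sk_buff *example_dgram_alloc(struct sock *sk, size_t len,
					   int noblock, int *err)
{
	/* linear payload only, order-0 pages, blocks up to sk_sndtimeo */
	return sock_alloc_send_pskb(sk, len + MAX_HEADER, 0, noblock, err, 0);
}
#endif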
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	case SCM_TS_OPT_ID:
		tsflags = READ_ONCE(sk->sk_tsflags);
		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	case SO_PRIORITY:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
			return -EPERM;
		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
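
/*
 * Usage sketch (hypothetical): a sendmsg() implementation seeds a
 * sockcm_cookie from the socket defaults with sockcm_init() and then lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages.
 * example_parse_cmsgs() is an illustrative name only.
 */
#if 0
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg)
{
	struct sockcm_cookie sockc;
	int err;

	sockcm_init(&sockc, sk);	/* mark, priority, tsflags defaults */
	err = sock_cmsg_send(sk, msg, &sockc);
	if (err)
		return err;
	/* sockc.mark, sockc.tsflags, sockc.transmit_time are now usable */
	return 0;
}
#endif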
static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
				     tcp_leave_memory_pressure, sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
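
/*
 * Usage sketch (hypothetical): copy user data into the per-socket page
 * fragment after refilling it, as stream protocols do when building paged
 * skbs. example_append_from_iter() is an illustrative helper, not an
 * existing kernel function.
 */
#if 0
static int example_append_from_iter(struct sock *sk, struct iov_iter *from,
				    int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* caller should wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}
#endif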
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
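
/*
 * Usage sketch (hypothetical): a blocking recvmsg() loop under lock_sock()
 * waits with sk_wait_data() until something is queued. The helper name
 * example_wait_for_skb() and the exact error handling are illustrative
 * assumptions; real protocols add their own state checks.
 */
#if 0
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags, int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	/* caller must hold the socket lock (lock_sock()) */
	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}
#endif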
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT stand when the usage exceeds the global or
	 * memcg hard limit, or else a DoS attack could be mounted by
	 * spawning lots of sockets whose usage stays under the minimum
	 * buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones below average usage
		 * to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
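
/*
 * Usage sketch (hypothetical): protocols rarely call __sk_mem_schedule()
 * directly; they go through wrappers such as sk_rmem_schedule(), which
 * falls back to __sk_mem_schedule() when sk_forward_alloc is too small.
 * example_charge_rmem() is an illustrative name only.
 */
#if 0
static int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	/* make sure forward_alloc covers skb->truesize, raising
	 * memory_allocated through __sk_mem_schedule() if needed
	 */
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* also calls sk_mem_charge() */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}
#endif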
/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_global_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}

/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= PAGE_SHIFT;
	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * sk_wmem_alloc.
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		struct socket_wq *wq = rcu_dereference(sk->sk_wq);

		/* rely on refcount_sub from sock_wfree() */
		smp_mb__after_atomic();
		if (wq && waitqueue_active(&wq->wait))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
}
static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(sk->sk_socket->file))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
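
/*
 * Usage sketch (hypothetical): a protocol arms sk->sk_timer with
 * sk_reset_timer(), which holds a socket reference that the timer handler
 * drops once it is done. example_timer_handler()/example_arm_timer() are
 * illustrative names; real protocols set their own callback at init time.
 */
#if 0
static void example_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	bh_lock_sock(sk);
	/* ... protocol-specific timeout work ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* drop the reference taken by sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	/* takes a reference on sk unless the timer was already pending */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}
#endif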
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id	=	0;
	sk->sk_ll_usec	=	READ_ONCE(sysctl_net_busy_read);
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
void sock_init_data(struct socket *sock, struct sock *sk)
{
	kuid_t uid = sock ?
		SOCK_INODE(sock)->i_uid :
		make_kuid(sock_net(sk)->user_ns, 0);

	sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);
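
/*
 * Usage sketch (hypothetical): an address family's create() hook allocates
 * the sock with sk_alloc() and then hands it to sock_init_data() to set up
 * buffers, default callbacks and the initial refcount. example_create()
 * and example_proto (a struct proto registered elsewhere) are illustrative.
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* buffers, callbacks, refcount = 1 */
	sk->sk_protocol = protocol;
	return 0;
}
#endif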
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
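
/*
 * Usage sketch (hypothetical): process-context code brackets socket state
 * changes with lock_sock()/release_sock(); release_sock() also processes
 * any backlog queued by softirq while the lock was owned.
 * example_setsockopt_locked() is an illustrative name only.
 */
#if 0
static int example_setsockopt_locked(struct sock *sk, int val)
{
	lock_sock(sk);		/* may sleep if another context owns the sock */
	WRITE_ONCE(sk->sk_priority, val);
	release_sock(sk);	/* runs the backlog and release_cb, then wakes waiters */
	return 0;
}
#endif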
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers to
		 * proceed so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what is the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and will be purged
	 * by the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
#ifdef CONFIG_PROC_FS
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;
	return 0;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
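
/*
 * Usage sketch (hypothetical): the minimum a module needs to register a
 * protocol with the core is a struct proto with a name, owner and object
 * size. example_proto and the init/exit functions are illustrative only.
 */
#if 0
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	/* second argument != 0 asks proto_register() to create the slab */
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}
#endif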
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
/* Copy 'size' bytes from userspace and return `size` back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace as input argument.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl), and copies back the result to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If ioctl was processed, returns its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);