Linux 4.19.133
[linux/fpc-iii.git] / net/core/sock.c
blob: 8721264a2b39440bc8d1625bb6a4ef3f47ed09af
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
123 #include <linux/uaccess.h>
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
141 #include <trace/events/sock.h>
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
149 static void sock_inuse_add(struct net *net, int val);
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
157 * Test to see if the opener of the socket had the capability @cap when
158 * the socket was created and if the current process has the capability
159 * @cap in the user namespace @user_ns.
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
167 EXPORT_SYMBOL(sk_ns_capable);
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
174 * Test to see if the opener of the socket had the capability @cap when
175 * the socket was created and if the current process has the capability
176 * @cap in all user namespaces.
178 bool sk_capable(const struct sock *sk, int cap)
180 return sk_ns_capable(sk, &init_user_ns, cap);
182 EXPORT_SYMBOL(sk_capable);
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
189 * Test to see if the opener of the socket had the capability @cap when the
190 * socket was created and if the current process has the capability @cap over
191 * the network namespace the socket is a member of.
193 bool sk_net_capable(const struct sock *sk, int cap)
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197 EXPORT_SYMBOL(sk_net_capable);
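
/* Editor's illustrative sketch (not part of the original file): how a
 * protocol handler might use the capability helpers above.  The function
 * name my_proto_set_priv_option() is hypothetical.
 */
static int my_proto_set_priv_option(struct sock *sk, int val)
{
	/* require CAP_NET_ADMIN relative to the socket's network namespace */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;
	/* ... apply the privileged option using val ... */
	return 0;
}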
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MAX"
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 _sock_locks("sk_lock-")
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 _sock_locks("slock-")
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 _sock_locks("clock-")
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 _sock_locks("k-sk_lock-")
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-slock-")
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 _sock_locks("k-clock-")
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 _sock_locks("rlock-")
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 _sock_locks("wlock-")
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 _sock_locks("elock-")
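
/* Editor's note: for reference, _sock_locks("sk_lock-") above expands to the
 * string list "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", i.e. one lockdep class name per address family (the
 * unassigned slots 27 and 28 become "sk_lock-27" and "sk_lock-28").
 */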
263 * sk_callback_lock and sk queues locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
284 int sysctl_tstamp_allow_data __read_mostly = 1;
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 * sk_set_memalloc - sets %SOCK_MEMALLOC
291 * @sk: socket to set it on
293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294 * It's the responsibility of the admin to adjust min_free_kbytes
295 * to meet the requirements
297 void sk_set_memalloc(struct sock *sk)
299 sock_set_flag(sk, SOCK_MEMALLOC);
300 sk->sk_allocation |= __GFP_MEMALLOC;
301 static_branch_inc(&memalloc_socks_key);
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305 void sk_clear_memalloc(struct sock *sk)
307 sock_reset_flag(sk, SOCK_MEMALLOC);
308 sk->sk_allocation &= ~__GFP_MEMALLOC;
309 static_branch_dec(&memalloc_socks_key);
312 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 * it has rmem allocations due to the last swapfile being deactivated
315 * but there is a risk that the socket is unusable due to exceeding
316 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 sk_mem_reclaim(sk);
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
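
/* Editor's illustrative sketch (hypothetical helper names): a swap-over-network
 * style backend would typically mark its transport socket with
 * sk_set_memalloc() while a swapfile is active and clear it afterwards.
 */
static void my_swap_backend_enable(struct sock *transport_sk)
{
	sk_set_memalloc(transport_sk);	/* may now dip into memory reserves */
}

static void my_swap_backend_disable(struct sock *transport_sk)
{
	sk_clear_memalloc(transport_sk); /* also reclaims rmem, see above */
}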
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 int ret;
325 unsigned int noreclaim_flag;
327 /* these should have been dropped before queueing */
328 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330 noreclaim_flag = memalloc_noreclaim_save();
331 ret = sk->sk_backlog_rcv(sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
334 return ret;
336 EXPORT_SYMBOL(__sk_backlog_rcv);
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
340 struct timeval tv;
342 if (optlen < sizeof(tv))
343 return -EINVAL;
344 if (copy_from_user(&tv, optval, sizeof(tv)))
345 return -EFAULT;
346 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 return -EDOM;
349 if (tv.tv_sec < 0) {
350 static int warned __read_mostly;
352 *timeo_p = 0;
353 if (warned < 10 && net_ratelimit()) {
354 warned++;
355 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 __func__, current->comm, task_pid_nr(current));
358 return 0;
360 *timeo_p = MAX_SCHEDULE_TIMEOUT;
361 if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 return 0;
363 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 return 0;
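
/* Editor's worked example for the conversion above, assuming HZ == 250:
 * a user passes struct timeval { .tv_sec = 2, .tv_usec = 500000 }.
 * USEC_PER_SEC / HZ == 4000, so
 *   *timeo_p = 2 * 250 + DIV_ROUND_UP(500000, 4000) = 500 + 125 = 625 jiffies,
 * i.e. 2.5 seconds.  tv_sec == tv_usec == 0 keeps MAX_SCHEDULE_TIMEOUT
 * (wait forever), and a negative tv_sec is clamped to 0 (do not wait).
 */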
368 static void sock_warn_obsolete_bsdism(const char *name)
370 static int warned;
371 static char warncomm[TASK_COMM_LEN];
372 if (strcmp(warncomm, current->comm) && warned < 5) {
373 strcpy(warncomm, current->comm);
374 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 warncomm, name);
376 warned++;
380 static bool sock_needs_netstamp(const struct sock *sk)
382 switch (sk->sk_family) {
383 case AF_UNSPEC:
384 case AF_UNIX:
385 return false;
386 default:
387 return true;
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
393 if (sk->sk_flags & flags) {
394 sk->sk_flags &= ~flags;
395 if (sock_needs_netstamp(sk) &&
396 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 net_disable_timestamp();
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
404 unsigned long flags;
405 struct sk_buff_head *list = &sk->sk_receive_queue;
407 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 atomic_inc(&sk->sk_drops);
409 trace_sock_rcvqueue_full(sk, skb);
410 return -ENOMEM;
413 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 atomic_inc(&sk->sk_drops);
415 return -ENOBUFS;
418 skb->dev = NULL;
419 skb_set_owner_r(skb, sk);
421 /* we escape from the RCU-protected region, make sure we don't leak
422 * a non-refcounted dst
424 skb_dst_force(skb);
426 spin_lock_irqsave(&list->lock, flags);
427 sock_skb_set_dropcount(sk, skb);
428 __skb_queue_tail(list, skb);
429 spin_unlock_irqrestore(&list->lock, flags);
431 if (!sock_flag(sk, SOCK_DEAD))
432 sk->sk_data_ready(sk);
433 return 0;
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 int err;
441 err = sk_filter(sk, skb);
442 if (err)
443 return err;
445 return __sock_queue_rcv_skb(sk, skb);
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
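
/* Editor's illustrative sketch (hypothetical protocol): typical receive-path
 * usage of sock_queue_rcv_skb().  On error the caller still owns the skb and
 * must free it.
 */
static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}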
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 const int nested, unsigned int trim_cap, bool refcounted)
452 int rc = NET_RX_SUCCESS;
454 if (sk_filter_trim_cap(sk, skb, trim_cap))
455 goto discard_and_relse;
457 skb->dev = NULL;
459 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 atomic_inc(&sk->sk_drops);
461 goto discard_and_relse;
463 if (nested)
464 bh_lock_sock_nested(sk);
465 else
466 bh_lock_sock(sk);
467 if (!sock_owned_by_user(sk)) {
469 * trylock + unlock semantics:
471 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
473 rc = sk_backlog_rcv(sk, skb);
475 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 bh_unlock_sock(sk);
478 atomic_inc(&sk->sk_drops);
479 goto discard_and_relse;
482 bh_unlock_sock(sk);
483 out:
484 if (refcounted)
485 sock_put(sk);
486 return rc;
487 discard_and_relse:
488 kfree_skb(skb);
489 goto out;
491 EXPORT_SYMBOL(__sk_receive_skb);
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
495 struct dst_entry *dst = __sk_dst_get(sk);
497 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 sk_tx_queue_clear(sk);
499 sk->sk_dst_pending_confirm = 0;
500 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 dst_release(dst);
502 return NULL;
505 return dst;
507 EXPORT_SYMBOL(__sk_dst_check);
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
511 struct dst_entry *dst = sk_dst_get(sk);
513 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 sk_dst_reset(sk);
515 dst_release(dst);
516 return NULL;
519 return dst;
521 EXPORT_SYMBOL(sk_dst_check);
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 int optlen)
526 int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 struct net *net = sock_net(sk);
529 char devname[IFNAMSIZ];
530 int index;
532 /* Sorry... */
533 ret = -EPERM;
534 if (!ns_capable(net->user_ns, CAP_NET_RAW))
535 goto out;
537 ret = -EINVAL;
538 if (optlen < 0)
539 goto out;
541 /* Bind this socket to a particular device like "eth0",
542 * as specified in the passed interface name. If the
543 * name is "" or the option length is zero the socket
544 * is not bound.
546 if (optlen > IFNAMSIZ - 1)
547 optlen = IFNAMSIZ - 1;
548 memset(devname, 0, sizeof(devname));
550 ret = -EFAULT;
551 if (copy_from_user(devname, optval, optlen))
552 goto out;
554 index = 0;
555 if (devname[0] != '\0') {
556 struct net_device *dev;
558 rcu_read_lock();
559 dev = dev_get_by_name_rcu(net, devname);
560 if (dev)
561 index = dev->ifindex;
562 rcu_read_unlock();
563 ret = -ENODEV;
564 if (!dev)
565 goto out;
568 lock_sock(sk);
569 sk->sk_bound_dev_if = index;
570 sk_dst_reset(sk);
571 release_sock(sk);
573 ret = 0;
575 out:
576 #endif
578 return ret;
581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
582 int __user *optlen, int len)
584 int ret = -ENOPROTOOPT;
585 #ifdef CONFIG_NETDEVICES
586 struct net *net = sock_net(sk);
587 char devname[IFNAMSIZ];
589 if (sk->sk_bound_dev_if == 0) {
590 len = 0;
591 goto zero;
594 ret = -EINVAL;
595 if (len < IFNAMSIZ)
596 goto out;
598 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
599 if (ret)
600 goto out;
602 len = strlen(devname) + 1;
604 ret = -EFAULT;
605 if (copy_to_user(optval, devname, len))
606 goto out;
608 zero:
609 ret = -EFAULT;
610 if (put_user(len, optlen))
611 goto out;
613 ret = 0;
615 out:
616 #endif
618 return ret;
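
/* Editor's illustrative userspace usage of SO_BINDTODEVICE (not part of the
 * original file).  The interface name "eth0" is just an example; the set
 * operation needs CAP_NET_RAW.
 *
 *	char ifname[IFNAMSIZ] = "eth0";
 *	socklen_t len = sizeof(ifname);
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");
 *
 *	// read it back; an unbound socket is reported with len == 0
 *	if (getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, &len) == 0)
 *		printf("bound to %.*s\n", (int)len, ifname);
 */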
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
623 if (valbool)
624 sock_set_flag(sk, bit);
625 else
626 sock_reset_flag(sk, bit);
629 bool sk_mc_loop(struct sock *sk)
631 if (dev_recursion_level())
632 return false;
633 if (!sk)
634 return true;
635 switch (sk->sk_family) {
636 case AF_INET:
637 return inet_sk(sk)->mc_loop;
638 #if IS_ENABLED(CONFIG_IPV6)
639 case AF_INET6:
640 return inet6_sk(sk)->mc_loop;
641 #endif
643 WARN_ON_ONCE(1);
644 return true;
646 EXPORT_SYMBOL(sk_mc_loop);
649 * This is meant for all protocols to use and covers goings on
650 * at the socket level. Everything here is generic.
653 int sock_setsockopt(struct socket *sock, int level, int optname,
654 char __user *optval, unsigned int optlen)
656 struct sock_txtime sk_txtime;
657 struct sock *sk = sock->sk;
658 int val;
659 int valbool;
660 struct linger ling;
661 int ret = 0;
664 * Options without arguments
667 if (optname == SO_BINDTODEVICE)
668 return sock_setbindtodevice(sk, optval, optlen);
670 if (optlen < sizeof(int))
671 return -EINVAL;
673 if (get_user(val, (int __user *)optval))
674 return -EFAULT;
676 valbool = val ? 1 : 0;
678 lock_sock(sk);
680 switch (optname) {
681 case SO_DEBUG:
682 if (val && !capable(CAP_NET_ADMIN))
683 ret = -EACCES;
684 else
685 sock_valbool_flag(sk, SOCK_DBG, valbool);
686 break;
687 case SO_REUSEADDR:
688 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
689 break;
690 case SO_REUSEPORT:
691 sk->sk_reuseport = valbool;
692 break;
693 case SO_TYPE:
694 case SO_PROTOCOL:
695 case SO_DOMAIN:
696 case SO_ERROR:
697 ret = -ENOPROTOOPT;
698 break;
699 case SO_DONTROUTE:
700 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
701 sk_dst_reset(sk);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 /* Don't error on this; BSD doesn't, and if you think
708 * about it, this is right. Otherwise apps have to
709 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 * are treated in BSD as hints.
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
725 goto set_sndbuf;
727 case SO_RCVBUF:
728 /* Don't error on this; BSD doesn't, and if you think
729 * about it, this is right. Otherwise apps have to
730 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 * are treated in BSD as hints.
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
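
	/* Editor's worked example for the doubling above: a user call
	 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){65536}, sizeof(int))
	 * stores sk_rcvbuf = 131072 (provided sysctl net.core.rmem_max allows
	 * 65536), and a subsequent getsockopt(SO_RCVBUF) reports 131072.
	 */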
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
759 goto set_rcvbuf;
761 case SO_KEEPALIVE:
762 if (sk->sk_prot->keepalive)
763 sk->sk_prot->keepalive(sk, valbool);
764 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
765 break;
767 case SO_OOBINLINE:
768 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
769 break;
771 case SO_NO_CHECK:
772 sk->sk_no_check_tx = valbool;
773 break;
775 case SO_PRIORITY:
776 if ((val >= 0 && val <= 6) ||
777 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
778 sk->sk_priority = val;
779 else
780 ret = -EPERM;
781 break;
783 case SO_LINGER:
784 if (optlen < sizeof(ling)) {
785 ret = -EINVAL; /* 1003.1g */
786 break;
788 if (copy_from_user(&ling, optval, sizeof(ling))) {
789 ret = -EFAULT;
790 break;
792 if (!ling.l_onoff)
793 sock_reset_flag(sk, SOCK_LINGER);
794 else {
795 #if (BITS_PER_LONG == 32)
796 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
797 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
798 else
799 #endif
800 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
801 sock_set_flag(sk, SOCK_LINGER);
803 break;
805 case SO_BSDCOMPAT:
806 sock_warn_obsolete_bsdism("setsockopt");
807 break;
809 case SO_PASSCRED:
810 if (valbool)
811 set_bit(SOCK_PASSCRED, &sock->flags);
812 else
813 clear_bit(SOCK_PASSCRED, &sock->flags);
814 break;
816 case SO_TIMESTAMP:
817 case SO_TIMESTAMPNS:
818 if (valbool) {
819 if (optname == SO_TIMESTAMP)
820 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 else
822 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
823 sock_set_flag(sk, SOCK_RCVTSTAMP);
824 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
825 } else {
826 sock_reset_flag(sk, SOCK_RCVTSTAMP);
827 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
829 break;
831 case SO_TIMESTAMPING:
832 if (val & ~SOF_TIMESTAMPING_MASK) {
833 ret = -EINVAL;
834 break;
837 if (val & SOF_TIMESTAMPING_OPT_ID &&
838 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
839 if (sk->sk_protocol == IPPROTO_TCP &&
840 sk->sk_type == SOCK_STREAM) {
841 if ((1 << sk->sk_state) &
842 (TCPF_CLOSE | TCPF_LISTEN)) {
843 ret = -EINVAL;
844 break;
846 sk->sk_tskey = tcp_sk(sk)->snd_una;
847 } else {
848 sk->sk_tskey = 0;
852 if (val & SOF_TIMESTAMPING_OPT_STATS &&
853 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
854 ret = -EINVAL;
855 break;
858 sk->sk_tsflags = val;
859 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
860 sock_enable_timestamp(sk,
861 SOCK_TIMESTAMPING_RX_SOFTWARE);
862 else
863 sock_disable_timestamp(sk,
864 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
865 break;
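
	/* Editor's illustrative userspace request for software TX and RX
	 * timestamps with per-packet IDs (flag names from linux/net_tstamp.h):
	 *
	 *	unsigned int f = SOF_TIMESTAMPING_TX_SOFTWARE |
	 *			 SOF_TIMESTAMPING_RX_SOFTWARE |
	 *			 SOF_TIMESTAMPING_SOFTWARE |
	 *			 SOF_TIMESTAMPING_OPT_ID;
	 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &f, sizeof(f));
	 */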
867 case SO_RCVLOWAT:
868 if (val < 0)
869 val = INT_MAX;
870 if (sock->ops->set_rcvlowat)
871 ret = sock->ops->set_rcvlowat(sk, val);
872 else
873 sk->sk_rcvlowat = val ? : 1;
874 break;
876 case SO_RCVTIMEO:
877 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
878 break;
880 case SO_SNDTIMEO:
881 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
882 break;
884 case SO_ATTACH_FILTER:
885 ret = -EINVAL;
886 if (optlen == sizeof(struct sock_fprog)) {
887 struct sock_fprog fprog;
889 ret = -EFAULT;
890 if (copy_from_user(&fprog, optval, sizeof(fprog)))
891 break;
893 ret = sk_attach_filter(&fprog, sk);
895 break;
897 case SO_ATTACH_BPF:
898 ret = -EINVAL;
899 if (optlen == sizeof(u32)) {
900 u32 ufd;
902 ret = -EFAULT;
903 if (copy_from_user(&ufd, optval, sizeof(ufd)))
904 break;
906 ret = sk_attach_bpf(ufd, sk);
908 break;
910 case SO_ATTACH_REUSEPORT_CBPF:
911 ret = -EINVAL;
912 if (optlen == sizeof(struct sock_fprog)) {
913 struct sock_fprog fprog;
915 ret = -EFAULT;
916 if (copy_from_user(&fprog, optval, sizeof(fprog)))
917 break;
919 ret = sk_reuseport_attach_filter(&fprog, sk);
921 break;
923 case SO_ATTACH_REUSEPORT_EBPF:
924 ret = -EINVAL;
925 if (optlen == sizeof(u32)) {
926 u32 ufd;
928 ret = -EFAULT;
929 if (copy_from_user(&ufd, optval, sizeof(ufd)))
930 break;
932 ret = sk_reuseport_attach_bpf(ufd, sk);
934 break;
936 case SO_DETACH_FILTER:
937 ret = sk_detach_filter(sk);
938 break;
940 case SO_LOCK_FILTER:
941 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
942 ret = -EPERM;
943 else
944 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
945 break;
947 case SO_PASSSEC:
948 if (valbool)
949 set_bit(SOCK_PASSSEC, &sock->flags);
950 else
951 clear_bit(SOCK_PASSSEC, &sock->flags);
952 break;
953 case SO_MARK:
954 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 ret = -EPERM;
956 else
957 sk->sk_mark = val;
958 break;
960 case SO_RXQ_OVFL:
961 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
962 break;
964 case SO_WIFI_STATUS:
965 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
966 break;
968 case SO_PEEK_OFF:
969 if (sock->ops->set_peek_off)
970 ret = sock->ops->set_peek_off(sk, val);
971 else
972 ret = -EOPNOTSUPP;
973 break;
975 case SO_NOFCS:
976 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
977 break;
979 case SO_SELECT_ERR_QUEUE:
980 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
981 break;
983 #ifdef CONFIG_NET_RX_BUSY_POLL
984 case SO_BUSY_POLL:
985 /* allow unprivileged users to decrease the value */
986 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
987 ret = -EPERM;
988 else {
989 if (val < 0)
990 ret = -EINVAL;
991 else
992 sk->sk_ll_usec = val;
994 break;
995 #endif
997 case SO_MAX_PACING_RATE:
998 if (val != ~0U)
999 cmpxchg(&sk->sk_pacing_status,
1000 SK_PACING_NONE,
1001 SK_PACING_NEEDED);
1002 sk->sk_max_pacing_rate = val;
1003 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 sk->sk_max_pacing_rate);
1005 break;
1007 case SO_INCOMING_CPU:
1008 WRITE_ONCE(sk->sk_incoming_cpu, val);
1009 break;
1011 case SO_CNX_ADVICE:
1012 if (val == 1)
1013 dst_negative_advice(sk);
1014 break;
1016 case SO_ZEROCOPY:
1017 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018 if (sk->sk_protocol != IPPROTO_TCP)
1019 ret = -ENOTSUPP;
1020 } else if (sk->sk_family != PF_RDS) {
1021 ret = -ENOTSUPP;
1023 if (!ret) {
1024 if (val < 0 || val > 1)
1025 ret = -EINVAL;
1026 else
1027 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1029 break;
1031 case SO_TXTIME:
1032 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033 ret = -EPERM;
1034 } else if (optlen != sizeof(struct sock_txtime)) {
1035 ret = -EINVAL;
1036 } else if (copy_from_user(&sk_txtime, optval,
1037 sizeof(struct sock_txtime))) {
1038 ret = -EFAULT;
1039 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040 ret = -EINVAL;
1041 } else {
1042 sock_valbool_flag(sk, SOCK_TXTIME, true);
1043 sk->sk_clockid = sk_txtime.clockid;
1044 sk->sk_txtime_deadline_mode =
1045 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046 sk->sk_txtime_report_errors =
1047 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1049 break;
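
	/* Editor's illustrative userspace use of SO_TXTIME (requires
	 * CAP_NET_ADMIN; struct sock_txtime comes from linux/net_tstamp.h):
	 *
	 *	struct sock_txtime txt = {
	 *		.clockid = CLOCK_TAI,
	 *		.flags	 = SOF_TXTIME_REPORT_ERRORS,
	 *	};
	 *	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &txt, sizeof(txt));
	 */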
1051 default:
1052 ret = -ENOPROTOOPT;
1053 break;
1055 release_sock(sk);
1056 return ret;
1058 EXPORT_SYMBOL(sock_setsockopt);
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062 struct ucred *ucred)
1064 ucred->pid = pid_vnr(pid);
1065 ucred->uid = ucred->gid = -1;
1066 if (cred) {
1067 struct user_namespace *current_ns = current_user_ns();
1069 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1076 struct user_namespace *user_ns = current_user_ns();
1077 int i;
1079 for (i = 0; i < src->ngroups; i++)
1080 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081 return -EFAULT;
1083 return 0;
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087 char __user *optval, int __user *optlen)
1089 struct sock *sk = sock->sk;
1091 union {
1092 int val;
1093 u64 val64;
1094 struct linger ling;
1095 struct timeval tm;
1096 struct sock_txtime txtime;
1097 } v;
1099 int lv = sizeof(int);
1100 int len;
1102 if (get_user(len, optlen))
1103 return -EFAULT;
1104 if (len < 0)
1105 return -EINVAL;
1107 memset(&v, 0, sizeof(v));
1109 switch (optname) {
1110 case SO_DEBUG:
1111 v.val = sock_flag(sk, SOCK_DBG);
1112 break;
1114 case SO_DONTROUTE:
1115 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116 break;
1118 case SO_BROADCAST:
1119 v.val = sock_flag(sk, SOCK_BROADCAST);
1120 break;
1122 case SO_SNDBUF:
1123 v.val = sk->sk_sndbuf;
1124 break;
1126 case SO_RCVBUF:
1127 v.val = sk->sk_rcvbuf;
1128 break;
1130 case SO_REUSEADDR:
1131 v.val = sk->sk_reuse;
1132 break;
1134 case SO_REUSEPORT:
1135 v.val = sk->sk_reuseport;
1136 break;
1138 case SO_KEEPALIVE:
1139 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140 break;
1142 case SO_TYPE:
1143 v.val = sk->sk_type;
1144 break;
1146 case SO_PROTOCOL:
1147 v.val = sk->sk_protocol;
1148 break;
1150 case SO_DOMAIN:
1151 v.val = sk->sk_family;
1152 break;
1154 case SO_ERROR:
1155 v.val = -sock_error(sk);
1156 if (v.val == 0)
1157 v.val = xchg(&sk->sk_err_soft, 0);
1158 break;
1160 case SO_OOBINLINE:
1161 v.val = sock_flag(sk, SOCK_URGINLINE);
1162 break;
1164 case SO_NO_CHECK:
1165 v.val = sk->sk_no_check_tx;
1166 break;
1168 case SO_PRIORITY:
1169 v.val = sk->sk_priority;
1170 break;
1172 case SO_LINGER:
1173 lv = sizeof(v.ling);
1174 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1175 v.ling.l_linger = sk->sk_lingertime / HZ;
1176 break;
1178 case SO_BSDCOMPAT:
1179 sock_warn_obsolete_bsdism("getsockopt");
1180 break;
1182 case SO_TIMESTAMP:
1183 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1185 break;
1187 case SO_TIMESTAMPNS:
1188 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189 break;
1191 case SO_TIMESTAMPING:
1192 v.val = sk->sk_tsflags;
1193 break;
1195 case SO_RCVTIMEO:
1196 lv = sizeof(struct timeval);
1197 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198 v.tm.tv_sec = 0;
1199 v.tm.tv_usec = 0;
1200 } else {
1201 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1204 break;
1206 case SO_SNDTIMEO:
1207 lv = sizeof(struct timeval);
1208 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 v.tm.tv_sec = 0;
1210 v.tm.tv_usec = 0;
1211 } else {
1212 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1215 break;
1217 case SO_RCVLOWAT:
1218 v.val = sk->sk_rcvlowat;
1219 break;
1221 case SO_SNDLOWAT:
1222 v.val = 1;
1223 break;
1225 case SO_PASSCRED:
1226 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227 break;
1229 case SO_PEERCRED:
1231 struct ucred peercred;
1232 if (len > sizeof(peercred))
1233 len = sizeof(peercred);
1234 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235 if (copy_to_user(optval, &peercred, len))
1236 return -EFAULT;
1237 goto lenout;
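
	/* Editor's illustrative userspace read of SO_PEERCRED on a connected
	 * AF_UNIX socket:
	 *
	 *	struct ucred uc;
	 *	socklen_t len = sizeof(uc);
	 *
	 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
	 *		printf("peer pid=%d uid=%d gid=%d\n",
	 *		       uc.pid, uc.uid, uc.gid);
	 */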
1240 case SO_PEERGROUPS:
1242 int ret, n;
1244 if (!sk->sk_peer_cred)
1245 return -ENODATA;
1247 n = sk->sk_peer_cred->group_info->ngroups;
1248 if (len < n * sizeof(gid_t)) {
1249 len = n * sizeof(gid_t);
1250 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1252 len = n * sizeof(gid_t);
1254 ret = groups_to_user((gid_t __user *)optval,
1255 sk->sk_peer_cred->group_info);
1256 if (ret)
1257 return ret;
1258 goto lenout;
1261 case SO_PEERNAME:
1263 char address[128];
1265 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266 if (lv < 0)
1267 return -ENOTCONN;
1268 if (lv < len)
1269 return -EINVAL;
1270 if (copy_to_user(optval, address, len))
1271 return -EFAULT;
1272 goto lenout;
1275 /* Dubious BSD thing... Probably nobody even uses it, but
1276 * the UNIX standard wants it for whatever reason... -DaveM
1278 case SO_ACCEPTCONN:
1279 v.val = sk->sk_state == TCP_LISTEN;
1280 break;
1282 case SO_PASSSEC:
1283 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1286 case SO_PEERSEC:
1287 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1289 case SO_MARK:
1290 v.val = sk->sk_mark;
1291 break;
1293 case SO_RXQ_OVFL:
1294 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295 break;
1297 case SO_WIFI_STATUS:
1298 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299 break;
1301 case SO_PEEK_OFF:
1302 if (!sock->ops->set_peek_off)
1303 return -EOPNOTSUPP;
1305 v.val = sk->sk_peek_off;
1306 break;
1307 case SO_NOFCS:
1308 v.val = sock_flag(sk, SOCK_NOFCS);
1309 break;
1311 case SO_BINDTODEVICE:
1312 return sock_getbindtodevice(sk, optval, optlen, len);
1314 case SO_GET_FILTER:
1315 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316 if (len < 0)
1317 return len;
1319 goto lenout;
1321 case SO_LOCK_FILTER:
1322 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323 break;
1325 case SO_BPF_EXTENSIONS:
1326 v.val = bpf_tell_extensions();
1327 break;
1329 case SO_SELECT_ERR_QUEUE:
1330 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331 break;
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334 case SO_BUSY_POLL:
1335 v.val = sk->sk_ll_usec;
1336 break;
1337 #endif
1339 case SO_MAX_PACING_RATE:
1340 v.val = sk->sk_max_pacing_rate;
1341 break;
1343 case SO_INCOMING_CPU:
1344 v.val = READ_ONCE(sk->sk_incoming_cpu);
1345 break;
1347 case SO_MEMINFO:
1349 u32 meminfo[SK_MEMINFO_VARS];
1351 sk_get_meminfo(sk, meminfo);
1353 len = min_t(unsigned int, len, sizeof(meminfo));
1354 if (copy_to_user(optval, &meminfo, len))
1355 return -EFAULT;
1357 goto lenout;
1360 #ifdef CONFIG_NET_RX_BUSY_POLL
1361 case SO_INCOMING_NAPI_ID:
1362 v.val = READ_ONCE(sk->sk_napi_id);
1364 /* aggregate non-NAPI IDs down to 0 */
1365 if (v.val < MIN_NAPI_ID)
1366 v.val = 0;
1368 break;
1369 #endif
1371 case SO_COOKIE:
1372 lv = sizeof(u64);
1373 if (len < lv)
1374 return -EINVAL;
1375 v.val64 = sock_gen_cookie(sk);
1376 break;
1378 case SO_ZEROCOPY:
1379 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1380 break;
1382 case SO_TXTIME:
1383 lv = sizeof(v.txtime);
1384 v.txtime.clockid = sk->sk_clockid;
1385 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1386 SOF_TXTIME_DEADLINE_MODE : 0;
1387 v.txtime.flags |= sk->sk_txtime_report_errors ?
1388 SOF_TXTIME_REPORT_ERRORS : 0;
1389 break;
1391 default:
1392 /* We implement the SO_SNDLOWAT etc to not be settable
1393 * (1003.1g 7).
1395 return -ENOPROTOOPT;
1398 if (len > lv)
1399 len = lv;
1400 if (copy_to_user(optval, &v, len))
1401 return -EFAULT;
1402 lenout:
1403 if (put_user(len, optlen))
1404 return -EFAULT;
1405 return 0;
1409 * Initialize an sk_lock.
1411 * (We also register the sk_lock with the lock validator.)
1413 static inline void sock_lock_init(struct sock *sk)
1415 if (sk->sk_kern_sock)
1416 sock_lock_init_class_and_name(
1418 af_family_kern_slock_key_strings[sk->sk_family],
1419 af_family_kern_slock_keys + sk->sk_family,
1420 af_family_kern_key_strings[sk->sk_family],
1421 af_family_kern_keys + sk->sk_family);
1422 else
1423 sock_lock_init_class_and_name(
1425 af_family_slock_key_strings[sk->sk_family],
1426 af_family_slock_keys + sk->sk_family,
1427 af_family_key_strings[sk->sk_family],
1428 af_family_keys + sk->sk_family);
1432 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1433 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1434 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1436 static void sock_copy(struct sock *nsk, const struct sock *osk)
1438 #ifdef CONFIG_SECURITY_NETWORK
1439 void *sptr = nsk->sk_security;
1440 #endif
1441 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1443 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1444 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1446 #ifdef CONFIG_SECURITY_NETWORK
1447 nsk->sk_security = sptr;
1448 security_sk_clone(osk, nsk);
1449 #endif
1452 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1453 int family)
1455 struct sock *sk;
1456 struct kmem_cache *slab;
1458 slab = prot->slab;
1459 if (slab != NULL) {
1460 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1461 if (!sk)
1462 return sk;
1463 if (priority & __GFP_ZERO)
1464 sk_prot_clear_nulls(sk, prot->obj_size);
1465 } else
1466 sk = kmalloc(prot->obj_size, priority);
1468 if (sk != NULL) {
1469 if (security_sk_alloc(sk, family, priority))
1470 goto out_free;
1472 if (!try_module_get(prot->owner))
1473 goto out_free_sec;
1474 sk_tx_queue_clear(sk);
1477 return sk;
1479 out_free_sec:
1480 security_sk_free(sk);
1481 out_free:
1482 if (slab != NULL)
1483 kmem_cache_free(slab, sk);
1484 else
1485 kfree(sk);
1486 return NULL;
1489 static void sk_prot_free(struct proto *prot, struct sock *sk)
1491 struct kmem_cache *slab;
1492 struct module *owner;
1494 owner = prot->owner;
1495 slab = prot->slab;
1497 cgroup_sk_free(&sk->sk_cgrp_data);
1498 mem_cgroup_sk_free(sk);
1499 security_sk_free(sk);
1500 if (slab != NULL)
1501 kmem_cache_free(slab, sk);
1502 else
1503 kfree(sk);
1504 module_put(owner);
1508 * sk_alloc - All socket objects are allocated here
1509 * @net: the applicable net namespace
1510 * @family: protocol family
1511 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1512 * @prot: struct proto associated with this new sock instance
1513 * @kern: is this to be a kernel socket?
1515 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1516 struct proto *prot, int kern)
1518 struct sock *sk;
1520 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1521 if (sk) {
1522 sk->sk_family = family;
1524 * See comment in struct sock definition to understand
1525 * why we need sk_prot_creator -acme
1527 sk->sk_prot = sk->sk_prot_creator = prot;
1528 sk->sk_kern_sock = kern;
1529 sock_lock_init(sk);
1530 sk->sk_net_refcnt = kern ? 0 : 1;
1531 if (likely(sk->sk_net_refcnt)) {
1532 get_net(net);
1533 sock_inuse_add(net, 1);
1536 sock_net_set(sk, net);
1537 refcount_set(&sk->sk_wmem_alloc, 1);
1539 mem_cgroup_sk_alloc(sk);
1540 cgroup_sk_alloc(&sk->sk_cgrp_data);
1541 sock_update_classid(&sk->sk_cgrp_data);
1542 sock_update_netprioidx(&sk->sk_cgrp_data);
1543 sk_tx_queue_clear(sk);
1546 return sk;
1548 EXPORT_SYMBOL(sk_alloc);
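
/* Editor's illustrative sketch (hypothetical protocol "my_proto"): the usual
 * shape of a family ->create() handler built on sk_alloc().  The struct proto
 * my_proto and the PF_UNIX family value are only examples.
 */
static int my_family_create(struct net *net, struct socket *sock,
			    int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_UNIX /* example family */, GFP_KERNEL,
		      &my_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* queues, callbacks, default buffers */
	sk->sk_protocol = protocol;
	return 0;
}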
1550 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1551 * grace period. This is the case for UDP sockets and TCP listeners.
1553 static void __sk_destruct(struct rcu_head *head)
1555 struct sock *sk = container_of(head, struct sock, sk_rcu);
1556 struct sk_filter *filter;
1558 if (sk->sk_destruct)
1559 sk->sk_destruct(sk);
1561 filter = rcu_dereference_check(sk->sk_filter,
1562 refcount_read(&sk->sk_wmem_alloc) == 0);
1563 if (filter) {
1564 sk_filter_uncharge(sk, filter);
1565 RCU_INIT_POINTER(sk->sk_filter, NULL);
1568 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1570 if (atomic_read(&sk->sk_omem_alloc))
1571 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1572 __func__, atomic_read(&sk->sk_omem_alloc));
1574 if (sk->sk_frag.page) {
1575 put_page(sk->sk_frag.page);
1576 sk->sk_frag.page = NULL;
1579 if (sk->sk_peer_cred)
1580 put_cred(sk->sk_peer_cred);
1581 put_pid(sk->sk_peer_pid);
1582 if (likely(sk->sk_net_refcnt))
1583 put_net(sock_net(sk));
1584 sk_prot_free(sk->sk_prot_creator, sk);
1587 void sk_destruct(struct sock *sk)
1589 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1591 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1592 reuseport_detach_sock(sk);
1593 use_call_rcu = true;
1596 if (use_call_rcu)
1597 call_rcu(&sk->sk_rcu, __sk_destruct);
1598 else
1599 __sk_destruct(&sk->sk_rcu);
1602 static void __sk_free(struct sock *sk)
1604 if (likely(sk->sk_net_refcnt))
1605 sock_inuse_add(sock_net(sk), -1);
1607 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1608 sock_diag_broadcast_destroy(sk);
1609 else
1610 sk_destruct(sk);
1613 void sk_free(struct sock *sk)
1616 * We subtract one from sk_wmem_alloc so we can tell whether
1617 * some packets are still in a tx queue.
1618 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1620 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1621 __sk_free(sk);
1623 EXPORT_SYMBOL(sk_free);
1625 static void sk_init_common(struct sock *sk)
1627 skb_queue_head_init(&sk->sk_receive_queue);
1628 skb_queue_head_init(&sk->sk_write_queue);
1629 skb_queue_head_init(&sk->sk_error_queue);
1631 rwlock_init(&sk->sk_callback_lock);
1632 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1633 af_rlock_keys + sk->sk_family,
1634 af_family_rlock_key_strings[sk->sk_family]);
1635 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1636 af_wlock_keys + sk->sk_family,
1637 af_family_wlock_key_strings[sk->sk_family]);
1638 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1639 af_elock_keys + sk->sk_family,
1640 af_family_elock_key_strings[sk->sk_family]);
1641 lockdep_set_class_and_name(&sk->sk_callback_lock,
1642 af_callback_keys + sk->sk_family,
1643 af_family_clock_key_strings[sk->sk_family]);
1647 * sk_clone_lock - clone a socket, and lock its clone
1648 * @sk: the socket to clone
1649 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1651 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1653 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1655 struct sock *newsk;
1656 bool is_charged = true;
1658 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1659 if (newsk != NULL) {
1660 struct sk_filter *filter;
1662 sock_copy(newsk, sk);
1664 newsk->sk_prot_creator = sk->sk_prot;
1666 /* SANITY */
1667 if (likely(newsk->sk_net_refcnt))
1668 get_net(sock_net(newsk));
1669 sk_node_init(&newsk->sk_node);
1670 sock_lock_init(newsk);
1671 bh_lock_sock(newsk);
1672 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1673 newsk->sk_backlog.len = 0;
1675 atomic_set(&newsk->sk_rmem_alloc, 0);
1677 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1679 refcount_set(&newsk->sk_wmem_alloc, 1);
1680 atomic_set(&newsk->sk_omem_alloc, 0);
1681 sk_init_common(newsk);
1683 newsk->sk_dst_cache = NULL;
1684 newsk->sk_dst_pending_confirm = 0;
1685 newsk->sk_wmem_queued = 0;
1686 newsk->sk_forward_alloc = 0;
1687 atomic_set(&newsk->sk_drops, 0);
1688 newsk->sk_send_head = NULL;
1689 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1690 atomic_set(&newsk->sk_zckey, 0);
1692 sock_reset_flag(newsk, SOCK_DONE);
1694 /* sk->sk_memcg will be populated at accept() time */
1695 newsk->sk_memcg = NULL;
1697 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1699 rcu_read_lock();
1700 filter = rcu_dereference(sk->sk_filter);
1701 if (filter != NULL)
1702 /* though it's an empty new sock, the charging may fail
1703 * if sysctl_optmem_max was changed between creation of
1704 * original socket and cloning
1706 is_charged = sk_filter_charge(newsk, filter);
1707 RCU_INIT_POINTER(newsk->sk_filter, filter);
1708 rcu_read_unlock();
1710 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1711 /* We need to make sure that we don't uncharge the new
1712 * socket if we couldn't charge it in the first place
1713 * as otherwise we uncharge the parent's filter.
1715 if (!is_charged)
1716 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1717 sk_free_unlock_clone(newsk);
1718 newsk = NULL;
1719 goto out;
1721 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1723 newsk->sk_err = 0;
1724 newsk->sk_err_soft = 0;
1725 newsk->sk_priority = 0;
1726 newsk->sk_incoming_cpu = raw_smp_processor_id();
1727 atomic64_set(&newsk->sk_cookie, 0);
1728 if (likely(newsk->sk_net_refcnt))
1729 sock_inuse_add(sock_net(newsk), 1);
1732 * Before updating sk_refcnt, we must commit prior changes to memory
1733 * (Documentation/RCU/rculist_nulls.txt for details)
1735 smp_wmb();
1736 refcount_set(&newsk->sk_refcnt, 2);
1739 * Increment the counter in the same struct proto as the master
1740 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1741 * is the same as sk->sk_prot->socks, as this field was copied
1742 * with memcpy).
1744 * This _changes_ the previous behaviour, where
1745 * tcp_create_openreq_child always was incrementing the
1746 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1747 * to be taken into account in all callers. -acme
1749 sk_refcnt_debug_inc(newsk);
1750 sk_set_socket(newsk, NULL);
1751 sk_tx_queue_clear(newsk);
1752 newsk->sk_wq = NULL;
1754 if (newsk->sk_prot->sockets_allocated)
1755 sk_sockets_allocated_inc(newsk);
1757 if (sock_needs_netstamp(sk) &&
1758 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1759 net_enable_timestamp();
1761 out:
1762 return newsk;
1764 EXPORT_SYMBOL_GPL(sk_clone_lock);
1766 void sk_free_unlock_clone(struct sock *sk)
1768 /* It is still a raw copy of the parent, so invalidate the
1769 * destructor and do a plain sk_free() */
1770 sk->sk_destruct = NULL;
1771 bh_unlock_sock(sk);
1772 sk_free(sk);
1774 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1776 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1778 u32 max_segs = 1;
1780 sk_dst_set(sk, dst);
1781 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1782 if (sk->sk_route_caps & NETIF_F_GSO)
1783 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1784 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1785 if (sk_can_gso(sk)) {
1786 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1787 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1788 } else {
1789 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1790 sk->sk_gso_max_size = dst->dev->gso_max_size;
1791 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1794 sk->sk_gso_max_segs = max_segs;
1796 EXPORT_SYMBOL_GPL(sk_setup_caps);
1799 * Simple resource managers for sockets.
1804 * Write buffer destructor automatically called from kfree_skb.
1806 void sock_wfree(struct sk_buff *skb)
1808 struct sock *sk = skb->sk;
1809 unsigned int len = skb->truesize;
1811 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813 * Keep a reference on sk_wmem_alloc, this will be released
1814 * after sk_write_space() call
1816 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1817 sk->sk_write_space(sk);
1818 len = 1;
1821 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1822 * could not do because of in-flight packets
1824 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1825 __sk_free(sk);
1827 EXPORT_SYMBOL(sock_wfree);
1829 /* This variant of sock_wfree() is used by TCP,
1830 * since it sets SOCK_USE_WRITE_QUEUE.
1832 void __sock_wfree(struct sk_buff *skb)
1834 struct sock *sk = skb->sk;
1836 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1837 __sk_free(sk);
1840 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842 skb_orphan(skb);
1843 skb->sk = sk;
1844 #ifdef CONFIG_INET
1845 if (unlikely(!sk_fullsock(sk))) {
1846 skb->destructor = sock_edemux;
1847 sock_hold(sk);
1848 return;
1850 #endif
1851 skb->destructor = sock_wfree;
1852 skb_set_hash_from_sk(skb, sk);
1854 * We used to take a refcount on sk, but the following operation
1855 * is enough to guarantee sk_free() won't free this sock until
1856 * all in-flight packets are completed
1858 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1860 EXPORT_SYMBOL(skb_set_owner_w);
1862 /* This helper is used by netem, as it can hold packets in its
1863 * delay queue. We want to allow the owner socket to send more
1864 * packets, as if they were already TX completed by a typical driver.
1865 * But we also want to keep skb->sk set because some packet schedulers
1866 * rely on it (sch_fq for example).
1868 void skb_orphan_partial(struct sk_buff *skb)
1870 if (skb_is_tcp_pure_ack(skb))
1871 return;
1873 if (skb->destructor == sock_wfree
1874 #ifdef CONFIG_INET
1875 || skb->destructor == tcp_wfree
1876 #endif
1878 struct sock *sk = skb->sk;
1880 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1881 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1882 skb->destructor = sock_efree;
1884 } else {
1885 skb_orphan(skb);
1888 EXPORT_SYMBOL(skb_orphan_partial);
1891 * Read buffer destructor automatically called from kfree_skb.
1893 void sock_rfree(struct sk_buff *skb)
1895 struct sock *sk = skb->sk;
1896 unsigned int len = skb->truesize;
1898 atomic_sub(len, &sk->sk_rmem_alloc);
1899 sk_mem_uncharge(sk, len);
1901 EXPORT_SYMBOL(sock_rfree);
1904 * Buffer destructor for skbs that are not used directly in read or write
1905 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1907 void sock_efree(struct sk_buff *skb)
1909 sock_put(skb->sk);
1911 EXPORT_SYMBOL(sock_efree);
1913 kuid_t sock_i_uid(struct sock *sk)
1915 kuid_t uid;
1917 read_lock_bh(&sk->sk_callback_lock);
1918 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1919 read_unlock_bh(&sk->sk_callback_lock);
1920 return uid;
1922 EXPORT_SYMBOL(sock_i_uid);
1924 unsigned long sock_i_ino(struct sock *sk)
1926 unsigned long ino;
1928 read_lock_bh(&sk->sk_callback_lock);
1929 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1930 read_unlock_bh(&sk->sk_callback_lock);
1931 return ino;
1933 EXPORT_SYMBOL(sock_i_ino);
1936 * Allocate a skb from the socket's send buffer.
1938 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1939 gfp_t priority)
1941 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1942 struct sk_buff *skb = alloc_skb(size, priority);
1943 if (skb) {
1944 skb_set_owner_w(skb, sk);
1945 return skb;
1948 return NULL;
1950 EXPORT_SYMBOL(sock_wmalloc);
1952 static void sock_ofree(struct sk_buff *skb)
1954 struct sock *sk = skb->sk;
1956 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1959 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1960 gfp_t priority)
1962 struct sk_buff *skb;
1964 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1965 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1966 sysctl_optmem_max)
1967 return NULL;
1969 skb = alloc_skb(size, priority);
1970 if (!skb)
1971 return NULL;
1973 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1974 skb->sk = sk;
1975 skb->destructor = sock_ofree;
1976 return skb;
1980 * Allocate a memory block from the socket's option memory buffer.
1982 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1984 if ((unsigned int)size <= sysctl_optmem_max &&
1985 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1986 void *mem;
1987 /* First do the add, to avoid the race if kmalloc
1988 * might sleep.
1990 atomic_add(size, &sk->sk_omem_alloc);
1991 mem = kmalloc(size, priority);
1992 if (mem)
1993 return mem;
1994 atomic_sub(size, &sk->sk_omem_alloc);
1996 return NULL;
1998 EXPORT_SYMBOL(sock_kmalloc);
2000 /* Free an option memory block. Note, we actually want the inline
2001 * here as this allows gcc to detect the nullify and fold away the
2002 * condition entirely.
2004 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2005 const bool nullify)
2007 if (WARN_ON_ONCE(!mem))
2008 return;
2009 if (nullify)
2010 kzfree(mem);
2011 else
2012 kfree(mem);
2013 atomic_sub(size, &sk->sk_omem_alloc);
2016 void sock_kfree_s(struct sock *sk, void *mem, int size)
2018 __sock_kfree_s(sk, mem, size, false);
2020 EXPORT_SYMBOL(sock_kfree_s);
2022 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2024 __sock_kfree_s(sk, mem, size, true);
2026 EXPORT_SYMBOL(sock_kzfree_s);
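
/* Editor's illustrative sketch (hypothetical function): option memory is
 * charged against sk_omem_alloc, so every sock_kmalloc() must be released
 * with the matching sock_kfree_s()/sock_kzfree_s() and the same size.
 */
static int my_proto_store_key(struct sock *sk, const u8 *key, int len)
{
	u8 *copy = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!copy)
		return -ENOBUFS;
	memcpy(copy, key, len);
	/* ... use the key ... */
	sock_kzfree_s(sk, copy, len);	/* zero sensitive data on free */
	return 0;
}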
2028 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2029 I think, these locks should be removed for datagram sockets.
2031 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2033 DEFINE_WAIT(wait);
2035 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2036 for (;;) {
2037 if (!timeo)
2038 break;
2039 if (signal_pending(current))
2040 break;
2041 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2042 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2043 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2044 break;
2045 if (sk->sk_shutdown & SEND_SHUTDOWN)
2046 break;
2047 if (sk->sk_err)
2048 break;
2049 timeo = schedule_timeout(timeo);
2051 finish_wait(sk_sleep(sk), &wait);
2052 return timeo;
2057 * Generic send/receive buffer handlers
2060 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2061 unsigned long data_len, int noblock,
2062 int *errcode, int max_page_order)
2064 struct sk_buff *skb;
2065 long timeo;
2066 int err;
2068 timeo = sock_sndtimeo(sk, noblock);
2069 for (;;) {
2070 err = sock_error(sk);
2071 if (err != 0)
2072 goto failure;
2074 err = -EPIPE;
2075 if (sk->sk_shutdown & SEND_SHUTDOWN)
2076 goto failure;
2078 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2079 break;
2081 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2082 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2083 err = -EAGAIN;
2084 if (!timeo)
2085 goto failure;
2086 if (signal_pending(current))
2087 goto interrupted;
2088 timeo = sock_wait_for_wmem(sk, timeo);
2090 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2091 errcode, sk->sk_allocation);
2092 if (skb)
2093 skb_set_owner_w(skb, sk);
2094 return skb;
2096 interrupted:
2097 err = sock_intr_errno(timeo);
2098 failure:
2099 *errcode = err;
2100 return NULL;
2102 EXPORT_SYMBOL(sock_alloc_send_pskb);
2104 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2105 int noblock, int *errcode)
2107 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2109 EXPORT_SYMBOL(sock_alloc_send_skb);
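
/* Editor's illustrative sketch (hypothetical protocol sendmsg path): a
 * blocking allocation from the socket's send buffer that honours
 * MSG_DONTWAIT, as many datagram protocols do.
 */
static int my_proto_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT,
				  &err);
	if (!skb)
		return err;

	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand the skb to the transmit path ... */
	return len;
}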
2111 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2112 struct sockcm_cookie *sockc)
2114 u32 tsflags;
2116 switch (cmsg->cmsg_type) {
2117 case SO_MARK:
2118 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2119 return -EPERM;
2120 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2121 return -EINVAL;
2122 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2123 break;
2124 case SO_TIMESTAMPING:
2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 return -EINVAL;
2128 tsflags = *(u32 *)CMSG_DATA(cmsg);
2129 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2130 return -EINVAL;
2132 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2133 sockc->tsflags |= tsflags;
2134 break;
2135 case SCM_TXTIME:
2136 if (!sock_flag(sk, SOCK_TXTIME))
2137 return -EINVAL;
2138 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2139 return -EINVAL;
2140 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2141 break;
2142 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2143 case SCM_RIGHTS:
2144 case SCM_CREDENTIALS:
2145 break;
2146 default:
2147 return -EINVAL;
2149 return 0;
2151 EXPORT_SYMBOL(__sock_cmsg_send);
2153 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2154 struct sockcm_cookie *sockc)
2156 struct cmsghdr *cmsg;
2157 int ret;
2159 for_each_cmsghdr(cmsg, msg) {
2160 if (!CMSG_OK(msg, cmsg))
2161 return -EINVAL;
2162 if (cmsg->cmsg_level != SOL_SOCKET)
2163 continue;
2164 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2165 if (ret)
2166 return ret;
2168 return 0;
2170 EXPORT_SYMBOL(sock_cmsg_send);
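
/* Editor's illustrative userspace counterpart: passing SO_MARK as ancillary
 * data on a single sendmsg() call (needs CAP_NET_ADMIN), which is what
 * __sock_cmsg_send() above parses.  The mark value 42 is arbitrary.
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SO_MARK;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cm) = 42;
 *	// also set msg_name/msg_iov as usual, then sendmsg(fd, &msg, 0);
 */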
2172 static void sk_enter_memory_pressure(struct sock *sk)
2174 if (!sk->sk_prot->enter_memory_pressure)
2175 return;
2177 sk->sk_prot->enter_memory_pressure(sk);
2180 static void sk_leave_memory_pressure(struct sock *sk)
2182 if (sk->sk_prot->leave_memory_pressure) {
2183 sk->sk_prot->leave_memory_pressure(sk);
2184 } else {
2185 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2187 if (memory_pressure && READ_ONCE(*memory_pressure))
2188 WRITE_ONCE(*memory_pressure, 0);
2192 /* On 32bit arches, an skb frag is limited to 2^15 */
2193 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2196 * skb_page_frag_refill - check that a page_frag contains enough room
2197 * @sz: minimum size of the fragment we want to get
2198 * @pfrag: pointer to page_frag
2199 * @gfp: priority for memory allocation
2201 * Note: While this allocator tries to use high order pages, there is
2202 * no guarantee that allocations succeed. Therefore, @sz MUST be
2203 * less than or equal to PAGE_SIZE.
2205 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2207 if (pfrag->page) {
2208 if (page_ref_count(pfrag->page) == 1) {
2209 pfrag->offset = 0;
2210 return true;
2212 if (pfrag->offset + sz <= pfrag->size)
2213 return true;
2214 put_page(pfrag->page);
2217 pfrag->offset = 0;
2218 if (SKB_FRAG_PAGE_ORDER) {
2219 /* Avoid direct reclaim but allow kswapd to wake */
2220 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2221 __GFP_COMP | __GFP_NOWARN |
2222 __GFP_NORETRY,
2223 SKB_FRAG_PAGE_ORDER);
2224 if (likely(pfrag->page)) {
2225 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2226 return true;
2229 pfrag->page = alloc_page(gfp);
2230 if (likely(pfrag->page)) {
2231 pfrag->size = PAGE_SIZE;
2232 return true;
2234 return false;
2236 EXPORT_SYMBOL(skb_page_frag_refill);
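/*
 * sk_page_frag_refill - per-socket wrapper: refill sk's page_frag with at
 * least 32 bytes using sk->sk_allocation. On failure, enter memory
 * pressure and moderate the send buffer so the sender backs off.
 */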
2238 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2240 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2241 return true;
2243 sk_enter_memory_pressure(sk);
2244 sk_stream_moderate_sndbuf(sk);
2245 return false;
2247 EXPORT_SYMBOL(sk_page_frag_refill);
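/*
 * sk_alloc_sg - fill a scatterlist with up to @len bytes of charged
 * page-frag memory, coalescing with the previous entry when the new chunk
 * is contiguous in the same page. The sg ring wraps at MAX_SKB_FRAGS and
 * -ENOSPC is returned once it would catch up with @sg_start.
 */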
2249 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2250 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2251 int first_coalesce)
2253 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2254 unsigned int size = *sg_curr_size;
2255 struct page_frag *pfrag;
2256 struct scatterlist *sge;
2258 len -= size;
2259 pfrag = sk_page_frag(sk);
2261 while (len > 0) {
2262 unsigned int orig_offset;
2264 if (!sk_page_frag_refill(sk, pfrag)) {
2265 rc = -ENOMEM;
2266 goto out;
2269 use = min_t(int, len, pfrag->size - pfrag->offset);
2271 if (!sk_wmem_schedule(sk, use)) {
2272 rc = -ENOMEM;
2273 goto out;
2276 sk_mem_charge(sk, use);
2277 size += use;
2278 orig_offset = pfrag->offset;
2279 pfrag->offset += use;
2281 sge = sg + sg_curr - 1;
2282 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2283 sge->offset + sge->length == orig_offset) {
2284 sge->length += use;
2285 } else {
2286 sge = sg + sg_curr;
2287 sg_unmark_end(sge);
2288 sg_set_page(sge, pfrag->page, use, orig_offset);
2289 get_page(pfrag->page);
2290 sg_curr++;
2292 if (sg_curr == MAX_SKB_FRAGS)
2293 sg_curr = 0;
2295 if (sg_curr == sg_start) {
2296 rc = -ENOSPC;
2297 break;
2301 len -= use;
2303 out:
2304 *sg_curr_size = size;
2305 *sg_curr_index = sg_curr;
2306 return rc;
2308 EXPORT_SYMBOL(sk_alloc_sg);
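/*
 * __lock_sock - slow path of lock_sock(): sleep uninterruptibly on
 * sk_lock.wq until the process-context owner releases the socket,
 * dropping the slock spinlock while waiting.
 */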
2310 static void __lock_sock(struct sock *sk)
2311 __releases(&sk->sk_lock.slock)
2312 __acquires(&sk->sk_lock.slock)
2314 DEFINE_WAIT(wait);
2316 for (;;) {
2317 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2318 TASK_UNINTERRUPTIBLE);
2319 spin_unlock_bh(&sk->sk_lock.slock);
2320 schedule();
2321 spin_lock_bh(&sk->sk_lock.slock);
2322 if (!sock_owned_by_user(sk))
2323 break;
2325 finish_wait(&sk->sk_lock.wq, &wait);
2328 void __release_sock(struct sock *sk)
2329 __releases(&sk->sk_lock.slock)
2330 __acquires(&sk->sk_lock.slock)
2332 struct sk_buff *skb, *next;
2334 while ((skb = sk->sk_backlog.head) != NULL) {
2335 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2337 spin_unlock_bh(&sk->sk_lock.slock);
2339 do {
2340 next = skb->next;
2341 prefetch(next);
2342 WARN_ON_ONCE(skb_dst_is_noref(skb));
2343 skb->next = NULL;
2344 sk_backlog_rcv(sk, skb);
2346 cond_resched();
2348 skb = next;
2349 } while (skb != NULL);
2351 spin_lock_bh(&sk->sk_lock.slock);
2355 	 * Doing the zeroing here guarantees we cannot loop forever
2356 	 * while a wild producer attempts to flood us.
2358 sk->sk_backlog.len = 0;
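/*
 * __sk_flush_backlog - run the backlog processing above under the slock,
 * for callers that want to drain the backlog without fully releasing the
 * socket.
 */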
2361 void __sk_flush_backlog(struct sock *sk)
2363 spin_lock_bh(&sk->sk_lock.slock);
2364 __release_sock(sk);
2365 spin_unlock_bh(&sk->sk_lock.slock);
2369 * sk_wait_data - wait for data to arrive at sk_receive_queue
2370 * @sk: sock to wait on
2371 * @timeo: for how long
2372 * @skb: last skb seen on sk_receive_queue
2374  * Now socket state, including sk->sk_err, is changed only under the socket
2375  * lock, hence we may omit checks after joining the wait queue.
2376  * We check the receive queue before schedule() only as an optimization;
2377  * it is very likely that release_sock() added new data.
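 * Callers are expected to hold the socket lock; sk_wait_event() drops and
 * re-acquires it around the actual sleep.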
2379 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2381 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2382 int rc;
2384 add_wait_queue(sk_sleep(sk), &wait);
2385 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2386 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2387 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2388 remove_wait_queue(sk_sleep(sk), &wait);
2389 return rc;
2391 EXPORT_SYMBOL(sk_wait_data);
2394 * __sk_mem_raise_allocated - increase memory_allocated
2395 * @sk: socket
2396 * @size: memory size to allocate
2397 * @amt: pages to allocate
2398 * @kind: allocation type
2400 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2402 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2404 struct proto *prot = sk->sk_prot;
2405 long allocated = sk_memory_allocated_add(sk, amt);
2406 bool charged = true;
2408 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2409 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2410 goto suppress_allocation;
2412 /* Under limit. */
2413 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2414 sk_leave_memory_pressure(sk);
2415 return 1;
2418 /* Under pressure. */
2419 if (allocated > sk_prot_mem_limits(sk, 1))
2420 sk_enter_memory_pressure(sk);
2422 /* Over hard limit. */
2423 if (allocated > sk_prot_mem_limits(sk, 2))
2424 goto suppress_allocation;
2426 /* guarantee minimum buffer size under pressure */
2427 if (kind == SK_MEM_RECV) {
2428 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2429 return 1;
2431 } else { /* SK_MEM_SEND */
2432 int wmem0 = sk_get_wmem0(sk, prot);
2434 if (sk->sk_type == SOCK_STREAM) {
2435 if (sk->sk_wmem_queued < wmem0)
2436 return 1;
2437 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2438 return 1;
2442 if (sk_has_memory_pressure(sk)) {
2443 u64 alloc;
2445 if (!sk_under_memory_pressure(sk))
2446 return 1;
2447 alloc = sk_sockets_allocated_read_positive(sk);
2448 if (sk_prot_mem_limits(sk, 2) > alloc *
2449 sk_mem_pages(sk->sk_wmem_queued +
2450 atomic_read(&sk->sk_rmem_alloc) +
2451 sk->sk_forward_alloc))
2452 return 1;
2455 suppress_allocation:
2457 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2458 sk_stream_moderate_sndbuf(sk);
2460 /* Fail only if socket is _under_ its sndbuf.
2461 		 * In this case we cannot block, so we have to fail.
2463 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2464 return 1;
2467 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2468 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2470 sk_memory_allocated_sub(sk, amt);
2472 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2473 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2475 return 0;
2477 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2480 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2481 * @sk: socket
2482 * @size: memory size to allocate
2483 * @kind: allocation type
2485 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2486 * rmem allocation. This function assumes that protocols which have
2487  * memory_pressure use sk_wmem_queued for write buffer accounting.
2489 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2491 int ret, amt = sk_mem_pages(size);
2493 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2494 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2495 if (!ret)
2496 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2497 return ret;
2499 EXPORT_SYMBOL(__sk_mem_schedule);
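/*
 * Protocols normally reach __sk_mem_schedule() above through helpers such
 * as sk_wmem_schedule()/sk_rmem_schedule() (include/net/sock.h), which
 * only call it when sk_forward_alloc does not already cover the request.
 */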
2502 * __sk_mem_reduce_allocated - reclaim memory_allocated
2503 * @sk: socket
2504 * @amount: number of quanta
2506 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2508 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2510 sk_memory_allocated_sub(sk, amount);
2512 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2513 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2515 if (sk_under_memory_pressure(sk) &&
2516 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2517 sk_leave_memory_pressure(sk);
2519 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2522 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2523 * @sk: socket
2524 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2526 void __sk_mem_reclaim(struct sock *sk, int amount)
2528 amount >>= SK_MEM_QUANTUM_SHIFT;
2529 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2530 __sk_mem_reduce_allocated(sk, amount);
2532 EXPORT_SYMBOL(__sk_mem_reclaim);
2534 int sk_set_peek_off(struct sock *sk, int val)
2536 sk->sk_peek_off = val;
2537 return 0;
2539 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2542 * Set of default routines for initialising struct proto_ops when
2543 * the protocol does not support a particular function. In certain
2544 * cases where it makes no sense for a protocol to have a "do nothing"
2545 * function, some default processing is provided.
2548 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2550 return -EOPNOTSUPP;
2552 EXPORT_SYMBOL(sock_no_bind);
2554 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2555 int len, int flags)
2557 return -EOPNOTSUPP;
2559 EXPORT_SYMBOL(sock_no_connect);
2561 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2563 return -EOPNOTSUPP;
2565 EXPORT_SYMBOL(sock_no_socketpair);
2567 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2568 bool kern)
2570 return -EOPNOTSUPP;
2572 EXPORT_SYMBOL(sock_no_accept);
2574 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2575 int peer)
2577 return -EOPNOTSUPP;
2579 EXPORT_SYMBOL(sock_no_getname);
2581 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2583 return -EOPNOTSUPP;
2585 EXPORT_SYMBOL(sock_no_ioctl);
2587 int sock_no_listen(struct socket *sock, int backlog)
2589 return -EOPNOTSUPP;
2591 EXPORT_SYMBOL(sock_no_listen);
2593 int sock_no_shutdown(struct socket *sock, int how)
2595 return -EOPNOTSUPP;
2597 EXPORT_SYMBOL(sock_no_shutdown);
2599 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2600 char __user *optval, unsigned int optlen)
2602 return -EOPNOTSUPP;
2604 EXPORT_SYMBOL(sock_no_setsockopt);
2606 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2607 char __user *optval, int __user *optlen)
2609 return -EOPNOTSUPP;
2611 EXPORT_SYMBOL(sock_no_getsockopt);
2613 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2615 return -EOPNOTSUPP;
2617 EXPORT_SYMBOL(sock_no_sendmsg);
2619 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2621 return -EOPNOTSUPP;
2623 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2625 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2626 int flags)
2628 return -EOPNOTSUPP;
2630 EXPORT_SYMBOL(sock_no_recvmsg);
2632 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2634 /* Mirror missing mmap method error code */
2635 return -ENODEV;
2637 EXPORT_SYMBOL(sock_no_mmap);
2639 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2641 ssize_t res;
2642 struct msghdr msg = {.msg_flags = flags};
2643 struct kvec iov;
2644 char *kaddr = kmap(page);
2645 iov.iov_base = kaddr + offset;
2646 iov.iov_len = size;
2647 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2648 kunmap(page);
2649 return res;
2651 EXPORT_SYMBOL(sock_no_sendpage);
2653 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2654 int offset, size_t size, int flags)
2656 ssize_t res;
2657 struct msghdr msg = {.msg_flags = flags};
2658 struct kvec iov;
2659 char *kaddr = kmap(page);
2661 iov.iov_base = kaddr + offset;
2662 iov.iov_len = size;
2663 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2664 kunmap(page);
2665 return res;
2667 EXPORT_SYMBOL(sock_no_sendpage_locked);
2670 * Default Socket Callbacks
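 *
 * These are installed by sock_init_data() below and may be overridden by
 * individual protocols after the socket has been initialised.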
2673 static void sock_def_wakeup(struct sock *sk)
2675 struct socket_wq *wq;
2677 rcu_read_lock();
2678 wq = rcu_dereference(sk->sk_wq);
2679 if (skwq_has_sleeper(wq))
2680 wake_up_interruptible_all(&wq->wait);
2681 rcu_read_unlock();
2684 static void sock_def_error_report(struct sock *sk)
2686 struct socket_wq *wq;
2688 rcu_read_lock();
2689 wq = rcu_dereference(sk->sk_wq);
2690 if (skwq_has_sleeper(wq))
2691 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2692 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2693 rcu_read_unlock();
2696 static void sock_def_readable(struct sock *sk)
2698 struct socket_wq *wq;
2700 rcu_read_lock();
2701 wq = rcu_dereference(sk->sk_wq);
2702 if (skwq_has_sleeper(wq))
2703 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2704 EPOLLRDNORM | EPOLLRDBAND);
2705 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2706 rcu_read_unlock();
2709 static void sock_def_write_space(struct sock *sk)
2711 struct socket_wq *wq;
2713 rcu_read_lock();
2715 /* Do not wake up a writer until he can make "significant"
2716 * progress. --DaveM
2718 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2719 wq = rcu_dereference(sk->sk_wq);
2720 if (skwq_has_sleeper(wq))
2721 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2722 EPOLLWRNORM | EPOLLWRBAND);
2724 /* Should agree with poll, otherwise some programs break */
2725 if (sock_writeable(sk))
2726 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2729 rcu_read_unlock();
2732 static void sock_def_destruct(struct sock *sk)
2736 void sk_send_sigurg(struct sock *sk)
2738 if (sk->sk_socket && sk->sk_socket->file)
2739 if (send_sigurg(&sk->sk_socket->file->f_owner))
2740 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2742 EXPORT_SYMBOL(sk_send_sigurg);
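/*
 * sk_reset_timer/sk_stop_timer - (re)arm or cancel a timer that pins the
 * socket: a reference is taken when the timer is newly armed and dropped
 * when a pending timer is cancelled, keeping the sock alive while the
 * timer is pending.
 */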
2744 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2745 unsigned long expires)
2747 if (!mod_timer(timer, expires))
2748 sock_hold(sk);
2750 EXPORT_SYMBOL(sk_reset_timer);
2752 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2754 if (del_timer(timer))
2755 __sock_put(sk);
2757 EXPORT_SYMBOL(sk_stop_timer);
2759 void sock_init_data(struct socket *sock, struct sock *sk)
2761 sk_init_common(sk);
2762 sk->sk_send_head = NULL;
2764 timer_setup(&sk->sk_timer, NULL, 0);
2766 sk->sk_allocation = GFP_KERNEL;
2767 sk->sk_rcvbuf = sysctl_rmem_default;
2768 sk->sk_sndbuf = sysctl_wmem_default;
2769 sk->sk_state = TCP_CLOSE;
2770 sk_set_socket(sk, sock);
2772 sock_set_flag(sk, SOCK_ZAPPED);
2774 if (sock) {
2775 sk->sk_type = sock->type;
2776 sk->sk_wq = sock->wq;
2777 sock->sk = sk;
2778 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2779 } else {
2780 sk->sk_wq = NULL;
2781 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2784 rwlock_init(&sk->sk_callback_lock);
2785 if (sk->sk_kern_sock)
2786 lockdep_set_class_and_name(
2787 &sk->sk_callback_lock,
2788 af_kern_callback_keys + sk->sk_family,
2789 af_family_kern_clock_key_strings[sk->sk_family]);
2790 else
2791 lockdep_set_class_and_name(
2792 &sk->sk_callback_lock,
2793 af_callback_keys + sk->sk_family,
2794 af_family_clock_key_strings[sk->sk_family]);
2796 sk->sk_state_change = sock_def_wakeup;
2797 sk->sk_data_ready = sock_def_readable;
2798 sk->sk_write_space = sock_def_write_space;
2799 sk->sk_error_report = sock_def_error_report;
2800 sk->sk_destruct = sock_def_destruct;
2802 sk->sk_frag.page = NULL;
2803 sk->sk_frag.offset = 0;
2804 sk->sk_peek_off = -1;
2806 sk->sk_peer_pid = NULL;
2807 sk->sk_peer_cred = NULL;
2808 sk->sk_write_pending = 0;
2809 sk->sk_rcvlowat = 1;
2810 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2811 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2813 sk->sk_stamp = SK_DEFAULT_STAMP;
2814 #if BITS_PER_LONG==32
2815 seqlock_init(&sk->sk_stamp_seq);
2816 #endif
2817 atomic_set(&sk->sk_zckey, 0);
2819 #ifdef CONFIG_NET_RX_BUSY_POLL
2820 sk->sk_napi_id = 0;
2821 sk->sk_ll_usec = sysctl_net_busy_read;
2822 #endif
2824 sk->sk_max_pacing_rate = ~0U;
2825 sk->sk_pacing_rate = ~0U;
2826 sk->sk_pacing_shift = 10;
2827 sk->sk_incoming_cpu = -1;
2829 sk_rx_queue_clear(sk);
2831 * Before updating sk_refcnt, we must commit prior changes to memory
2832 * (Documentation/RCU/rculist_nulls.txt for details)
2834 smp_wmb();
2835 refcount_set(&sk->sk_refcnt, 1);
2836 atomic_set(&sk->sk_drops, 0);
2838 EXPORT_SYMBOL(sock_init_data);
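/*
 * lock_sock_nested - acquire the process-context socket lock. The "owned"
 * flag is what BH/receive paths test via sock_owned_by_user(); the slock
 * spinlock guards that flag and the backlog, and is dropped again before
 * returning, with BH re-enabled.
 */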
2840 void lock_sock_nested(struct sock *sk, int subclass)
2842 might_sleep();
2843 spin_lock_bh(&sk->sk_lock.slock);
2844 if (sk->sk_lock.owned)
2845 __lock_sock(sk);
2846 sk->sk_lock.owned = 1;
2847 spin_unlock(&sk->sk_lock.slock);
2849 * The sk_lock has mutex_lock() semantics here:
2851 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2852 local_bh_enable();
2854 EXPORT_SYMBOL(lock_sock_nested);
2856 void release_sock(struct sock *sk)
2858 spin_lock_bh(&sk->sk_lock.slock);
2859 if (sk->sk_backlog.tail)
2860 __release_sock(sk);
2862 /* Warning : release_cb() might need to release sk ownership,
2863 	 * i.e. call sock_release_ownership(sk) before us.
2865 if (sk->sk_prot->release_cb)
2866 sk->sk_prot->release_cb(sk);
2868 sock_release_ownership(sk);
2869 if (waitqueue_active(&sk->sk_lock.wq))
2870 wake_up(&sk->sk_lock.wq);
2871 spin_unlock_bh(&sk->sk_lock.slock);
2873 EXPORT_SYMBOL(release_sock);
2876 * lock_sock_fast - fast version of lock_sock
2877 * @sk: socket
2879  * This version should be used for very small sections, where the process won't block.
2880  * Returns false if the fast path is taken:
2882  *   sk_lock.slock locked, owned = 0, BH disabled
2884  * Returns true if the slow path is taken:
2886  *   sk_lock.slock unlocked, owned = 1, BH enabled
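 *
 * Typical pairing with unlock_sock_fast() (illustrative sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking work on sk ...
 *	unlock_sock_fast(sk, slow);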
2888 bool lock_sock_fast(struct sock *sk)
2890 might_sleep();
2891 spin_lock_bh(&sk->sk_lock.slock);
2893 if (!sk->sk_lock.owned)
2895 		/* Note : We must disable BH */
2897 return false;
2899 __lock_sock(sk);
2900 sk->sk_lock.owned = 1;
2901 spin_unlock(&sk->sk_lock.slock);
2903 * The sk_lock has mutex_lock() semantics here:
2905 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2906 local_bh_enable();
2907 return true;
2909 EXPORT_SYMBOL(lock_sock_fast);
2911 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2913 struct timeval tv;
2915 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2916 tv = ktime_to_timeval(sock_read_timestamp(sk));
2917 if (tv.tv_sec == -1)
2918 return -ENOENT;
2919 if (tv.tv_sec == 0) {
2920 ktime_t kt = ktime_get_real();
2921 sock_write_timestamp(sk, kt);
2922 tv = ktime_to_timeval(kt);
2924 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2926 EXPORT_SYMBOL(sock_get_timestamp);
2928 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2930 struct timespec ts;
2932 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2933 ts = ktime_to_timespec(sock_read_timestamp(sk));
2934 if (ts.tv_sec == -1)
2935 return -ENOENT;
2936 if (ts.tv_sec == 0) {
2937 ktime_t kt = ktime_get_real();
2938 sock_write_timestamp(sk, kt);
2939 ts = ktime_to_timespec(sk->sk_stamp);
2941 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2943 EXPORT_SYMBOL(sock_get_timestampns);
2945 void sock_enable_timestamp(struct sock *sk, int flag)
2947 if (!sock_flag(sk, flag)) {
2948 unsigned long previous_flags = sk->sk_flags;
2950 sock_set_flag(sk, flag);
2952 * we just set one of the two flags which require net
2953 * time stamping, but time stamping might have been on
2954 * already because of the other one
2956 if (sock_needs_netstamp(sk) &&
2957 !(previous_flags & SK_FLAGS_TIMESTAMP))
2958 net_enable_timestamp();
2962 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2963 int level, int type)
2965 struct sock_exterr_skb *serr;
2966 struct sk_buff *skb;
2967 int copied, err;
2969 err = -EAGAIN;
2970 skb = sock_dequeue_err_skb(sk);
2971 if (skb == NULL)
2972 goto out;
2974 copied = skb->len;
2975 if (copied > len) {
2976 msg->msg_flags |= MSG_TRUNC;
2977 copied = len;
2979 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2980 if (err)
2981 goto out_free_skb;
2983 sock_recv_timestamp(msg, sk, skb);
2985 serr = SKB_EXT_ERR(skb);
2986 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2988 msg->msg_flags |= MSG_ERRQUEUE;
2989 err = copied;
2991 out_free_skb:
2992 kfree_skb(skb);
2993 out:
2994 return err;
2996 EXPORT_SYMBOL(sock_recv_errqueue);
2999  *	Get a socket option on a socket.
3001  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3002  *	asynchronous errors should be reported by getsockopt. We assume
3003  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3005 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3006 char __user *optval, int __user *optlen)
3008 struct sock *sk = sock->sk;
3010 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3012 EXPORT_SYMBOL(sock_common_getsockopt);
3014 #ifdef CONFIG_COMPAT
3015 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3016 char __user *optval, int __user *optlen)
3018 struct sock *sk = sock->sk;
3020 if (sk->sk_prot->compat_getsockopt != NULL)
3021 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3022 optval, optlen);
3023 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3025 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3026 #endif
3028 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3029 int flags)
3031 struct sock *sk = sock->sk;
3032 int addr_len = 0;
3033 int err;
3035 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3036 flags & ~MSG_DONTWAIT, &addr_len);
3037 if (err >= 0)
3038 msg->msg_namelen = addr_len;
3039 return err;
3041 EXPORT_SYMBOL(sock_common_recvmsg);
3044 * Set socket options on an inet socket.
3046 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3047 char __user *optval, unsigned int optlen)
3049 struct sock *sk = sock->sk;
3051 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3053 EXPORT_SYMBOL(sock_common_setsockopt);
3055 #ifdef CONFIG_COMPAT
3056 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3057 char __user *optval, unsigned int optlen)
3059 struct sock *sk = sock->sk;
3061 if (sk->sk_prot->compat_setsockopt != NULL)
3062 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3063 optval, optlen);
3064 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3066 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3067 #endif
3069 void sk_common_release(struct sock *sk)
3071 if (sk->sk_prot->destroy)
3072 sk->sk_prot->destroy(sk);
3075 	 * Observation: when sk_common_release is called, processes have
3076 	 * no access to the socket, but the network stack still does.
3077 	 * Step one: detach it from networking:
3079 	 * A. Remove from hash tables.
3082 sk->sk_prot->unhash(sk);
3085 	 * At this point the socket cannot receive new packets, but it is possible
3086 	 * that some packets are in flight because some CPU is running the receiver
3087 	 * and did a hash table lookup before we unhashed the socket. They will reach
3088 	 * the receive queue and will be purged by the socket destructor.
3090 	 * Also we still have packets pending on the receive queue and probably
3091 	 * our own packets waiting in device queues. sock_destroy will drain the
3092 	 * receive queue, but transmitted packets will delay socket destruction
3093 	 * until the last reference is released.
3096 sock_orphan(sk);
3098 xfrm_sk_free_policy(sk);
3100 sk_refcnt_debug_release(sk);
3102 sock_put(sk);
3104 EXPORT_SYMBOL(sk_common_release);
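/*
 * sk_get_meminfo - fill the SK_MEMINFO_* array with a snapshot of the
 * socket's buffer accounting; consumed by the sock_diag interface.
 */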
3106 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3108 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3110 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3111 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3112 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3113 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3114 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3115 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3116 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3117 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3118 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3121 #ifdef CONFIG_PROC_FS
3122 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3123 struct prot_inuse {
3124 int val[PROTO_INUSE_NR];
3127 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3129 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3131 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3133 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3135 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3137 int cpu, idx = prot->inuse_idx;
3138 int res = 0;
3140 for_each_possible_cpu(cpu)
3141 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3143 return res >= 0 ? res : 0;
3145 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3147 static void sock_inuse_add(struct net *net, int val)
3149 this_cpu_add(*net->core.sock_inuse, val);
3152 int sock_inuse_get(struct net *net)
3154 int cpu, res = 0;
3156 for_each_possible_cpu(cpu)
3157 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3159 return res;
3162 EXPORT_SYMBOL_GPL(sock_inuse_get);
3164 static int __net_init sock_inuse_init_net(struct net *net)
3166 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3167 if (net->core.prot_inuse == NULL)
3168 return -ENOMEM;
3170 net->core.sock_inuse = alloc_percpu(int);
3171 if (net->core.sock_inuse == NULL)
3172 goto out;
3174 return 0;
3176 out:
3177 free_percpu(net->core.prot_inuse);
3178 return -ENOMEM;
3181 static void __net_exit sock_inuse_exit_net(struct net *net)
3183 free_percpu(net->core.prot_inuse);
3184 free_percpu(net->core.sock_inuse);
3187 static struct pernet_operations net_inuse_ops = {
3188 .init = sock_inuse_init_net,
3189 .exit = sock_inuse_exit_net,
3192 static __init int net_inuse_init(void)
3194 if (register_pernet_subsys(&net_inuse_ops))
3195 panic("Cannot initialize net inuse counters");
3197 return 0;
3200 core_initcall(net_inuse_init);
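/*
 * Each registered proto gets a slot in the per-cpu inuse counters; the
 * index comes from the proto_inuse_idx bitmap and is returned on
 * unregister. The last slot is left as a shared overflow index once
 * PROTO_INUSE_NR is exhausted.
 */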
3202 static void assign_proto_idx(struct proto *prot)
3204 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3206 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3207 pr_err("PROTO_INUSE_NR exhausted\n");
3208 return;
3211 set_bit(prot->inuse_idx, proto_inuse_idx);
3214 static void release_proto_idx(struct proto *prot)
3216 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3217 clear_bit(prot->inuse_idx, proto_inuse_idx);
3219 #else
3220 static inline void assign_proto_idx(struct proto *prot)
3224 static inline void release_proto_idx(struct proto *prot)
3228 static void sock_inuse_add(struct net *net, int val)
3231 #endif
3233 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3235 if (!rsk_prot)
3236 return;
3237 kfree(rsk_prot->slab_name);
3238 rsk_prot->slab_name = NULL;
3239 kmem_cache_destroy(rsk_prot->slab);
3240 rsk_prot->slab = NULL;
3243 static int req_prot_init(const struct proto *prot)
3245 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3247 if (!rsk_prot)
3248 return 0;
3250 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3251 prot->name);
3252 if (!rsk_prot->slab_name)
3253 return -ENOMEM;
3255 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3256 rsk_prot->obj_size, 0,
3257 SLAB_ACCOUNT | prot->slab_flags,
3258 NULL);
3260 if (!rsk_prot->slab) {
3261 pr_crit("%s: Can't create request sock SLAB cache!\n",
3262 prot->name);
3263 return -ENOMEM;
3265 return 0;
3268 int proto_register(struct proto *prot, int alloc_slab)
3270 if (alloc_slab) {
3271 prot->slab = kmem_cache_create_usercopy(prot->name,
3272 prot->obj_size, 0,
3273 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3274 prot->slab_flags,
3275 prot->useroffset, prot->usersize,
3276 NULL);
3278 if (prot->slab == NULL) {
3279 pr_crit("%s: Can't create sock SLAB cache!\n",
3280 prot->name);
3281 goto out;
3284 if (req_prot_init(prot))
3285 goto out_free_request_sock_slab;
3287 if (prot->twsk_prot != NULL) {
3288 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3290 if (prot->twsk_prot->twsk_slab_name == NULL)
3291 goto out_free_request_sock_slab;
3293 prot->twsk_prot->twsk_slab =
3294 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3295 prot->twsk_prot->twsk_obj_size,
3297 SLAB_ACCOUNT |
3298 prot->slab_flags,
3299 NULL);
3300 if (prot->twsk_prot->twsk_slab == NULL)
3301 goto out_free_timewait_sock_slab_name;
3305 mutex_lock(&proto_list_mutex);
3306 list_add(&prot->node, &proto_list);
3307 assign_proto_idx(prot);
3308 mutex_unlock(&proto_list_mutex);
3309 return 0;
3311 out_free_timewait_sock_slab_name:
3312 kfree(prot->twsk_prot->twsk_slab_name);
3313 out_free_request_sock_slab:
3314 req_prot_cleanup(prot->rsk_prot);
3316 kmem_cache_destroy(prot->slab);
3317 prot->slab = NULL;
3318 out:
3319 return -ENOBUFS;
3321 EXPORT_SYMBOL(proto_register);
3323 void proto_unregister(struct proto *prot)
3325 mutex_lock(&proto_list_mutex);
3326 release_proto_idx(prot);
3327 list_del(&prot->node);
3328 mutex_unlock(&proto_list_mutex);
3330 kmem_cache_destroy(prot->slab);
3331 prot->slab = NULL;
3333 req_prot_cleanup(prot->rsk_prot);
3335 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3336 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3337 kfree(prot->twsk_prot->twsk_slab_name);
3338 prot->twsk_prot->twsk_slab = NULL;
3341 EXPORT_SYMBOL(proto_unregister);
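/*
 * sock_load_diag_module - request the NETLINK_SOCK_DIAG handler module for
 * @family (and @protocol, when non-zero), matching the module aliases
 * declared by the diag modules themselves.
 */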
3343 int sock_load_diag_module(int family, int protocol)
3345 if (!protocol) {
3346 if (!sock_is_registered(family))
3347 return -ENOENT;
3349 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3350 NETLINK_SOCK_DIAG, family);
3353 #ifdef CONFIG_INET
3354 if (family == AF_INET &&
3355 protocol != IPPROTO_RAW &&
3356 !rcu_access_pointer(inet_protos[protocol]))
3357 return -ENOENT;
3358 #endif
3360 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3361 NETLINK_SOCK_DIAG, family, protocol);
3363 EXPORT_SYMBOL(sock_load_diag_module);
3365 #ifdef CONFIG_PROC_FS
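/*
 * /proc/net/protocols: one line per registered proto showing object size,
 * socket and memory usage, and which methods the protocol implements
 * (the y/n columns emitted by proto_seq_printf() below).
 */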
3366 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3367 __acquires(proto_list_mutex)
3369 mutex_lock(&proto_list_mutex);
3370 return seq_list_start_head(&proto_list, *pos);
3373 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3375 return seq_list_next(v, &proto_list, pos);
3378 static void proto_seq_stop(struct seq_file *seq, void *v)
3379 __releases(proto_list_mutex)
3381 mutex_unlock(&proto_list_mutex);
3384 static char proto_method_implemented(const void *method)
3386 return method == NULL ? 'n' : 'y';
3388 static long sock_prot_memory_allocated(struct proto *proto)
3390 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3393 static char *sock_prot_memory_pressure(struct proto *proto)
3395 return proto->memory_pressure != NULL ?
3396 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3399 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3402 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3403 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3404 proto->name,
3405 proto->obj_size,
3406 sock_prot_inuse_get(seq_file_net(seq), proto),
3407 sock_prot_memory_allocated(proto),
3408 sock_prot_memory_pressure(proto),
3409 proto->max_header,
3410 proto->slab == NULL ? "no" : "yes",
3411 module_name(proto->owner),
3412 proto_method_implemented(proto->close),
3413 proto_method_implemented(proto->connect),
3414 proto_method_implemented(proto->disconnect),
3415 proto_method_implemented(proto->accept),
3416 proto_method_implemented(proto->ioctl),
3417 proto_method_implemented(proto->init),
3418 proto_method_implemented(proto->destroy),
3419 proto_method_implemented(proto->shutdown),
3420 proto_method_implemented(proto->setsockopt),
3421 proto_method_implemented(proto->getsockopt),
3422 proto_method_implemented(proto->sendmsg),
3423 proto_method_implemented(proto->recvmsg),
3424 proto_method_implemented(proto->sendpage),
3425 proto_method_implemented(proto->bind),
3426 proto_method_implemented(proto->backlog_rcv),
3427 proto_method_implemented(proto->hash),
3428 proto_method_implemented(proto->unhash),
3429 proto_method_implemented(proto->get_port),
3430 proto_method_implemented(proto->enter_memory_pressure));
3433 static int proto_seq_show(struct seq_file *seq, void *v)
3435 if (v == &proto_list)
3436 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3437 "protocol",
3438 "size",
3439 "sockets",
3440 "memory",
3441 "press",
3442 "maxhdr",
3443 "slab",
3444 "module",
3445 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3446 else
3447 proto_seq_printf(seq, list_entry(v, struct proto, node));
3448 return 0;
3451 static const struct seq_operations proto_seq_ops = {
3452 .start = proto_seq_start,
3453 .next = proto_seq_next,
3454 .stop = proto_seq_stop,
3455 .show = proto_seq_show,
3458 static __net_init int proto_init_net(struct net *net)
3460 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3461 sizeof(struct seq_net_private)))
3462 return -ENOMEM;
3464 return 0;
3467 static __net_exit void proto_exit_net(struct net *net)
3469 remove_proc_entry("protocols", net->proc_net);
3473 static __net_initdata struct pernet_operations proto_net_ops = {
3474 .init = proto_init_net,
3475 .exit = proto_exit_net,
3478 static int __init proto_init(void)
3480 return register_pernet_subsys(&proto_net_ops);
3483 subsys_initcall(proto_init);
3485 #endif /* PROC_FS */
3487 #ifdef CONFIG_NET_RX_BUSY_POLL
3488 bool sk_busy_loop_end(void *p, unsigned long start_time)
3490 struct sock *sk = p;
3492 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3493 sk_busy_loop_timeout(sk, start_time);
3495 EXPORT_SYMBOL(sk_busy_loop_end);
3496 #endif /* CONFIG_NET_RX_BUSY_POLL */