net/core/sock.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
123 #include <linux/uaccess.h>
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
141 #include <trace/events/sock.h>
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
149 static void sock_inuse_add(struct net *net, int val);
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
157 * Test to see if the opener of the socket had the capability @cap when
158 * the socket was created and if the current process has it in the user
159 * namespace @user_ns.
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
167 EXPORT_SYMBOL(sk_ns_capable);
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
174 * Test to see if the opener of the socket had the capability @cap when
175 * the socket was created and if the current process has it in all user
176 * namespaces.
178 bool sk_capable(const struct sock *sk, int cap)
180 return sk_ns_capable(sk, &init_user_ns, cap);
182 EXPORT_SYMBOL(sk_capable);
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
189 * Test to see if the opener of the socket had the capability @cap when the
190 * socket was created and if the current process has it over the network
191 * namespace the socket is a member of.
193 bool sk_net_capable(const struct sock *sk, int cap)
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197 EXPORT_SYMBOL(sk_net_capable);
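/*
 * Illustrative sketch (not part of this file): a protocol that wants to
 * gate a privileged operation on the opener's credentials would typically
 * use one of the helpers above, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * The hypothetical caller and error path are only for illustration; the
 * exported helpers above are the real interface.
 */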
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MAX"
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 _sock_locks("sk_lock-")
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 _sock_locks("slock-")
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 _sock_locks("clock-")
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 _sock_locks("k-sk_lock-")
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-slock-")
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 _sock_locks("k-clock-")
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 _sock_locks("rlock-")
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 _sock_locks("wlock-")
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 _sock_locks("elock-")
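/*
 * For readability, the build-time expansion above works like this for the
 * first table: _sock_locks("sk_lock-") expands (via string literal
 * concatenation) to
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 *	"sk_lock-AF_XDP", "sk_lock-AF_MAX"
 *
 * so every address family gets a pre-built lock class name and no string
 * construction is needed at socket creation time.
 */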
263 * sk_callback_lock and sk queues locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
284 int sysctl_tstamp_allow_data __read_mostly = 1;
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 * sk_set_memalloc - sets %SOCK_MEMALLOC
291 * @sk: socket to set it on
293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294 * It's the responsibility of the admin to adjust min_free_kbytes
295 * to meet the requirements
297 void sk_set_memalloc(struct sock *sk)
299 sock_set_flag(sk, SOCK_MEMALLOC);
300 sk->sk_allocation |= __GFP_MEMALLOC;
301 static_branch_inc(&memalloc_socks_key);
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305 void sk_clear_memalloc(struct sock *sk)
307 sock_reset_flag(sk, SOCK_MEMALLOC);
308 sk->sk_allocation &= ~__GFP_MEMALLOC;
309 static_branch_dec(&memalloc_socks_key);
312 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 * it has rmem allocations due to the last swapfile being deactivated
315 * but there is a risk that the socket is unusable due to exceeding
316 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 sk_mem_reclaim(sk);
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
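/*
 * Illustrative sketch (assumed caller, not from this file): a transport
 * used for swapping over the network would mark its socket before it can
 * be needed for memory reclaim, and clear the flag when the swap device
 * goes away:
 *
 *	sk_set_memalloc(sock->sk);	// may dip into emergency reserves
 *	...
 *	sk_clear_memalloc(sock->sk);	// back to normal accounting
 *
 * The surrounding transport code is hypothetical; only the two helpers
 * above are real.
 */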
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 int ret;
325 unsigned int noreclaim_flag;
327 /* these should have been dropped before queueing */
328 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330 noreclaim_flag = memalloc_noreclaim_save();
331 ret = sk->sk_backlog_rcv(sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
334 return ret;
336 EXPORT_SYMBOL(__sk_backlog_rcv);
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
340 struct timeval tv;
342 if (optlen < sizeof(tv))
343 return -EINVAL;
344 if (copy_from_user(&tv, optval, sizeof(tv)))
345 return -EFAULT;
346 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 return -EDOM;
349 if (tv.tv_sec < 0) {
350 static int warned __read_mostly;
352 *timeo_p = 0;
353 if (warned < 10 && net_ratelimit()) {
354 warned++;
355 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 __func__, current->comm, task_pid_nr(current));
358 return 0;
360 *timeo_p = MAX_SCHEDULE_TIMEOUT;
361 if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 return 0;
363 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 return 0;
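/*
 * Userspace view of the conversion above (illustrative, not kernel code):
 * the timeout arrives as a struct timeval and ends up in jiffies in
 * sk->sk_rcvtimeo or sk->sk_sndtimeo:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A negative tv_sec is clamped to an immediate timeout (with a rate
 * limited warning), while an all-zero timeval means "block forever"
 * (MAX_SCHEDULE_TIMEOUT).
 */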
368 static void sock_warn_obsolete_bsdism(const char *name)
370 static int warned;
371 static char warncomm[TASK_COMM_LEN];
372 if (strcmp(warncomm, current->comm) && warned < 5) {
373 strcpy(warncomm, current->comm);
374 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 warncomm, name);
376 warned++;
380 static bool sock_needs_netstamp(const struct sock *sk)
382 switch (sk->sk_family) {
383 case AF_UNSPEC:
384 case AF_UNIX:
385 return false;
386 default:
387 return true;
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
393 if (sk->sk_flags & flags) {
394 sk->sk_flags &= ~flags;
395 if (sock_needs_netstamp(sk) &&
396 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 net_disable_timestamp();
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
404 unsigned long flags;
405 struct sk_buff_head *list = &sk->sk_receive_queue;
407 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 atomic_inc(&sk->sk_drops);
409 trace_sock_rcvqueue_full(sk, skb);
410 return -ENOMEM;
413 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 atomic_inc(&sk->sk_drops);
415 return -ENOBUFS;
418 skb->dev = NULL;
419 skb_set_owner_r(skb, sk);
421 /* we escape from the RCU protected region, make sure we don't leak
422 * a non-refcounted dst
424 skb_dst_force(skb);
426 spin_lock_irqsave(&list->lock, flags);
427 sock_skb_set_dropcount(sk, skb);
428 __skb_queue_tail(list, skb);
429 spin_unlock_irqrestore(&list->lock, flags);
431 if (!sock_flag(sk, SOCK_DEAD))
432 sk->sk_data_ready(sk);
433 return 0;
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 int err;
441 err = sk_filter(sk, skb);
442 if (err)
443 return err;
445 return __sock_queue_rcv_skb(sk, skb);
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 const int nested, unsigned int trim_cap, bool refcounted)
452 int rc = NET_RX_SUCCESS;
454 if (sk_filter_trim_cap(sk, skb, trim_cap))
455 goto discard_and_relse;
457 skb->dev = NULL;
459 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 atomic_inc(&sk->sk_drops);
461 goto discard_and_relse;
463 if (nested)
464 bh_lock_sock_nested(sk);
465 else
466 bh_lock_sock(sk);
467 if (!sock_owned_by_user(sk)) {
469 * trylock + unlock semantics:
471 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
473 rc = sk_backlog_rcv(sk, skb);
475 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 bh_unlock_sock(sk);
478 atomic_inc(&sk->sk_drops);
479 goto discard_and_relse;
482 bh_unlock_sock(sk);
483 out:
484 if (refcounted)
485 sock_put(sk);
486 return rc;
487 discard_and_relse:
488 kfree_skb(skb);
489 goto out;
491 EXPORT_SYMBOL(__sk_receive_skb);
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
495 struct dst_entry *dst = __sk_dst_get(sk);
497 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 sk_tx_queue_clear(sk);
499 sk->sk_dst_pending_confirm = 0;
500 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 dst_release(dst);
502 return NULL;
505 return dst;
507 EXPORT_SYMBOL(__sk_dst_check);
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
511 struct dst_entry *dst = sk_dst_get(sk);
513 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 sk_dst_reset(sk);
515 dst_release(dst);
516 return NULL;
519 return dst;
521 EXPORT_SYMBOL(sk_dst_check);
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 int optlen)
526 int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 struct net *net = sock_net(sk);
529 char devname[IFNAMSIZ];
530 int index;
532 /* Sorry... */
533 ret = -EPERM;
534 if (!ns_capable(net->user_ns, CAP_NET_RAW))
535 goto out;
537 ret = -EINVAL;
538 if (optlen < 0)
539 goto out;
541 /* Bind this socket to a particular device like "eth0",
542 * as specified in the passed interface name. If the
543 * name is "" or the option length is zero the socket
544 * is not bound.
546 if (optlen > IFNAMSIZ - 1)
547 optlen = IFNAMSIZ - 1;
548 memset(devname, 0, sizeof(devname));
550 ret = -EFAULT;
551 if (copy_from_user(devname, optval, optlen))
552 goto out;
554 index = 0;
555 if (devname[0] != '\0') {
556 struct net_device *dev;
558 rcu_read_lock();
559 dev = dev_get_by_name_rcu(net, devname);
560 if (dev)
561 index = dev->ifindex;
562 rcu_read_unlock();
563 ret = -ENODEV;
564 if (!dev)
565 goto out;
568 lock_sock(sk);
569 sk->sk_bound_dev_if = index;
570 sk_dst_reset(sk);
571 release_sock(sk);
573 ret = 0;
575 out:
576 #endif
578 return ret;
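/*
 * Userspace view (illustrative): binding a socket to "eth0" and later
 * un-binding it with an empty name, following the rules in the comment
 * above; CAP_NET_RAW in the socket's network namespace is required,
 * otherwise the call fails with EPERM:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */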
581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
582 int __user *optlen, int len)
584 int ret = -ENOPROTOOPT;
585 #ifdef CONFIG_NETDEVICES
586 struct net *net = sock_net(sk);
587 char devname[IFNAMSIZ];
589 if (sk->sk_bound_dev_if == 0) {
590 len = 0;
591 goto zero;
594 ret = -EINVAL;
595 if (len < IFNAMSIZ)
596 goto out;
598 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
599 if (ret)
600 goto out;
602 len = strlen(devname) + 1;
604 ret = -EFAULT;
605 if (copy_to_user(optval, devname, len))
606 goto out;
608 zero:
609 ret = -EFAULT;
610 if (put_user(len, optlen))
611 goto out;
613 ret = 0;
615 out:
616 #endif
618 return ret;
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
623 if (valbool)
624 sock_set_flag(sk, bit);
625 else
626 sock_reset_flag(sk, bit);
629 bool sk_mc_loop(struct sock *sk)
631 if (dev_recursion_level())
632 return false;
633 if (!sk)
634 return true;
635 switch (sk->sk_family) {
636 case AF_INET:
637 return inet_sk(sk)->mc_loop;
638 #if IS_ENABLED(CONFIG_IPV6)
639 case AF_INET6:
640 return inet6_sk(sk)->mc_loop;
641 #endif
643 WARN_ON(1);
644 return true;
646 EXPORT_SYMBOL(sk_mc_loop);
649 * This is meant for all protocols to use and covers goings on
650 * at the socket level. Everything here is generic.
653 int sock_setsockopt(struct socket *sock, int level, int optname,
654 char __user *optval, unsigned int optlen)
656 struct sock_txtime sk_txtime;
657 struct sock *sk = sock->sk;
658 int val;
659 int valbool;
660 struct linger ling;
661 int ret = 0;
664 * Options without arguments
667 if (optname == SO_BINDTODEVICE)
668 return sock_setbindtodevice(sk, optval, optlen);
670 if (optlen < sizeof(int))
671 return -EINVAL;
673 if (get_user(val, (int __user *)optval))
674 return -EFAULT;
676 valbool = val ? 1 : 0;
678 lock_sock(sk);
680 switch (optname) {
681 case SO_DEBUG:
682 if (val && !capable(CAP_NET_ADMIN))
683 ret = -EACCES;
684 else
685 sock_valbool_flag(sk, SOCK_DBG, valbool);
686 break;
687 case SO_REUSEADDR:
688 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
689 break;
690 case SO_REUSEPORT:
691 sk->sk_reuseport = valbool;
692 break;
693 case SO_TYPE:
694 case SO_PROTOCOL:
695 case SO_DOMAIN:
696 case SO_ERROR:
697 ret = -ENOPROTOOPT;
698 break;
699 case SO_DONTROUTE:
700 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
701 sk_dst_reset(sk);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 /* Don't error on this; BSD doesn't, and if you think
708 * about it, this is right. Otherwise apps have to
709 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 * are treated in BSD as hints.
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
725 goto set_sndbuf;
727 case SO_RCVBUF:
728 /* Don't error on this; BSD doesn't, and if you think
729 * about it, this is right. Otherwise apps have to
730 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 * are treated in BSD as hints.
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
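/*
 * Illustrative userspace consequence of the doubling described above
 * (assuming the requested size is within sysctl_rmem_max):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is now 131072: the doubled, overhead-inclusive value
 *	// the kernel actually uses
 */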
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
759 goto set_rcvbuf;
761 case SO_KEEPALIVE:
762 if (sk->sk_prot->keepalive)
763 sk->sk_prot->keepalive(sk, valbool);
764 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
765 break;
767 case SO_OOBINLINE:
768 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
769 break;
771 case SO_NO_CHECK:
772 sk->sk_no_check_tx = valbool;
773 break;
775 case SO_PRIORITY:
776 if ((val >= 0 && val <= 6) ||
777 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
778 sk->sk_priority = val;
779 else
780 ret = -EPERM;
781 break;
783 case SO_LINGER:
784 if (optlen < sizeof(ling)) {
785 ret = -EINVAL; /* 1003.1g */
786 break;
788 if (copy_from_user(&ling, optval, sizeof(ling))) {
789 ret = -EFAULT;
790 break;
792 if (!ling.l_onoff)
793 sock_reset_flag(sk, SOCK_LINGER);
794 else {
795 #if (BITS_PER_LONG == 32)
796 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
797 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
798 else
799 #endif
800 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
801 sock_set_flag(sk, SOCK_LINGER);
803 break;
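/*
 * Userspace sketch of the linger handling above: l_linger is given in
 * seconds and converted to jiffies (capped on 32-bit builds), while
 * l_onoff == 0 simply clears SOCK_LINGER again:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 */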
805 case SO_BSDCOMPAT:
806 sock_warn_obsolete_bsdism("setsockopt");
807 break;
809 case SO_PASSCRED:
810 if (valbool)
811 set_bit(SOCK_PASSCRED, &sock->flags);
812 else
813 clear_bit(SOCK_PASSCRED, &sock->flags);
814 break;
816 case SO_TIMESTAMP:
817 case SO_TIMESTAMPNS:
818 if (valbool) {
819 if (optname == SO_TIMESTAMP)
820 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 else
822 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
823 sock_set_flag(sk, SOCK_RCVTSTAMP);
824 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
825 } else {
826 sock_reset_flag(sk, SOCK_RCVTSTAMP);
827 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
829 break;
831 case SO_TIMESTAMPING:
832 if (val & ~SOF_TIMESTAMPING_MASK) {
833 ret = -EINVAL;
834 break;
837 if (val & SOF_TIMESTAMPING_OPT_ID &&
838 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
839 if (sk->sk_protocol == IPPROTO_TCP &&
840 sk->sk_type == SOCK_STREAM) {
841 if ((1 << sk->sk_state) &
842 (TCPF_CLOSE | TCPF_LISTEN)) {
843 ret = -EINVAL;
844 break;
846 sk->sk_tskey = tcp_sk(sk)->snd_una;
847 } else {
848 sk->sk_tskey = 0;
852 if (val & SOF_TIMESTAMPING_OPT_STATS &&
853 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
854 ret = -EINVAL;
855 break;
858 sk->sk_tsflags = val;
859 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
860 sock_enable_timestamp(sk,
861 SOCK_TIMESTAMPING_RX_SOFTWARE);
862 else
863 sock_disable_timestamp(sk,
864 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
865 break;
867 case SO_RCVLOWAT:
868 if (val < 0)
869 val = INT_MAX;
870 if (sock->ops->set_rcvlowat)
871 ret = sock->ops->set_rcvlowat(sk, val);
872 else
873 sk->sk_rcvlowat = val ? : 1;
874 break;
876 case SO_RCVTIMEO:
877 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
878 break;
880 case SO_SNDTIMEO:
881 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
882 break;
884 case SO_ATTACH_FILTER:
885 ret = -EINVAL;
886 if (optlen == sizeof(struct sock_fprog)) {
887 struct sock_fprog fprog;
889 ret = -EFAULT;
890 if (copy_from_user(&fprog, optval, sizeof(fprog)))
891 break;
893 ret = sk_attach_filter(&fprog, sk);
895 break;
897 case SO_ATTACH_BPF:
898 ret = -EINVAL;
899 if (optlen == sizeof(u32)) {
900 u32 ufd;
902 ret = -EFAULT;
903 if (copy_from_user(&ufd, optval, sizeof(ufd)))
904 break;
906 ret = sk_attach_bpf(ufd, sk);
908 break;
910 case SO_ATTACH_REUSEPORT_CBPF:
911 ret = -EINVAL;
912 if (optlen == sizeof(struct sock_fprog)) {
913 struct sock_fprog fprog;
915 ret = -EFAULT;
916 if (copy_from_user(&fprog, optval, sizeof(fprog)))
917 break;
919 ret = sk_reuseport_attach_filter(&fprog, sk);
921 break;
923 case SO_ATTACH_REUSEPORT_EBPF:
924 ret = -EINVAL;
925 if (optlen == sizeof(u32)) {
926 u32 ufd;
928 ret = -EFAULT;
929 if (copy_from_user(&ufd, optval, sizeof(ufd)))
930 break;
932 ret = sk_reuseport_attach_bpf(ufd, sk);
934 break;
936 case SO_DETACH_FILTER:
937 ret = sk_detach_filter(sk);
938 break;
940 case SO_LOCK_FILTER:
941 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
942 ret = -EPERM;
943 else
944 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
945 break;
947 case SO_PASSSEC:
948 if (valbool)
949 set_bit(SOCK_PASSSEC, &sock->flags);
950 else
951 clear_bit(SOCK_PASSSEC, &sock->flags);
952 break;
953 case SO_MARK:
954 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 ret = -EPERM;
956 else
957 sk->sk_mark = val;
958 break;
960 case SO_RXQ_OVFL:
961 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
962 break;
964 case SO_WIFI_STATUS:
965 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
966 break;
968 case SO_PEEK_OFF:
969 if (sock->ops->set_peek_off)
970 ret = sock->ops->set_peek_off(sk, val);
971 else
972 ret = -EOPNOTSUPP;
973 break;
975 case SO_NOFCS:
976 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
977 break;
979 case SO_SELECT_ERR_QUEUE:
980 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
981 break;
983 #ifdef CONFIG_NET_RX_BUSY_POLL
984 case SO_BUSY_POLL:
985 /* allow unprivileged users to decrease the value */
986 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
987 ret = -EPERM;
988 else {
989 if (val < 0)
990 ret = -EINVAL;
991 else
992 sk->sk_ll_usec = val;
994 break;
995 #endif
997 case SO_MAX_PACING_RATE:
998 if (val != ~0U)
999 cmpxchg(&sk->sk_pacing_status,
1000 SK_PACING_NONE,
1001 SK_PACING_NEEDED);
1002 sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1003 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 sk->sk_max_pacing_rate);
1005 break;
1007 case SO_INCOMING_CPU:
1008 sk->sk_incoming_cpu = val;
1009 break;
1011 case SO_CNX_ADVICE:
1012 if (val == 1)
1013 dst_negative_advice(sk);
1014 break;
1016 case SO_ZEROCOPY:
1017 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018 if (sk->sk_protocol != IPPROTO_TCP)
1019 ret = -ENOTSUPP;
1020 } else if (sk->sk_family != PF_RDS) {
1021 ret = -ENOTSUPP;
1023 if (!ret) {
1024 if (val < 0 || val > 1)
1025 ret = -EINVAL;
1026 else
1027 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1029 break;
1031 case SO_TXTIME:
1032 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033 ret = -EPERM;
1034 } else if (optlen != sizeof(struct sock_txtime)) {
1035 ret = -EINVAL;
1036 } else if (copy_from_user(&sk_txtime, optval,
1037 sizeof(struct sock_txtime))) {
1038 ret = -EFAULT;
1039 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040 ret = -EINVAL;
1041 } else {
1042 sock_valbool_flag(sk, SOCK_TXTIME, true);
1043 sk->sk_clockid = sk_txtime.clockid;
1044 sk->sk_txtime_deadline_mode =
1045 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046 sk->sk_txtime_report_errors =
1047 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1049 break;
1051 default:
1052 ret = -ENOPROTOOPT;
1053 break;
1055 release_sock(sk);
1056 return ret;
1058 EXPORT_SYMBOL(sock_setsockopt);
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062 struct ucred *ucred)
1064 ucred->pid = pid_vnr(pid);
1065 ucred->uid = ucred->gid = -1;
1066 if (cred) {
1067 struct user_namespace *current_ns = current_user_ns();
1069 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1076 struct user_namespace *user_ns = current_user_ns();
1077 int i;
1079 for (i = 0; i < src->ngroups; i++)
1080 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081 return -EFAULT;
1083 return 0;
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087 char __user *optval, int __user *optlen)
1089 struct sock *sk = sock->sk;
1091 union {
1092 int val;
1093 u64 val64;
1094 struct linger ling;
1095 struct timeval tm;
1096 struct sock_txtime txtime;
1097 } v;
1099 int lv = sizeof(int);
1100 int len;
1102 if (get_user(len, optlen))
1103 return -EFAULT;
1104 if (len < 0)
1105 return -EINVAL;
1107 memset(&v, 0, sizeof(v));
1109 switch (optname) {
1110 case SO_DEBUG:
1111 v.val = sock_flag(sk, SOCK_DBG);
1112 break;
1114 case SO_DONTROUTE:
1115 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116 break;
1118 case SO_BROADCAST:
1119 v.val = sock_flag(sk, SOCK_BROADCAST);
1120 break;
1122 case SO_SNDBUF:
1123 v.val = sk->sk_sndbuf;
1124 break;
1126 case SO_RCVBUF:
1127 v.val = sk->sk_rcvbuf;
1128 break;
1130 case SO_REUSEADDR:
1131 v.val = sk->sk_reuse;
1132 break;
1134 case SO_REUSEPORT:
1135 v.val = sk->sk_reuseport;
1136 break;
1138 case SO_KEEPALIVE:
1139 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140 break;
1142 case SO_TYPE:
1143 v.val = sk->sk_type;
1144 break;
1146 case SO_PROTOCOL:
1147 v.val = sk->sk_protocol;
1148 break;
1150 case SO_DOMAIN:
1151 v.val = sk->sk_family;
1152 break;
1154 case SO_ERROR:
1155 v.val = -sock_error(sk);
1156 if (v.val == 0)
1157 v.val = xchg(&sk->sk_err_soft, 0);
1158 break;
1160 case SO_OOBINLINE:
1161 v.val = sock_flag(sk, SOCK_URGINLINE);
1162 break;
1164 case SO_NO_CHECK:
1165 v.val = sk->sk_no_check_tx;
1166 break;
1168 case SO_PRIORITY:
1169 v.val = sk->sk_priority;
1170 break;
1172 case SO_LINGER:
1173 lv = sizeof(v.ling);
1174 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1175 v.ling.l_linger = sk->sk_lingertime / HZ;
1176 break;
1178 case SO_BSDCOMPAT:
1179 sock_warn_obsolete_bsdism("getsockopt");
1180 break;
1182 case SO_TIMESTAMP:
1183 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1185 break;
1187 case SO_TIMESTAMPNS:
1188 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189 break;
1191 case SO_TIMESTAMPING:
1192 v.val = sk->sk_tsflags;
1193 break;
1195 case SO_RCVTIMEO:
1196 lv = sizeof(struct timeval);
1197 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198 v.tm.tv_sec = 0;
1199 v.tm.tv_usec = 0;
1200 } else {
1201 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1204 break;
1206 case SO_SNDTIMEO:
1207 lv = sizeof(struct timeval);
1208 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 v.tm.tv_sec = 0;
1210 v.tm.tv_usec = 0;
1211 } else {
1212 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1215 break;
1217 case SO_RCVLOWAT:
1218 v.val = sk->sk_rcvlowat;
1219 break;
1221 case SO_SNDLOWAT:
1222 v.val = 1;
1223 break;
1225 case SO_PASSCRED:
1226 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227 break;
1229 case SO_PEERCRED:
1231 struct ucred peercred;
1232 if (len > sizeof(peercred))
1233 len = sizeof(peercred);
1234 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235 if (copy_to_user(optval, &peercred, len))
1236 return -EFAULT;
1237 goto lenout;
1240 case SO_PEERGROUPS:
1242 int ret, n;
1244 if (!sk->sk_peer_cred)
1245 return -ENODATA;
1247 n = sk->sk_peer_cred->group_info->ngroups;
1248 if (len < n * sizeof(gid_t)) {
1249 len = n * sizeof(gid_t);
1250 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1252 len = n * sizeof(gid_t);
1254 ret = groups_to_user((gid_t __user *)optval,
1255 sk->sk_peer_cred->group_info);
1256 if (ret)
1257 return ret;
1258 goto lenout;
1261 case SO_PEERNAME:
1263 char address[128];
1265 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266 if (lv < 0)
1267 return -ENOTCONN;
1268 if (lv < len)
1269 return -EINVAL;
1270 if (copy_to_user(optval, address, len))
1271 return -EFAULT;
1272 goto lenout;
1275 /* Dubious BSD thing... Probably nobody even uses it, but
1276 * the UNIX standard wants it for whatever reason... -DaveM
1278 case SO_ACCEPTCONN:
1279 v.val = sk->sk_state == TCP_LISTEN;
1280 break;
1282 case SO_PASSSEC:
1283 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1286 case SO_PEERSEC:
1287 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1289 case SO_MARK:
1290 v.val = sk->sk_mark;
1291 break;
1293 case SO_RXQ_OVFL:
1294 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295 break;
1297 case SO_WIFI_STATUS:
1298 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299 break;
1301 case SO_PEEK_OFF:
1302 if (!sock->ops->set_peek_off)
1303 return -EOPNOTSUPP;
1305 v.val = sk->sk_peek_off;
1306 break;
1307 case SO_NOFCS:
1308 v.val = sock_flag(sk, SOCK_NOFCS);
1309 break;
1311 case SO_BINDTODEVICE:
1312 return sock_getbindtodevice(sk, optval, optlen, len);
1314 case SO_GET_FILTER:
1315 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316 if (len < 0)
1317 return len;
1319 goto lenout;
1321 case SO_LOCK_FILTER:
1322 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323 break;
1325 case SO_BPF_EXTENSIONS:
1326 v.val = bpf_tell_extensions();
1327 break;
1329 case SO_SELECT_ERR_QUEUE:
1330 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331 break;
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334 case SO_BUSY_POLL:
1335 v.val = sk->sk_ll_usec;
1336 break;
1337 #endif
1339 case SO_MAX_PACING_RATE:
1340 /* 32bit version */
1341 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1342 break;
1344 case SO_INCOMING_CPU:
1345 v.val = sk->sk_incoming_cpu;
1346 break;
1348 case SO_MEMINFO:
1350 u32 meminfo[SK_MEMINFO_VARS];
1352 if (get_user(len, optlen))
1353 return -EFAULT;
1355 sk_get_meminfo(sk, meminfo);
1357 len = min_t(unsigned int, len, sizeof(meminfo));
1358 if (copy_to_user(optval, &meminfo, len))
1359 return -EFAULT;
1361 goto lenout;
1364 #ifdef CONFIG_NET_RX_BUSY_POLL
1365 case SO_INCOMING_NAPI_ID:
1366 v.val = READ_ONCE(sk->sk_napi_id);
1368 /* aggregate non-NAPI IDs down to 0 */
1369 if (v.val < MIN_NAPI_ID)
1370 v.val = 0;
1372 break;
1373 #endif
1375 case SO_COOKIE:
1376 lv = sizeof(u64);
1377 if (len < lv)
1378 return -EINVAL;
1379 v.val64 = sock_gen_cookie(sk);
1380 break;
1382 case SO_ZEROCOPY:
1383 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1384 break;
1386 case SO_TXTIME:
1387 lv = sizeof(v.txtime);
1388 v.txtime.clockid = sk->sk_clockid;
1389 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1390 SOF_TXTIME_DEADLINE_MODE : 0;
1391 v.txtime.flags |= sk->sk_txtime_report_errors ?
1392 SOF_TXTIME_REPORT_ERRORS : 0;
1393 break;
1395 default:
1396 /* We implement the SO_SNDLOWAT etc to not be settable
1397 * (1003.1g 7).
1399 return -ENOPROTOOPT;
1402 if (len > lv)
1403 len = lv;
1404 if (copy_to_user(optval, &v, len))
1405 return -EFAULT;
1406 lenout:
1407 if (put_user(len, optlen))
1408 return -EFAULT;
1409 return 0;
1413 * Initialize an sk_lock.
1415 * (We also register the sk_lock with the lock validator.)
1417 static inline void sock_lock_init(struct sock *sk)
1419 if (sk->sk_kern_sock)
1420 sock_lock_init_class_and_name(
1422 af_family_kern_slock_key_strings[sk->sk_family],
1423 af_family_kern_slock_keys + sk->sk_family,
1424 af_family_kern_key_strings[sk->sk_family],
1425 af_family_kern_keys + sk->sk_family);
1426 else
1427 sock_lock_init_class_and_name(
1429 af_family_slock_key_strings[sk->sk_family],
1430 af_family_slock_keys + sk->sk_family,
1431 af_family_key_strings[sk->sk_family],
1432 af_family_keys + sk->sk_family);
1436 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1437 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1438 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1440 static void sock_copy(struct sock *nsk, const struct sock *osk)
1442 #ifdef CONFIG_SECURITY_NETWORK
1443 void *sptr = nsk->sk_security;
1444 #endif
1445 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1447 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1448 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1450 #ifdef CONFIG_SECURITY_NETWORK
1451 nsk->sk_security = sptr;
1452 security_sk_clone(osk, nsk);
1453 #endif
1456 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1457 int family)
1459 struct sock *sk;
1460 struct kmem_cache *slab;
1462 slab = prot->slab;
1463 if (slab != NULL) {
1464 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1465 if (!sk)
1466 return sk;
1467 if (priority & __GFP_ZERO)
1468 sk_prot_clear_nulls(sk, prot->obj_size);
1469 } else
1470 sk = kmalloc(prot->obj_size, priority);
1472 if (sk != NULL) {
1473 if (security_sk_alloc(sk, family, priority))
1474 goto out_free;
1476 if (!try_module_get(prot->owner))
1477 goto out_free_sec;
1478 sk_tx_queue_clear(sk);
1481 return sk;
1483 out_free_sec:
1484 security_sk_free(sk);
1485 out_free:
1486 if (slab != NULL)
1487 kmem_cache_free(slab, sk);
1488 else
1489 kfree(sk);
1490 return NULL;
1493 static void sk_prot_free(struct proto *prot, struct sock *sk)
1495 struct kmem_cache *slab;
1496 struct module *owner;
1498 owner = prot->owner;
1499 slab = prot->slab;
1501 cgroup_sk_free(&sk->sk_cgrp_data);
1502 mem_cgroup_sk_free(sk);
1503 security_sk_free(sk);
1504 if (slab != NULL)
1505 kmem_cache_free(slab, sk);
1506 else
1507 kfree(sk);
1508 module_put(owner);
1512 * sk_alloc - All socket objects are allocated here
1513 * @net: the applicable net namespace
1514 * @family: protocol family
1515 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1516 * @prot: struct proto associated with this new sock instance
1517 * @kern: is this to be a kernel socket?
1519 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1520 struct proto *prot, int kern)
1522 struct sock *sk;
1524 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1525 if (sk) {
1526 sk->sk_family = family;
1528 * See comment in struct sock definition to understand
1529 * why we need sk_prot_creator -acme
1531 sk->sk_prot = sk->sk_prot_creator = prot;
1532 sk->sk_kern_sock = kern;
1533 sock_lock_init(sk);
1534 sk->sk_net_refcnt = kern ? 0 : 1;
1535 if (likely(sk->sk_net_refcnt)) {
1536 get_net(net);
1537 sock_inuse_add(net, 1);
1540 sock_net_set(sk, net);
1541 refcount_set(&sk->sk_wmem_alloc, 1);
1543 mem_cgroup_sk_alloc(sk);
1544 cgroup_sk_alloc(&sk->sk_cgrp_data);
1545 sock_update_classid(&sk->sk_cgrp_data);
1546 sock_update_netprioidx(&sk->sk_cgrp_data);
1549 return sk;
1551 EXPORT_SYMBOL(sk_alloc);
1553 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1554 * grace period. This is the case for UDP sockets and TCP listeners.
1556 static void __sk_destruct(struct rcu_head *head)
1558 struct sock *sk = container_of(head, struct sock, sk_rcu);
1559 struct sk_filter *filter;
1561 if (sk->sk_destruct)
1562 sk->sk_destruct(sk);
1564 filter = rcu_dereference_check(sk->sk_filter,
1565 refcount_read(&sk->sk_wmem_alloc) == 0);
1566 if (filter) {
1567 sk_filter_uncharge(sk, filter);
1568 RCU_INIT_POINTER(sk->sk_filter, NULL);
1570 if (rcu_access_pointer(sk->sk_reuseport_cb))
1571 reuseport_detach_sock(sk);
1573 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1575 if (atomic_read(&sk->sk_omem_alloc))
1576 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1577 __func__, atomic_read(&sk->sk_omem_alloc));
1579 if (sk->sk_frag.page) {
1580 put_page(sk->sk_frag.page);
1581 sk->sk_frag.page = NULL;
1584 if (sk->sk_peer_cred)
1585 put_cred(sk->sk_peer_cred);
1586 put_pid(sk->sk_peer_pid);
1587 if (likely(sk->sk_net_refcnt))
1588 put_net(sock_net(sk));
1589 sk_prot_free(sk->sk_prot_creator, sk);
1592 void sk_destruct(struct sock *sk)
1594 if (sock_flag(sk, SOCK_RCU_FREE))
1595 call_rcu(&sk->sk_rcu, __sk_destruct);
1596 else
1597 __sk_destruct(&sk->sk_rcu);
1600 static void __sk_free(struct sock *sk)
1602 if (likely(sk->sk_net_refcnt))
1603 sock_inuse_add(sock_net(sk), -1);
1605 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1606 sock_diag_broadcast_destroy(sk);
1607 else
1608 sk_destruct(sk);
1611 void sk_free(struct sock *sk)
1614 * We subtract one from sk_wmem_alloc so we can tell whether
1615 * some packets are still in some tx queue.
1616 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1618 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1619 __sk_free(sk);
1621 EXPORT_SYMBOL(sk_free);
1623 static void sk_init_common(struct sock *sk)
1625 skb_queue_head_init(&sk->sk_receive_queue);
1626 skb_queue_head_init(&sk->sk_write_queue);
1627 skb_queue_head_init(&sk->sk_error_queue);
1629 rwlock_init(&sk->sk_callback_lock);
1630 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1631 af_rlock_keys + sk->sk_family,
1632 af_family_rlock_key_strings[sk->sk_family]);
1633 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1634 af_wlock_keys + sk->sk_family,
1635 af_family_wlock_key_strings[sk->sk_family]);
1636 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1637 af_elock_keys + sk->sk_family,
1638 af_family_elock_key_strings[sk->sk_family]);
1639 lockdep_set_class_and_name(&sk->sk_callback_lock,
1640 af_callback_keys + sk->sk_family,
1641 af_family_clock_key_strings[sk->sk_family]);
1645 * sk_clone_lock - clone a socket, and lock its clone
1646 * @sk: the socket to clone
1647 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1649 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1651 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1653 struct sock *newsk;
1654 bool is_charged = true;
1656 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1657 if (newsk != NULL) {
1658 struct sk_filter *filter;
1660 sock_copy(newsk, sk);
1662 newsk->sk_prot_creator = sk->sk_prot;
1664 /* SANITY */
1665 if (likely(newsk->sk_net_refcnt))
1666 get_net(sock_net(newsk));
1667 sk_node_init(&newsk->sk_node);
1668 sock_lock_init(newsk);
1669 bh_lock_sock(newsk);
1670 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1671 newsk->sk_backlog.len = 0;
1673 atomic_set(&newsk->sk_rmem_alloc, 0);
1675 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1677 refcount_set(&newsk->sk_wmem_alloc, 1);
1678 atomic_set(&newsk->sk_omem_alloc, 0);
1679 sk_init_common(newsk);
1681 newsk->sk_dst_cache = NULL;
1682 newsk->sk_dst_pending_confirm = 0;
1683 newsk->sk_wmem_queued = 0;
1684 newsk->sk_forward_alloc = 0;
1685 atomic_set(&newsk->sk_drops, 0);
1686 newsk->sk_send_head = NULL;
1687 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1688 atomic_set(&newsk->sk_zckey, 0);
1690 sock_reset_flag(newsk, SOCK_DONE);
1691 mem_cgroup_sk_alloc(newsk);
1692 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1694 rcu_read_lock();
1695 filter = rcu_dereference(sk->sk_filter);
1696 if (filter != NULL)
1697 /* though it's an empty new sock, the charging may fail
1698 * if sysctl_optmem_max was changed between the creation of
1699 * the original socket and the cloning
1701 is_charged = sk_filter_charge(newsk, filter);
1702 RCU_INIT_POINTER(newsk->sk_filter, filter);
1703 rcu_read_unlock();
1705 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1706 /* We need to make sure that we don't uncharge the new
1707 * socket if we couldn't charge it in the first place
1708 * as otherwise we uncharge the parent's filter.
1710 if (!is_charged)
1711 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1712 sk_free_unlock_clone(newsk);
1713 newsk = NULL;
1714 goto out;
1716 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1718 newsk->sk_err = 0;
1719 newsk->sk_err_soft = 0;
1720 newsk->sk_priority = 0;
1721 newsk->sk_incoming_cpu = raw_smp_processor_id();
1722 atomic64_set(&newsk->sk_cookie, 0);
1723 if (likely(newsk->sk_net_refcnt))
1724 sock_inuse_add(sock_net(newsk), 1);
1727 * Before updating sk_refcnt, we must commit prior changes to memory
1728 * (Documentation/RCU/rculist_nulls.txt for details)
1730 smp_wmb();
1731 refcount_set(&newsk->sk_refcnt, 2);
1734 * Increment the counter in the same struct proto as the master
1735 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1736 * is the same as sk->sk_prot->socks, as this field was copied
1737 * with memcpy).
1739 * This _changes_ the previous behaviour, where
1740 * tcp_create_openreq_child always incremented the
1741 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1742 * to be taken into account in all callers. -acme
1744 sk_refcnt_debug_inc(newsk);
1745 sk_set_socket(newsk, NULL);
1746 newsk->sk_wq = NULL;
1748 if (newsk->sk_prot->sockets_allocated)
1749 sk_sockets_allocated_inc(newsk);
1751 if (sock_needs_netstamp(sk) &&
1752 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1753 net_enable_timestamp();
1755 out:
1756 return newsk;
1758 EXPORT_SYMBOL_GPL(sk_clone_lock);
1760 void sk_free_unlock_clone(struct sock *sk)
1762 /* It is still a raw copy of the parent, so invalidate
1763 * its destructor and do a plain sk_free() */
1764 sk->sk_destruct = NULL;
1765 bh_unlock_sock(sk);
1766 sk_free(sk);
1768 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1770 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1772 u32 max_segs = 1;
1774 sk_dst_set(sk, dst);
1775 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1776 if (sk->sk_route_caps & NETIF_F_GSO)
1777 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1778 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1779 if (sk_can_gso(sk)) {
1780 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1781 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1782 } else {
1783 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1784 sk->sk_gso_max_size = dst->dev->gso_max_size;
1785 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1788 sk->sk_gso_max_segs = max_segs;
1790 EXPORT_SYMBOL_GPL(sk_setup_caps);
1793 * Simple resource managers for sockets.
1798 * Write buffer destructor automatically called from kfree_skb.
1800 void sock_wfree(struct sk_buff *skb)
1802 struct sock *sk = skb->sk;
1803 unsigned int len = skb->truesize;
1805 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1807 * Keep a reference on sk_wmem_alloc; it will be released
1808 * after the sk_write_space() call.
1810 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1811 sk->sk_write_space(sk);
1812 len = 1;
1815 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1816 * could not do because of in-flight packets
1818 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1819 __sk_free(sk);
1821 EXPORT_SYMBOL(sock_wfree);
1823 /* This variant of sock_wfree() is used by TCP,
1824 * since it sets SOCK_USE_WRITE_QUEUE.
1826 void __sock_wfree(struct sk_buff *skb)
1828 struct sock *sk = skb->sk;
1830 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1831 __sk_free(sk);
1834 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1836 skb_orphan(skb);
1837 skb->sk = sk;
1838 #ifdef CONFIG_INET
1839 if (unlikely(!sk_fullsock(sk))) {
1840 skb->destructor = sock_edemux;
1841 sock_hold(sk);
1842 return;
1844 #endif
1845 skb->destructor = sock_wfree;
1846 skb_set_hash_from_sk(skb, sk);
1848 * We used to take a refcount on sk, but the following operation
1849 * is enough to guarantee sk_free() won't free this sock until
1850 * all in-flight packets are completed.
1852 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1854 EXPORT_SYMBOL(skb_set_owner_w);
1856 /* This helper is used by netem, as it can hold packets in its
1857 * delay queue. We want to allow the owner socket to send more
1858 * packets, as if they were already TX completed by a typical driver.
1859 * But we also want to keep skb->sk set because some packet schedulers
1860 * rely on it (sch_fq for example).
1862 void skb_orphan_partial(struct sk_buff *skb)
1864 if (skb_is_tcp_pure_ack(skb))
1865 return;
1867 if (skb->destructor == sock_wfree
1868 #ifdef CONFIG_INET
1869 || skb->destructor == tcp_wfree
1870 #endif
1872 struct sock *sk = skb->sk;
1874 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1875 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1876 skb->destructor = sock_efree;
1878 } else {
1879 skb_orphan(skb);
1882 EXPORT_SYMBOL(skb_orphan_partial);
1885 * Read buffer destructor automatically called from kfree_skb.
1887 void sock_rfree(struct sk_buff *skb)
1889 struct sock *sk = skb->sk;
1890 unsigned int len = skb->truesize;
1892 atomic_sub(len, &sk->sk_rmem_alloc);
1893 sk_mem_uncharge(sk, len);
1895 EXPORT_SYMBOL(sock_rfree);
1898 * Buffer destructor for skbs that are not used directly in read or write
1899 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1901 void sock_efree(struct sk_buff *skb)
1903 sock_put(skb->sk);
1905 EXPORT_SYMBOL(sock_efree);
1907 kuid_t sock_i_uid(struct sock *sk)
1909 kuid_t uid;
1911 read_lock_bh(&sk->sk_callback_lock);
1912 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1913 read_unlock_bh(&sk->sk_callback_lock);
1914 return uid;
1916 EXPORT_SYMBOL(sock_i_uid);
1918 unsigned long sock_i_ino(struct sock *sk)
1920 unsigned long ino;
1922 read_lock_bh(&sk->sk_callback_lock);
1923 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1924 read_unlock_bh(&sk->sk_callback_lock);
1925 return ino;
1927 EXPORT_SYMBOL(sock_i_ino);
1930 * Allocate a skb from the socket's send buffer.
1932 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1933 gfp_t priority)
1935 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1936 struct sk_buff *skb = alloc_skb(size, priority);
1937 if (skb) {
1938 skb_set_owner_w(skb, sk);
1939 return skb;
1942 return NULL;
1944 EXPORT_SYMBOL(sock_wmalloc);
1946 static void sock_ofree(struct sk_buff *skb)
1948 struct sock *sk = skb->sk;
1950 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1953 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1954 gfp_t priority)
1956 struct sk_buff *skb;
1958 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1959 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1960 sysctl_optmem_max)
1961 return NULL;
1963 skb = alloc_skb(size, priority);
1964 if (!skb)
1965 return NULL;
1967 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1968 skb->sk = sk;
1969 skb->destructor = sock_ofree;
1970 return skb;
1974 * Allocate a memory block from the socket's option memory buffer.
1976 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1978 if ((unsigned int)size <= sysctl_optmem_max &&
1979 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1980 void *mem;
1981 /* First do the add, to avoid the race if kmalloc
1982 * might sleep.
1984 atomic_add(size, &sk->sk_omem_alloc);
1985 mem = kmalloc(size, priority);
1986 if (mem)
1987 return mem;
1988 atomic_sub(size, &sk->sk_omem_alloc);
1990 return NULL;
1992 EXPORT_SYMBOL(sock_kmalloc);
1994 /* Free an option memory block. Note, we actually want the inline
1995 * here as this allows gcc to detect the nullify and fold away the
1996 * condition entirely.
1998 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1999 const bool nullify)
2001 if (WARN_ON_ONCE(!mem))
2002 return;
2003 if (nullify)
2004 kzfree(mem);
2005 else
2006 kfree(mem);
2007 atomic_sub(size, &sk->sk_omem_alloc);
2010 void sock_kfree_s(struct sock *sk, void *mem, int size)
2012 __sock_kfree_s(sk, mem, size, false);
2014 EXPORT_SYMBOL(sock_kfree_s);
2016 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2018 __sock_kfree_s(sk, mem, size, true);
2020 EXPORT_SYMBOL(sock_kzfree_s);
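/*
 * Illustrative pairing (assumed caller, not from this file): memory
 * obtained with sock_kmalloc() is charged to sk_omem_alloc and must be
 * released with sock_kfree_s() (or sock_kzfree_s() for sensitive data)
 * using the same size:
 *
 *	struct foo *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * "struct foo" is a placeholder for whatever per-socket option state a
 * protocol keeps.
 */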
2022 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2023 I think these locks should be removed for datagram sockets.
2025 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2027 DEFINE_WAIT(wait);
2029 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2030 for (;;) {
2031 if (!timeo)
2032 break;
2033 if (signal_pending(current))
2034 break;
2035 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2036 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2037 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2038 break;
2039 if (sk->sk_shutdown & SEND_SHUTDOWN)
2040 break;
2041 if (sk->sk_err)
2042 break;
2043 timeo = schedule_timeout(timeo);
2045 finish_wait(sk_sleep(sk), &wait);
2046 return timeo;
2051 * Generic send/receive buffer handlers
2054 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2055 unsigned long data_len, int noblock,
2056 int *errcode, int max_page_order)
2058 struct sk_buff *skb;
2059 long timeo;
2060 int err;
2062 timeo = sock_sndtimeo(sk, noblock);
2063 for (;;) {
2064 err = sock_error(sk);
2065 if (err != 0)
2066 goto failure;
2068 err = -EPIPE;
2069 if (sk->sk_shutdown & SEND_SHUTDOWN)
2070 goto failure;
2072 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2073 break;
2075 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2076 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2077 err = -EAGAIN;
2078 if (!timeo)
2079 goto failure;
2080 if (signal_pending(current))
2081 goto interrupted;
2082 timeo = sock_wait_for_wmem(sk, timeo);
2084 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2085 errcode, sk->sk_allocation);
2086 if (skb)
2087 skb_set_owner_w(skb, sk);
2088 return skb;
2090 interrupted:
2091 err = sock_intr_errno(timeo);
2092 failure:
2093 *errcode = err;
2094 return NULL;
2096 EXPORT_SYMBOL(sock_alloc_send_pskb);
2098 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2099 int noblock, int *errcode)
2101 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2103 EXPORT_SYMBOL(sock_alloc_send_skb);
2105 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2106 struct sockcm_cookie *sockc)
2108 u32 tsflags;
2110 switch (cmsg->cmsg_type) {
2111 case SO_MARK:
2112 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2113 return -EPERM;
2114 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2115 return -EINVAL;
2116 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2117 break;
2118 case SO_TIMESTAMPING:
2119 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2120 return -EINVAL;
2122 tsflags = *(u32 *)CMSG_DATA(cmsg);
2123 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2124 return -EINVAL;
2126 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2127 sockc->tsflags |= tsflags;
2128 break;
2129 case SCM_TXTIME:
2130 if (!sock_flag(sk, SOCK_TXTIME))
2131 return -EINVAL;
2132 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2133 return -EINVAL;
2134 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2135 break;
2136 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2137 case SCM_RIGHTS:
2138 case SCM_CREDENTIALS:
2139 break;
2140 default:
2141 return -EINVAL;
2143 return 0;
2145 EXPORT_SYMBOL(__sock_cmsg_send);
2147 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2148 struct sockcm_cookie *sockc)
2150 struct cmsghdr *cmsg;
2151 int ret;
2153 for_each_cmsghdr(cmsg, msg) {
2154 if (!CMSG_OK(msg, cmsg))
2155 return -EINVAL;
2156 if (cmsg->cmsg_level != SOL_SOCKET)
2157 continue;
2158 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2159 if (ret)
2160 return ret;
2162 return 0;
2164 EXPORT_SYMBOL(sock_cmsg_send);
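/*
 * Userspace view of the SOL_SOCKET control messages handled above
 * (illustrative sketch): a sender with CAP_NET_ADMIN can set the packet
 * mark for a single sendmsg() call, alongside the usual msg_iov payload:
 *
 *	char buf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = { .msg_control = buf,
 *			      .msg_controllen = sizeof(buf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cmsg) = 42;
 *
 * Without CAP_NET_ADMIN in the socket's user namespace the cmsg is
 * rejected with EPERM, exactly as in __sock_cmsg_send() above.
 */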
2166 static void sk_enter_memory_pressure(struct sock *sk)
2168 if (!sk->sk_prot->enter_memory_pressure)
2169 return;
2171 sk->sk_prot->enter_memory_pressure(sk);
2174 static void sk_leave_memory_pressure(struct sock *sk)
2176 if (sk->sk_prot->leave_memory_pressure) {
2177 sk->sk_prot->leave_memory_pressure(sk);
2178 } else {
2179 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2181 if (memory_pressure && *memory_pressure)
2182 *memory_pressure = 0;
2186 /* On 32bit arches, an skb frag is limited to 2^15 */
2187 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2190 * skb_page_frag_refill - check that a page_frag contains enough room
2191 * @sz: minimum size of the fragment we want to get
2192 * @pfrag: pointer to page_frag
2193 * @gfp: priority for memory allocation
2195 * Note: While this allocator tries to use high order pages, there is
2196 * no guarantee that allocations succeed. Therefore, @sz MUST be
2197 * less than or equal to PAGE_SIZE.
2199 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2201 if (pfrag->page) {
2202 if (page_ref_count(pfrag->page) == 1) {
2203 pfrag->offset = 0;
2204 return true;
2206 if (pfrag->offset + sz <= pfrag->size)
2207 return true;
2208 put_page(pfrag->page);
2211 pfrag->offset = 0;
2212 if (SKB_FRAG_PAGE_ORDER) {
2213 /* Avoid direct reclaim but allow kswapd to wake */
2214 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2215 __GFP_COMP | __GFP_NOWARN |
2216 __GFP_NORETRY,
2217 SKB_FRAG_PAGE_ORDER);
2218 if (likely(pfrag->page)) {
2219 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2220 return true;
2223 pfrag->page = alloc_page(gfp);
2224 if (likely(pfrag->page)) {
2225 pfrag->size = PAGE_SIZE;
2226 return true;
2228 return false;
2230 EXPORT_SYMBOL(skb_page_frag_refill);
2232 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2234 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2235 return true;
2237 sk_enter_memory_pressure(sk);
2238 sk_stream_moderate_sndbuf(sk);
2239 return false;
2241 EXPORT_SYMBOL(sk_page_frag_refill);
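/*
 * Illustrative sketch, not part of the original sock.c: typical use of the
 * per-socket page_frag allocator when appending user data to an skb's paged
 * area, loosely modelled on what stream protocols do in their sendmsg paths.
 * example_append_page_frag() is invented; the helpers it calls are real.
 */
static int example_append_page_frag(struct sock *sk, struct sk_buff *skb,
				    struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* caller would wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	get_page(pfrag->page);
	pfrag->offset += copy;
	return copy;
}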
2243 static void __lock_sock(struct sock *sk)
2244 __releases(&sk->sk_lock.slock)
2245 __acquires(&sk->sk_lock.slock)
2247 DEFINE_WAIT(wait);
2249 for (;;) {
2250 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2251 TASK_UNINTERRUPTIBLE);
2252 spin_unlock_bh(&sk->sk_lock.slock);
2253 schedule();
2254 spin_lock_bh(&sk->sk_lock.slock);
2255 if (!sock_owned_by_user(sk))
2256 break;
2258 finish_wait(&sk->sk_lock.wq, &wait);
2261 void __release_sock(struct sock *sk)
2262 __releases(&sk->sk_lock.slock)
2263 __acquires(&sk->sk_lock.slock)
2265 struct sk_buff *skb, *next;
2267 while ((skb = sk->sk_backlog.head) != NULL) {
2268 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2270 spin_unlock_bh(&sk->sk_lock.slock);
2272 do {
2273 next = skb->next;
2274 prefetch(next);
2275 WARN_ON_ONCE(skb_dst_is_noref(skb));
2276 skb_mark_not_on_list(skb);
2277 sk_backlog_rcv(sk, skb);
2279 cond_resched();
2281 skb = next;
2282 } while (skb != NULL);
2284 spin_lock_bh(&sk->sk_lock.slock);
2288  * Doing the zeroing here guarantees we cannot loop forever
2289 * while a wild producer attempts to flood us.
2291 sk->sk_backlog.len = 0;
2294 void __sk_flush_backlog(struct sock *sk)
2296 spin_lock_bh(&sk->sk_lock.slock);
2297 __release_sock(sk);
2298 spin_unlock_bh(&sk->sk_lock.slock);
2302 * sk_wait_data - wait for data to arrive at sk_receive_queue
2303 * @sk: sock to wait on
2304 * @timeo: for how long
2305 * @skb: last skb seen on sk_receive_queue
2307  * Now socket state including sk->sk_err is changed only under the lock,
2308  * hence we may omit checks after joining the wait queue.
2309  * We check the receive queue before schedule() only as an optimization;
2310  * it is very likely that release_sock() added new data.
2312 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2314 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2315 int rc;
2317 add_wait_queue(sk_sleep(sk), &wait);
2318 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2319 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2320 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2321 remove_wait_queue(sk_sleep(sk), &wait);
2322 return rc;
2324 EXPORT_SYMBOL(sk_wait_data);
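/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * recvmsg loop using sk_wait_data() to sleep until the receive queue
 * changes or the timeout expires. The socket lock is assumed to be held,
 * as the comment above requires. example_wait_for_skb() is invented.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}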
2327 * __sk_mem_raise_allocated - increase memory_allocated
2328 * @sk: socket
2329 * @size: memory size to allocate
2330 * @amt: pages to allocate
2331 * @kind: allocation type
2333 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2335 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2337 struct proto *prot = sk->sk_prot;
2338 long allocated = sk_memory_allocated_add(sk, amt);
2339 bool charged = true;
2341 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2342 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2343 goto suppress_allocation;
2345 /* Under limit. */
2346 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2347 sk_leave_memory_pressure(sk);
2348 return 1;
2351 /* Under pressure. */
2352 if (allocated > sk_prot_mem_limits(sk, 1))
2353 sk_enter_memory_pressure(sk);
2355 /* Over hard limit. */
2356 if (allocated > sk_prot_mem_limits(sk, 2))
2357 goto suppress_allocation;
2359 /* guarantee minimum buffer size under pressure */
2360 if (kind == SK_MEM_RECV) {
2361 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2362 return 1;
2364 } else { /* SK_MEM_SEND */
2365 int wmem0 = sk_get_wmem0(sk, prot);
2367 if (sk->sk_type == SOCK_STREAM) {
2368 if (sk->sk_wmem_queued < wmem0)
2369 return 1;
2370 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2371 return 1;
2375 if (sk_has_memory_pressure(sk)) {
2376 int alloc;
2378 if (!sk_under_memory_pressure(sk))
2379 return 1;
2380 alloc = sk_sockets_allocated_read_positive(sk);
2381 if (sk_prot_mem_limits(sk, 2) > alloc *
2382 sk_mem_pages(sk->sk_wmem_queued +
2383 atomic_read(&sk->sk_rmem_alloc) +
2384 sk->sk_forward_alloc))
2385 return 1;
2388 suppress_allocation:
2390 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2391 sk_stream_moderate_sndbuf(sk);
2393 /* Fail only if socket is _under_ its sndbuf.
2394  * In this case we cannot block, so we have to fail.
2396 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2397 return 1;
2400 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2401 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2403 sk_memory_allocated_sub(sk, amt);
2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2406 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2408 return 0;
2410 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2413 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2414 * @sk: socket
2415 * @size: memory size to allocate
2416 * @kind: allocation type
2418 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2419 * rmem allocation. This function assumes that protocols which have
2420  * memory_pressure use sk_wmem_queued for write buffer accounting.
2422 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2424 int ret, amt = sk_mem_pages(size);
2426 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2427 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2428 if (!ret)
2429 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2430 return ret;
2432 EXPORT_SYMBOL(__sk_mem_schedule);
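/*
 * Illustrative sketch, not part of the original sock.c: protocols normally
 * reach __sk_mem_schedule() through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers, which only call it when sk_forward_alloc cannot already cover
 * the request. example_charge_send() is an invented transmit-side charge.
 */
static bool example_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;		/* over the protocol's memory limits */

	sk_mem_charge(sk, skb->truesize);	/* consume forward allocation */
	sk->sk_wmem_queued += skb->truesize;
	return true;
}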
2435 * __sk_mem_reduce_allocated - reclaim memory_allocated
2436 * @sk: socket
2437 * @amount: number of quanta
2439 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2441 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2443 sk_memory_allocated_sub(sk, amount);
2445 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2446 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2448 if (sk_under_memory_pressure(sk) &&
2449 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2450 sk_leave_memory_pressure(sk);
2452 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2455 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2456 * @sk: socket
2457 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2459 void __sk_mem_reclaim(struct sock *sk, int amount)
2461 amount >>= SK_MEM_QUANTUM_SHIFT;
2462 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2463 __sk_mem_reduce_allocated(sk, amount);
2465 EXPORT_SYMBOL(__sk_mem_reclaim);
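/*
 * Illustrative sketch, not part of the original sock.c: the release side of
 * the accounting above. sk_mem_uncharge() grows sk_forward_alloc again and
 * sk_mem_reclaim() hands whole quanta back via __sk_mem_reclaim() once
 * enough has accumulated. example_uncharge_recv() is invented; real code
 * usually does the uncharge from the skb destructor (sock_rfree).
 */
static void example_uncharge_recv(struct sock *sk, struct sk_buff *skb)
{
	sk_mem_uncharge(sk, skb->truesize);	/* give back forward alloc */
	__kfree_skb(skb);
	sk_mem_reclaim(sk);			/* return surplus quanta */
}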
2467 int sk_set_peek_off(struct sock *sk, int val)
2469 sk->sk_peek_off = val;
2470 return 0;
2472 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2475 * Set of default routines for initialising struct proto_ops when
2476 * the protocol does not support a particular function. In certain
2477 * cases where it makes no sense for a protocol to have a "do nothing"
2478 * function, some default processing is provided.
2481 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2483 return -EOPNOTSUPP;
2485 EXPORT_SYMBOL(sock_no_bind);
2487 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2488 int len, int flags)
2490 return -EOPNOTSUPP;
2492 EXPORT_SYMBOL(sock_no_connect);
2494 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2496 return -EOPNOTSUPP;
2498 EXPORT_SYMBOL(sock_no_socketpair);
2500 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2501 bool kern)
2503 return -EOPNOTSUPP;
2505 EXPORT_SYMBOL(sock_no_accept);
2507 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2508 int peer)
2510 return -EOPNOTSUPP;
2512 EXPORT_SYMBOL(sock_no_getname);
2514 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2516 return -EOPNOTSUPP;
2518 EXPORT_SYMBOL(sock_no_ioctl);
2520 int sock_no_listen(struct socket *sock, int backlog)
2522 return -EOPNOTSUPP;
2524 EXPORT_SYMBOL(sock_no_listen);
2526 int sock_no_shutdown(struct socket *sock, int how)
2528 return -EOPNOTSUPP;
2530 EXPORT_SYMBOL(sock_no_shutdown);
2532 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2533 char __user *optval, unsigned int optlen)
2535 return -EOPNOTSUPP;
2537 EXPORT_SYMBOL(sock_no_setsockopt);
2539 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2540 char __user *optval, int __user *optlen)
2542 return -EOPNOTSUPP;
2544 EXPORT_SYMBOL(sock_no_getsockopt);
2546 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2548 return -EOPNOTSUPP;
2550 EXPORT_SYMBOL(sock_no_sendmsg);
2552 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2554 return -EOPNOTSUPP;
2556 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2558 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2559 int flags)
2561 return -EOPNOTSUPP;
2563 EXPORT_SYMBOL(sock_no_recvmsg);
2565 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2567 /* Mirror missing mmap method error code */
2568 return -ENODEV;
2570 EXPORT_SYMBOL(sock_no_mmap);
2572 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2574 ssize_t res;
2575 struct msghdr msg = {.msg_flags = flags};
2576 struct kvec iov;
2577 char *kaddr = kmap(page);
2578 iov.iov_base = kaddr + offset;
2579 iov.iov_len = size;
2580 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2581 kunmap(page);
2582 return res;
2584 EXPORT_SYMBOL(sock_no_sendpage);
2586 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2587 int offset, size_t size, int flags)
2589 ssize_t res;
2590 struct msghdr msg = {.msg_flags = flags};
2591 struct kvec iov;
2592 char *kaddr = kmap(page);
2594 iov.iov_base = kaddr + offset;
2595 iov.iov_len = size;
2596 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2597 kunmap(page);
2598 return res;
2600 EXPORT_SYMBOL(sock_no_sendpage_locked);
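/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * protocol wiring the sock_no_*() stubs into its proto_ops for every
 * operation it does not implement, so userspace gets a consistent
 * -EOPNOTSUPP. example_proto_ops and the PF_UNSPEC family are placeholders;
 * .release, .poll and the data-path hooks would normally be real functions.
 */
static const struct proto_ops example_proto_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};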
2603 * Default Socket Callbacks
2606 static void sock_def_wakeup(struct sock *sk)
2608 struct socket_wq *wq;
2610 rcu_read_lock();
2611 wq = rcu_dereference(sk->sk_wq);
2612 if (skwq_has_sleeper(wq))
2613 wake_up_interruptible_all(&wq->wait);
2614 rcu_read_unlock();
2617 static void sock_def_error_report(struct sock *sk)
2619 struct socket_wq *wq;
2621 rcu_read_lock();
2622 wq = rcu_dereference(sk->sk_wq);
2623 if (skwq_has_sleeper(wq))
2624 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2625 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2626 rcu_read_unlock();
2629 static void sock_def_readable(struct sock *sk)
2631 struct socket_wq *wq;
2633 rcu_read_lock();
2634 wq = rcu_dereference(sk->sk_wq);
2635 if (skwq_has_sleeper(wq))
2636 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2637 EPOLLRDNORM | EPOLLRDBAND);
2638 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2639 rcu_read_unlock();
2642 static void sock_def_write_space(struct sock *sk)
2644 struct socket_wq *wq;
2646 rcu_read_lock();
2648 /* Do not wake up a writer until he can make "significant"
2649 * progress. --DaveM
2651 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2652 wq = rcu_dereference(sk->sk_wq);
2653 if (skwq_has_sleeper(wq))
2654 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2655 EPOLLWRNORM | EPOLLWRBAND);
2657 /* Should agree with poll, otherwise some programs break */
2658 if (sock_writeable(sk))
2659 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2662 rcu_read_unlock();
2665 static void sock_def_destruct(struct sock *sk)
2669 void sk_send_sigurg(struct sock *sk)
2671 if (sk->sk_socket && sk->sk_socket->file)
2672 if (send_sigurg(&sk->sk_socket->file->f_owner))
2673 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2675 EXPORT_SYMBOL(sk_send_sigurg);
2677 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2678 unsigned long expires)
2680 if (!mod_timer(timer, expires))
2681 sock_hold(sk);
2683 EXPORT_SYMBOL(sk_reset_timer);
2685 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2687 if (del_timer(timer))
2688 __sock_put(sk);
2690 EXPORT_SYMBOL(sk_stop_timer);
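/*
 * Illustrative sketch, not part of the original sock.c: sk_reset_timer()
 * takes a socket reference when it arms an inactive timer and
 * sk_stop_timer() drops it when a pending timer is deleted, so the sock
 * cannot be freed while a timer is outstanding. The example_* helpers are
 * invented and assume sk->sk_timer was set up with timer_setup().
 */
static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_cancel_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}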
2692 void sock_init_data(struct socket *sock, struct sock *sk)
2694 sk_init_common(sk);
2695 sk->sk_send_head = NULL;
2697 timer_setup(&sk->sk_timer, NULL, 0);
2699 sk->sk_allocation = GFP_KERNEL;
2700 sk->sk_rcvbuf = sysctl_rmem_default;
2701 sk->sk_sndbuf = sysctl_wmem_default;
2702 sk->sk_state = TCP_CLOSE;
2703 sk_set_socket(sk, sock);
2705 sock_set_flag(sk, SOCK_ZAPPED);
2707 if (sock) {
2708 sk->sk_type = sock->type;
2709 sk->sk_wq = sock->wq;
2710 sock->sk = sk;
2711 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2712 } else {
2713 sk->sk_wq = NULL;
2714 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2717 rwlock_init(&sk->sk_callback_lock);
2718 if (sk->sk_kern_sock)
2719 lockdep_set_class_and_name(
2720 &sk->sk_callback_lock,
2721 af_kern_callback_keys + sk->sk_family,
2722 af_family_kern_clock_key_strings[sk->sk_family]);
2723 else
2724 lockdep_set_class_and_name(
2725 &sk->sk_callback_lock,
2726 af_callback_keys + sk->sk_family,
2727 af_family_clock_key_strings[sk->sk_family]);
2729 sk->sk_state_change = sock_def_wakeup;
2730 sk->sk_data_ready = sock_def_readable;
2731 sk->sk_write_space = sock_def_write_space;
2732 sk->sk_error_report = sock_def_error_report;
2733 sk->sk_destruct = sock_def_destruct;
2735 sk->sk_frag.page = NULL;
2736 sk->sk_frag.offset = 0;
2737 sk->sk_peek_off = -1;
2739 sk->sk_peer_pid = NULL;
2740 sk->sk_peer_cred = NULL;
2741 sk->sk_write_pending = 0;
2742 sk->sk_rcvlowat = 1;
2743 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2744 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2746 sk->sk_stamp = SK_DEFAULT_STAMP;
2747 #if BITS_PER_LONG==32
2748 seqlock_init(&sk->sk_stamp_seq);
2749 #endif
2750 atomic_set(&sk->sk_zckey, 0);
2752 #ifdef CONFIG_NET_RX_BUSY_POLL
2753 sk->sk_napi_id = 0;
2754 sk->sk_ll_usec = sysctl_net_busy_read;
2755 #endif
2757 sk->sk_max_pacing_rate = ~0UL;
2758 sk->sk_pacing_rate = ~0UL;
2759 sk->sk_pacing_shift = 10;
2760 sk->sk_incoming_cpu = -1;
2762 sk_rx_queue_clear(sk);
2764 * Before updating sk_refcnt, we must commit prior changes to memory
2765 * (Documentation/RCU/rculist_nulls.txt for details)
2767 smp_wmb();
2768 refcount_set(&sk->sk_refcnt, 1);
2769 atomic_set(&sk->sk_drops, 0);
2771 EXPORT_SYMBOL(sock_init_data);
2773 void lock_sock_nested(struct sock *sk, int subclass)
2775 might_sleep();
2776 spin_lock_bh(&sk->sk_lock.slock);
2777 if (sk->sk_lock.owned)
2778 __lock_sock(sk);
2779 sk->sk_lock.owned = 1;
2780 spin_unlock(&sk->sk_lock.slock);
2782 * The sk_lock has mutex_lock() semantics here:
2784 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2785 local_bh_enable();
2787 EXPORT_SYMBOL(lock_sock_nested);
2789 void release_sock(struct sock *sk)
2791 spin_lock_bh(&sk->sk_lock.slock);
2792 if (sk->sk_backlog.tail)
2793 __release_sock(sk);
2795 	/* Warning: release_cb() might need to release sk ownership,
2796 	 * i.e. call sock_release_ownership(sk) before us.
2798 if (sk->sk_prot->release_cb)
2799 sk->sk_prot->release_cb(sk);
2801 sock_release_ownership(sk);
2802 if (waitqueue_active(&sk->sk_lock.wq))
2803 wake_up(&sk->sk_lock.wq);
2804 spin_unlock_bh(&sk->sk_lock.slock);
2806 EXPORT_SYMBOL(release_sock);
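/*
 * Illustrative sketch, not part of the original sock.c: the canonical
 * process-context pattern around the socket lock. Packets arriving in
 * softirq context while the lock is owned are queued on the backlog and
 * replayed by __release_sock() when release_sock() runs.
 * example_locked_update() and the field it touches are arbitrary.
 */
static void example_locked_update(struct sock *sk, int val)
{
	lock_sock(sk);			/* may sleep, takes ownership */
	sk->sk_rcvlowat = val ? : 1;	/* any state change done under the lock */
	release_sock(sk);		/* processes any queued backlog */
}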
2809 * lock_sock_fast - fast version of lock_sock
2810 * @sk: socket
2812  * This version should be used for very small sections, where the process won't block.
2813  * Returns false if the fast path is taken:
2815  *   sk_lock.slock locked, owned = 0, BH disabled
2817  * Returns true if the slow path is taken:
2819  *   sk_lock.slock unlocked, owned = 1, BH enabled
2821 bool lock_sock_fast(struct sock *sk)
2823 might_sleep();
2824 spin_lock_bh(&sk->sk_lock.slock);
2826 if (!sk->sk_lock.owned)
2828 * Note : We must disable BH
2830 return false;
2832 __lock_sock(sk);
2833 sk->sk_lock.owned = 1;
2834 spin_unlock(&sk->sk_lock.slock);
2836 * The sk_lock has mutex_lock() semantics here:
2838 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2839 local_bh_enable();
2840 return true;
2842 EXPORT_SYMBOL(lock_sock_fast);
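/*
 * Illustrative sketch, not part of the original sock.c: lock_sock_fast() is
 * meant for short, non-sleeping critical sections. The boolean it returns
 * must be passed to unlock_sock_fast() so the matching unlock path (spin
 * unlock vs. full release_sock()) is taken. example_read_drops() is invented.
 */
static int example_read_drops(struct sock *sk)
{
	bool slow;
	int drops;

	slow = lock_sock_fast(sk);
	drops = atomic_read(&sk->sk_drops);	/* small, non-sleeping work */
	unlock_sock_fast(sk, slow);
	return drops;
}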
2844 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2846 struct timeval tv;
2848 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2849 tv = ktime_to_timeval(sock_read_timestamp(sk));
2850 if (tv.tv_sec == -1)
2851 return -ENOENT;
2852 if (tv.tv_sec == 0) {
2853 ktime_t kt = ktime_get_real();
2854 sock_write_timestamp(sk, kt);
2855 tv = ktime_to_timeval(kt);
2857 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2859 EXPORT_SYMBOL(sock_get_timestamp);
2861 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2863 struct timespec ts;
2865 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2866 ts = ktime_to_timespec(sock_read_timestamp(sk));
2867 if (ts.tv_sec == -1)
2868 return -ENOENT;
2869 if (ts.tv_sec == 0) {
2870 ktime_t kt = ktime_get_real();
2871 sock_write_timestamp(sk, kt);
2872 ts = ktime_to_timespec(sk->sk_stamp);
2874 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2876 EXPORT_SYMBOL(sock_get_timestampns);
2878 void sock_enable_timestamp(struct sock *sk, int flag)
2880 if (!sock_flag(sk, flag)) {
2881 unsigned long previous_flags = sk->sk_flags;
2883 sock_set_flag(sk, flag);
2885 * we just set one of the two flags which require net
2886 * time stamping, but time stamping might have been on
2887 * already because of the other one
2889 if (sock_needs_netstamp(sk) &&
2890 !(previous_flags & SK_FLAGS_TIMESTAMP))
2891 net_enable_timestamp();
2895 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2896 int level, int type)
2898 struct sock_exterr_skb *serr;
2899 struct sk_buff *skb;
2900 int copied, err;
2902 err = -EAGAIN;
2903 skb = sock_dequeue_err_skb(sk);
2904 if (skb == NULL)
2905 goto out;
2907 copied = skb->len;
2908 if (copied > len) {
2909 msg->msg_flags |= MSG_TRUNC;
2910 copied = len;
2912 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2913 if (err)
2914 goto out_free_skb;
2916 sock_recv_timestamp(msg, sk, skb);
2918 serr = SKB_EXT_ERR(skb);
2919 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2921 msg->msg_flags |= MSG_ERRQUEUE;
2922 err = copied;
2924 out_free_skb:
2925 kfree_skb(skb);
2926 out:
2927 return err;
2929 EXPORT_SYMBOL(sock_recv_errqueue);
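/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * recvmsg handler servicing MSG_ERRQUEUE by delegating to
 * sock_recv_errqueue(). The SOL_SOCKET/SO_TIMESTAMPING cmsg level/type here
 * is only an example; real protocols pass their own values (e.g. packet
 * sockets use SOL_PACKET). example_recvmsg_err() is invented.
 */
static int example_recvmsg_err(struct sock *sk, struct msghdr *msg,
			       size_t len, int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_SOCKET, SO_TIMESTAMPING);
	/* the normal receive path would continue here */
	return -EAGAIN;
}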
2932  * Get a socket option on a socket.
2934  * FIX: POSIX 1003.1g is very ambiguous here. It states that
2935  * asynchronous errors should be reported by getsockopt. We assume
2936  * this means if you specify SO_ERROR (otherwise what's the point of it).
2938 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2939 char __user *optval, int __user *optlen)
2941 struct sock *sk = sock->sk;
2943 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2945 EXPORT_SYMBOL(sock_common_getsockopt);
2947 #ifdef CONFIG_COMPAT
2948 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2949 char __user *optval, int __user *optlen)
2951 struct sock *sk = sock->sk;
2953 if (sk->sk_prot->compat_getsockopt != NULL)
2954 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2955 optval, optlen);
2956 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2958 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2959 #endif
2961 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2962 int flags)
2964 struct sock *sk = sock->sk;
2965 int addr_len = 0;
2966 int err;
2968 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2969 flags & ~MSG_DONTWAIT, &addr_len);
2970 if (err >= 0)
2971 msg->msg_namelen = addr_len;
2972 return err;
2974 EXPORT_SYMBOL(sock_common_recvmsg);
2977 * Set socket options on an inet socket.
2979 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2980 char __user *optval, unsigned int optlen)
2982 struct sock *sk = sock->sk;
2984 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2986 EXPORT_SYMBOL(sock_common_setsockopt);
2988 #ifdef CONFIG_COMPAT
2989 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2990 char __user *optval, unsigned int optlen)
2992 struct sock *sk = sock->sk;
2994 if (sk->sk_prot->compat_setsockopt != NULL)
2995 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2996 optval, optlen);
2997 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2999 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3000 #endif
3002 void sk_common_release(struct sock *sk)
3004 if (sk->sk_prot->destroy)
3005 sk->sk_prot->destroy(sk);
3008 	 * Observation: when sk_common_release() is called, processes have
3009 	 * no access to the socket, but the network stack still does.
3010 * Step one, detach it from networking:
3012 * A. Remove from hash tables.
3015 sk->sk_prot->unhash(sk);
3018 	 * At this point the socket cannot receive new packets, but it is possible
3019 	 * that some packets are in flight because some CPU runs the receiver and
3020 	 * did a hash table lookup before we unhashed the socket. They will reach
3021 	 * the receive queue and will be purged by the socket destructor.
3023 	 * Also we still have packets pending on the receive queue and, probably,
3024 	 * our own packets waiting in device queues. sock_destroy will drain the
3025 	 * receive queue, but transmitted packets will delay socket destruction
3026 	 * until the last reference is released.
3029 sock_orphan(sk);
3031 xfrm_sk_free_policy(sk);
3033 sk_refcnt_debug_release(sk);
3035 sock_put(sk);
3037 EXPORT_SYMBOL(sk_common_release);
3039 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3041 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3043 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3044 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3045 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3046 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3047 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3048 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3049 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3050 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3051 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3054 #ifdef CONFIG_PROC_FS
3055 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3056 struct prot_inuse {
3057 int val[PROTO_INUSE_NR];
3060 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3062 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3064 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3066 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3068 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3070 int cpu, idx = prot->inuse_idx;
3071 int res = 0;
3073 for_each_possible_cpu(cpu)
3074 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3076 return res >= 0 ? res : 0;
3078 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
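/*
 * Illustrative sketch, not part of the original sock.c: protocols typically
 * bump the per-cpu "in use" counter from their hash()/unhash() callbacks so
 * that /proc/net/protocols reports live socket counts. The example_* hooks
 * are invented; only sock_prot_inuse_add() is the real API.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}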
3080 static void sock_inuse_add(struct net *net, int val)
3082 this_cpu_add(*net->core.sock_inuse, val);
3085 int sock_inuse_get(struct net *net)
3087 int cpu, res = 0;
3089 for_each_possible_cpu(cpu)
3090 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3092 return res;
3095 EXPORT_SYMBOL_GPL(sock_inuse_get);
3097 static int __net_init sock_inuse_init_net(struct net *net)
3099 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3100 if (net->core.prot_inuse == NULL)
3101 return -ENOMEM;
3103 net->core.sock_inuse = alloc_percpu(int);
3104 if (net->core.sock_inuse == NULL)
3105 goto out;
3107 return 0;
3109 out:
3110 free_percpu(net->core.prot_inuse);
3111 return -ENOMEM;
3114 static void __net_exit sock_inuse_exit_net(struct net *net)
3116 free_percpu(net->core.prot_inuse);
3117 free_percpu(net->core.sock_inuse);
3120 static struct pernet_operations net_inuse_ops = {
3121 .init = sock_inuse_init_net,
3122 .exit = sock_inuse_exit_net,
3125 static __init int net_inuse_init(void)
3127 if (register_pernet_subsys(&net_inuse_ops))
3128 panic("Cannot initialize net inuse counters");
3130 return 0;
3133 core_initcall(net_inuse_init);
3135 static void assign_proto_idx(struct proto *prot)
3137 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3139 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3140 pr_err("PROTO_INUSE_NR exhausted\n");
3141 return;
3144 set_bit(prot->inuse_idx, proto_inuse_idx);
3147 static void release_proto_idx(struct proto *prot)
3149 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3150 clear_bit(prot->inuse_idx, proto_inuse_idx);
3152 #else
3153 static inline void assign_proto_idx(struct proto *prot)
3157 static inline void release_proto_idx(struct proto *prot)
3161 static void sock_inuse_add(struct net *net, int val)
3164 #endif
3166 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3168 if (!rsk_prot)
3169 return;
3170 kfree(rsk_prot->slab_name);
3171 rsk_prot->slab_name = NULL;
3172 kmem_cache_destroy(rsk_prot->slab);
3173 rsk_prot->slab = NULL;
3176 static int req_prot_init(const struct proto *prot)
3178 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3180 if (!rsk_prot)
3181 return 0;
3183 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3184 prot->name);
3185 if (!rsk_prot->slab_name)
3186 return -ENOMEM;
3188 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3189 rsk_prot->obj_size, 0,
3190 SLAB_ACCOUNT | prot->slab_flags,
3191 NULL);
3193 if (!rsk_prot->slab) {
3194 pr_crit("%s: Can't create request sock SLAB cache!\n",
3195 prot->name);
3196 return -ENOMEM;
3198 return 0;
3201 int proto_register(struct proto *prot, int alloc_slab)
3203 if (alloc_slab) {
3204 prot->slab = kmem_cache_create_usercopy(prot->name,
3205 prot->obj_size, 0,
3206 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3207 prot->slab_flags,
3208 prot->useroffset, prot->usersize,
3209 NULL);
3211 if (prot->slab == NULL) {
3212 pr_crit("%s: Can't create sock SLAB cache!\n",
3213 prot->name);
3214 goto out;
3217 if (req_prot_init(prot))
3218 goto out_free_request_sock_slab;
3220 if (prot->twsk_prot != NULL) {
3221 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3223 if (prot->twsk_prot->twsk_slab_name == NULL)
3224 goto out_free_request_sock_slab;
3226 prot->twsk_prot->twsk_slab =
3227 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3228 prot->twsk_prot->twsk_obj_size,
3230 SLAB_ACCOUNT |
3231 prot->slab_flags,
3232 NULL);
3233 if (prot->twsk_prot->twsk_slab == NULL)
3234 goto out_free_timewait_sock_slab_name;
3238 mutex_lock(&proto_list_mutex);
3239 list_add(&prot->node, &proto_list);
3240 assign_proto_idx(prot);
3241 mutex_unlock(&proto_list_mutex);
3242 return 0;
3244 out_free_timewait_sock_slab_name:
3245 kfree(prot->twsk_prot->twsk_slab_name);
3246 out_free_request_sock_slab:
3247 req_prot_cleanup(prot->rsk_prot);
3249 kmem_cache_destroy(prot->slab);
3250 prot->slab = NULL;
3251 out:
3252 return -ENOBUFS;
3254 EXPORT_SYMBOL(proto_register);
3256 void proto_unregister(struct proto *prot)
3258 mutex_lock(&proto_list_mutex);
3259 release_proto_idx(prot);
3260 list_del(&prot->node);
3261 mutex_unlock(&proto_list_mutex);
3263 kmem_cache_destroy(prot->slab);
3264 prot->slab = NULL;
3266 req_prot_cleanup(prot->rsk_prot);
3268 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3269 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3270 kfree(prot->twsk_prot->twsk_slab_name);
3271 prot->twsk_prot->twsk_slab = NULL;
3274 EXPORT_SYMBOL(proto_unregister);
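/*
 * Illustrative sketch, not part of the original sock.c: how a hypothetical
 * protocol module would register and unregister its struct proto so sock
 * allocation gets a dedicated, correctly sized slab cache. "EXAMPLE" and
 * the example_* symbols are placeholders; obj_size would normally be the
 * size of the protocol's own sock-derived structure.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => allocate a slab */
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}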
3276 int sock_load_diag_module(int family, int protocol)
3278 if (!protocol) {
3279 if (!sock_is_registered(family))
3280 return -ENOENT;
3282 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3283 NETLINK_SOCK_DIAG, family);
3286 #ifdef CONFIG_INET
3287 if (family == AF_INET &&
3288 protocol != IPPROTO_RAW &&
3289 !rcu_access_pointer(inet_protos[protocol]))
3290 return -ENOENT;
3291 #endif
3293 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3294 NETLINK_SOCK_DIAG, family, protocol);
3296 EXPORT_SYMBOL(sock_load_diag_module);
3298 #ifdef CONFIG_PROC_FS
3299 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3300 __acquires(proto_list_mutex)
3302 mutex_lock(&proto_list_mutex);
3303 return seq_list_start_head(&proto_list, *pos);
3306 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3308 return seq_list_next(v, &proto_list, pos);
3311 static void proto_seq_stop(struct seq_file *seq, void *v)
3312 __releases(proto_list_mutex)
3314 mutex_unlock(&proto_list_mutex);
3317 static char proto_method_implemented(const void *method)
3319 return method == NULL ? 'n' : 'y';
3321 static long sock_prot_memory_allocated(struct proto *proto)
3323 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3326 static char *sock_prot_memory_pressure(struct proto *proto)
3328 return proto->memory_pressure != NULL ?
3329 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3332 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3335 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3336 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3337 proto->name,
3338 proto->obj_size,
3339 sock_prot_inuse_get(seq_file_net(seq), proto),
3340 sock_prot_memory_allocated(proto),
3341 sock_prot_memory_pressure(proto),
3342 proto->max_header,
3343 proto->slab == NULL ? "no" : "yes",
3344 module_name(proto->owner),
3345 proto_method_implemented(proto->close),
3346 proto_method_implemented(proto->connect),
3347 proto_method_implemented(proto->disconnect),
3348 proto_method_implemented(proto->accept),
3349 proto_method_implemented(proto->ioctl),
3350 proto_method_implemented(proto->init),
3351 proto_method_implemented(proto->destroy),
3352 proto_method_implemented(proto->shutdown),
3353 proto_method_implemented(proto->setsockopt),
3354 proto_method_implemented(proto->getsockopt),
3355 proto_method_implemented(proto->sendmsg),
3356 proto_method_implemented(proto->recvmsg),
3357 proto_method_implemented(proto->sendpage),
3358 proto_method_implemented(proto->bind),
3359 proto_method_implemented(proto->backlog_rcv),
3360 proto_method_implemented(proto->hash),
3361 proto_method_implemented(proto->unhash),
3362 proto_method_implemented(proto->get_port),
3363 proto_method_implemented(proto->enter_memory_pressure));
3366 static int proto_seq_show(struct seq_file *seq, void *v)
3368 if (v == &proto_list)
3369 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3370 "protocol",
3371 "size",
3372 "sockets",
3373 "memory",
3374 "press",
3375 "maxhdr",
3376 "slab",
3377 "module",
3378 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3379 else
3380 proto_seq_printf(seq, list_entry(v, struct proto, node));
3381 return 0;
3384 static const struct seq_operations proto_seq_ops = {
3385 .start = proto_seq_start,
3386 .next = proto_seq_next,
3387 .stop = proto_seq_stop,
3388 .show = proto_seq_show,
3391 static __net_init int proto_init_net(struct net *net)
3393 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3394 sizeof(struct seq_net_private)))
3395 return -ENOMEM;
3397 return 0;
3400 static __net_exit void proto_exit_net(struct net *net)
3402 remove_proc_entry("protocols", net->proc_net);
3406 static __net_initdata struct pernet_operations proto_net_ops = {
3407 .init = proto_init_net,
3408 .exit = proto_exit_net,
3411 static int __init proto_init(void)
3413 return register_pernet_subsys(&proto_net_ops);
3416 subsys_initcall(proto_init);
3418 #endif /* PROC_FS */
3420 #ifdef CONFIG_NET_RX_BUSY_POLL
3421 bool sk_busy_loop_end(void *p, unsigned long start_time)
3423 struct sock *sk = p;
3425 return !skb_queue_empty(&sk->sk_receive_queue) ||
3426 sk_busy_loop_timeout(sk, start_time);
3428 EXPORT_SYMBOL(sk_busy_loop_end);
3429 #endif /* CONFIG_NET_RX_BUSY_POLL */