1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
121 #include <asm/uaccess.h>
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
139 #include <trace/events/sock.h>
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
153 * Test to see if the opener of the socket had the capability @cap when
154 * the socket was created and if the current process has @cap in the user
155 * namespace @user_ns.
157 bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
163 EXPORT_SYMBOL(sk_ns_capable);
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
170 * Test to see if the opener of the socket had the capability @cap when
171 * the socket was created and if the current process has @cap in all user
172 * namespaces.
174 bool sk_capable(const struct sock *sk, int cap)
176 return sk_ns_capable(sk, &init_user_ns, cap);
178 EXPORT_SYMBOL(sk_capable);
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and if the current process has @cap over the network
187 * namespace the socket is a member of.
189 bool sk_net_capable(const struct sock *sk, int cap)
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 EXPORT_SYMBOL(sk_net_capable);
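/* Illustrative sketch (hypothetical caller, not part of this file): a
 * protocol-level option handler would typically gate a privileged setting on
 * one of the helpers above rather than on a bare capable() check, so that
 * both the opener's capability at socket creation time and the current
 * process's capability in the right namespace are honoured:
 *
 *	static int hypothetical_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */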
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family:
199 static struct lock_class_key af_family_keys[AF_MAX];
200 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 * Make lock validator output more readable. (we pre-construct these
204 * strings build-time, so that runtime initialization of socket
205 * locks is fast):
207 static const char *const af_family_key_strings[AF_MAX+1] = {
208 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
209 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
210 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
211 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
212 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
213 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
214 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
215 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
216 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
217 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
218 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
219 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
220 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
221 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
222 "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
225 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
226 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
227 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
228 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
229 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
230 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
231 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
232 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
233 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
234 "slock-27" , "slock-28" , "slock-AF_CAN" ,
235 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
236 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
237 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
238 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
239 "slock-AF_QIPCRTR", "slock-AF_MAX"
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
243 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
244 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
245 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
246 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
247 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
248 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
249 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
250 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
251 "clock-27" , "clock-28" , "clock-AF_CAN" ,
252 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
253 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
254 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
255 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
256 "clock-AF_QIPCRTR", "clock-AF_MAX"
260 * sk_callback_lock locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
263 static struct lock_class_key af_callback_keys[AF_MAX];
265 /* Take into consideration the size of the struct sk_buff overhead in the
266 * determination of these values, since that is non-constant across
267 * platforms. This makes socket queueing behavior and performance
268 * not depend upon such differences.
270 #define _SK_MEM_PACKETS 256
271 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
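/* Worked example (approximate, typical 64-bit build): SKB_TRUESIZE(256)
 * charges the 256 payload bytes plus struct sk_buff and struct
 * skb_shared_info overhead, roughly 832 bytes in total, so SK_WMEM_MAX and
 * SK_RMEM_MAX land near 832 * 256 = 212992 bytes, the familiar default for
 * net.core.wmem_max and net.core.rmem_max. Exact values depend on structure
 * sizes for the architecture and configuration.
 */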
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
287 int sysctl_tstamp_allow_data __read_mostly = 1;
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298 * to meet the requirements
300 void sk_set_memalloc(struct sock *sk)
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
304 static_key_slow_inc(&memalloc_socks);
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 void sk_clear_memalloc(struct sock *sk)
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
312 static_key_slow_dec(&memalloc_socks);
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 sk_mem_reclaim(sk);
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
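/* Illustrative sketch (hypothetical swap-over-network backend, not part of
 * this file): the two helpers above are paired around the lifetime of a
 * socket that must keep making progress under memory pressure:
 *
 *	static void hypothetical_swap_activate(struct sock *sk)
 *	{
 *		sk_set_memalloc(sk);	now allowed to dip into PF_MEMALLOC reserves
 *	}
 *
 *	static void hypothetical_swap_deactivate(struct sock *sk)
 *	{
 *		sk_clear_memalloc(sk);	also reclaims rmem via sk_mem_reclaim()
 *	}
 */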
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327 int ret;
328 unsigned long pflags = current->flags;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 current->flags |= PF_MEMALLOC;
334 ret = sk->sk_backlog_rcv(sk, skb);
335 tsk_restore_flags(current, pflags, PF_MEMALLOC);
337 return ret;
339 EXPORT_SYMBOL(__sk_backlog_rcv);
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
343 struct timeval tv;
345 if (optlen < sizeof(tv))
346 return -EINVAL;
347 if (copy_from_user(&tv, optval, sizeof(tv)))
348 return -EFAULT;
349 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 return -EDOM;
352 if (tv.tv_sec < 0) {
353 static int warned __read_mostly;
355 *timeo_p = 0;
356 if (warned < 10 && net_ratelimit()) {
357 warned++;
358 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 __func__, current->comm, task_pid_nr(current));
361 return 0;
363 *timeo_p = MAX_SCHEDULE_TIMEOUT;
364 if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 return 0;
366 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 return 0;
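/* Worked example (assuming HZ == 1000): a user timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + DIV_ROUND_UP(500000, 1000000 / HZ) = 2500 jiffies,
 * a zero timeval maps to MAX_SCHEDULE_TIMEOUT (wait forever), and a negative
 * tv_sec is clamped to 0 with the rate-limited warning above.
 */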
371 static void sock_warn_obsolete_bsdism(const char *name)
373 static int warned;
374 static char warncomm[TASK_COMM_LEN];
375 if (strcmp(warncomm, current->comm) && warned < 5) {
376 strcpy(warncomm, current->comm);
377 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 warncomm, name);
379 warned++;
383 static bool sock_needs_netstamp(const struct sock *sk)
385 switch (sk->sk_family) {
386 case AF_UNSPEC:
387 case AF_UNIX:
388 return false;
389 default:
390 return true;
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
396 if (sk->sk_flags & flags) {
397 sk->sk_flags &= ~flags;
398 if (sock_needs_netstamp(sk) &&
399 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 net_disable_timestamp();
405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
407 unsigned long flags;
408 struct sk_buff_head *list = &sk->sk_receive_queue;
410 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
411 atomic_inc(&sk->sk_drops);
412 trace_sock_rcvqueue_full(sk, skb);
413 return -ENOMEM;
416 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
417 atomic_inc(&sk->sk_drops);
418 return -ENOBUFS;
421 skb->dev = NULL;
422 skb_set_owner_r(skb, sk);
424 /* we escape from rcu protected region, make sure we dont leak
425 * a norefcounted dst
427 skb_dst_force(skb);
429 spin_lock_irqsave(&list->lock, flags);
430 sock_skb_set_dropcount(sk, skb);
431 __skb_queue_tail(list, skb);
432 spin_unlock_irqrestore(&list->lock, flags);
434 if (!sock_flag(sk, SOCK_DEAD))
435 sk->sk_data_ready(sk);
436 return 0;
438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
442 int err;
444 err = sk_filter(sk, skb);
445 if (err)
446 return err;
448 return __sock_queue_rcv_skb(sk, skb);
450 EXPORT_SYMBOL(sock_queue_rcv_skb);
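/* Illustrative sketch (hypothetical datagram protocol, not part of this
 * file): delivery paths hand the skb to sock_queue_rcv_skb() and remain
 * responsible for freeing it on failure, since the helper only charges and
 * queues the skb when it succeeds:
 *
 *	static int hypothetical_deliver(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int err = sock_queue_rcv_skb(sk, skb);
 *
 *		if (err < 0)
 *			kfree_skb(skb);
 *		return err;
 *	}
 *
 * -ENOMEM means the receive queue is already over sk_rcvbuf, -ENOBUFS means
 * the rmem charge could not be scheduled, and a socket filter may also
 * reject the packet.
 */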
452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
453 const int nested, unsigned int trim_cap, bool refcounted)
455 int rc = NET_RX_SUCCESS;
457 if (sk_filter_trim_cap(sk, skb, trim_cap))
458 goto discard_and_relse;
460 skb->dev = NULL;
462 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
463 atomic_inc(&sk->sk_drops);
464 goto discard_and_relse;
466 if (nested)
467 bh_lock_sock_nested(sk);
468 else
469 bh_lock_sock(sk);
470 if (!sock_owned_by_user(sk)) {
472 * trylock + unlock semantics:
474 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
476 rc = sk_backlog_rcv(sk, skb);
478 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
479 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
480 bh_unlock_sock(sk);
481 atomic_inc(&sk->sk_drops);
482 goto discard_and_relse;
485 bh_unlock_sock(sk);
486 out:
487 if (refcounted)
488 sock_put(sk);
489 return rc;
490 discard_and_relse:
491 kfree_skb(skb);
492 goto out;
494 EXPORT_SYMBOL(__sk_receive_skb);
496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
498 struct dst_entry *dst = __sk_dst_get(sk);
500 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501 sk_tx_queue_clear(sk);
502 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
503 dst_release(dst);
504 return NULL;
507 return dst;
509 EXPORT_SYMBOL(__sk_dst_check);
511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
513 struct dst_entry *dst = sk_dst_get(sk);
515 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
516 sk_dst_reset(sk);
517 dst_release(dst);
518 return NULL;
521 return dst;
523 EXPORT_SYMBOL(sk_dst_check);
525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
526 int optlen)
528 int ret = -ENOPROTOOPT;
529 #ifdef CONFIG_NETDEVICES
530 struct net *net = sock_net(sk);
531 char devname[IFNAMSIZ];
532 int index;
534 /* Sorry... */
535 ret = -EPERM;
536 if (!ns_capable(net->user_ns, CAP_NET_RAW))
537 goto out;
539 ret = -EINVAL;
540 if (optlen < 0)
541 goto out;
543 /* Bind this socket to a particular device like "eth0",
544 * as specified in the passed interface name. If the
545 * name is "" or the option length is zero the socket
546 * is not bound.
548 if (optlen > IFNAMSIZ - 1)
549 optlen = IFNAMSIZ - 1;
550 memset(devname, 0, sizeof(devname));
552 ret = -EFAULT;
553 if (copy_from_user(devname, optval, optlen))
554 goto out;
556 index = 0;
557 if (devname[0] != '\0') {
558 struct net_device *dev;
560 rcu_read_lock();
561 dev = dev_get_by_name_rcu(net, devname);
562 if (dev)
563 index = dev->ifindex;
564 rcu_read_unlock();
565 ret = -ENODEV;
566 if (!dev)
567 goto out;
570 lock_sock(sk);
571 sk->sk_bound_dev_if = index;
572 sk_dst_reset(sk);
573 release_sock(sk);
575 ret = 0;
577 out:
578 #endif
580 return ret;
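/* Userspace view (illustrative, assumes a device named "eth0" exists):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *
 * binds the socket to eth0; an empty name (or zero optlen) clears the
 * binding. The caller needs CAP_NET_RAW in the socket's network namespace,
 * matching the ns_capable() check above.
 */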
583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
584 int __user *optlen, int len)
586 int ret = -ENOPROTOOPT;
587 #ifdef CONFIG_NETDEVICES
588 struct net *net = sock_net(sk);
589 char devname[IFNAMSIZ];
591 if (sk->sk_bound_dev_if == 0) {
592 len = 0;
593 goto zero;
596 ret = -EINVAL;
597 if (len < IFNAMSIZ)
598 goto out;
600 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
601 if (ret)
602 goto out;
604 len = strlen(devname) + 1;
606 ret = -EFAULT;
607 if (copy_to_user(optval, devname, len))
608 goto out;
610 zero:
611 ret = -EFAULT;
612 if (put_user(len, optlen))
613 goto out;
615 ret = 0;
617 out:
618 #endif
620 return ret;
623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
625 if (valbool)
626 sock_set_flag(sk, bit);
627 else
628 sock_reset_flag(sk, bit);
631 bool sk_mc_loop(struct sock *sk)
633 if (dev_recursion_level())
634 return false;
635 if (!sk)
636 return true;
637 switch (sk->sk_family) {
638 case AF_INET:
639 return inet_sk(sk)->mc_loop;
640 #if IS_ENABLED(CONFIG_IPV6)
641 case AF_INET6:
642 return inet6_sk(sk)->mc_loop;
643 #endif
645 WARN_ON(1);
646 return true;
648 EXPORT_SYMBOL(sk_mc_loop);
651 * This is meant for all protocols to use and covers goings on
652 * at the socket level. Everything here is generic.
655 int sock_setsockopt(struct socket *sock, int level, int optname,
656 char __user *optval, unsigned int optlen)
658 struct sock *sk = sock->sk;
659 int val;
660 int valbool;
661 struct linger ling;
662 int ret = 0;
665 * Options without arguments
668 if (optname == SO_BINDTODEVICE)
669 return sock_setbindtodevice(sk, optval, optlen);
671 if (optlen < sizeof(int))
672 return -EINVAL;
674 if (get_user(val, (int __user *)optval))
675 return -EFAULT;
677 valbool = val ? 1 : 0;
679 lock_sock(sk);
681 switch (optname) {
682 case SO_DEBUG:
683 if (val && !capable(CAP_NET_ADMIN))
684 ret = -EACCES;
685 else
686 sock_valbool_flag(sk, SOCK_DBG, valbool);
687 break;
688 case SO_REUSEADDR:
689 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
690 break;
691 case SO_REUSEPORT:
692 sk->sk_reuseport = valbool;
693 break;
694 case SO_TYPE:
695 case SO_PROTOCOL:
696 case SO_DOMAIN:
697 case SO_ERROR:
698 ret = -ENOPROTOOPT;
699 break;
700 case SO_DONTROUTE:
701 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 /* Don't error on this; BSD doesn't, and if you think
708 * about it, this is right. Otherwise apps have to
709 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 * are treated in BSD as hints.
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
725 goto set_sndbuf;
727 case SO_RCVBUF:
728 /* Don't error on this; BSD doesn't, and if you think
729 * about it, this is right. Otherwise apps have to
730 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 * are treated in BSD as hints.
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
759 goto set_rcvbuf;
761 case SO_KEEPALIVE:
762 #ifdef CONFIG_INET
763 if (sk->sk_protocol == IPPROTO_TCP &&
764 sk->sk_type == SOCK_STREAM)
765 tcp_set_keepalive(sk, valbool);
766 #endif
767 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
768 break;
770 case SO_OOBINLINE:
771 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
772 break;
774 case SO_NO_CHECK:
775 sk->sk_no_check_tx = valbool;
776 break;
778 case SO_PRIORITY:
779 if ((val >= 0 && val <= 6) ||
780 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
781 sk->sk_priority = val;
782 else
783 ret = -EPERM;
784 break;
786 case SO_LINGER:
787 if (optlen < sizeof(ling)) {
788 ret = -EINVAL; /* 1003.1g */
789 break;
791 if (copy_from_user(&ling, optval, sizeof(ling))) {
792 ret = -EFAULT;
793 break;
795 if (!ling.l_onoff)
796 sock_reset_flag(sk, SOCK_LINGER);
797 else {
798 #if (BITS_PER_LONG == 32)
799 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
800 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
801 else
802 #endif
803 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
804 sock_set_flag(sk, SOCK_LINGER);
806 break;
808 case SO_BSDCOMPAT:
809 sock_warn_obsolete_bsdism("setsockopt");
810 break;
812 case SO_PASSCRED:
813 if (valbool)
814 set_bit(SOCK_PASSCRED, &sock->flags);
815 else
816 clear_bit(SOCK_PASSCRED, &sock->flags);
817 break;
819 case SO_TIMESTAMP:
820 case SO_TIMESTAMPNS:
821 if (valbool) {
822 if (optname == SO_TIMESTAMP)
823 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
824 else
825 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
826 sock_set_flag(sk, SOCK_RCVTSTAMP);
827 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 } else {
829 sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 break;
834 case SO_TIMESTAMPING:
835 if (val & ~SOF_TIMESTAMPING_MASK) {
836 ret = -EINVAL;
837 break;
840 if (val & SOF_TIMESTAMPING_OPT_ID &&
841 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
842 if (sk->sk_protocol == IPPROTO_TCP &&
843 sk->sk_type == SOCK_STREAM) {
844 if ((1 << sk->sk_state) &
845 (TCPF_CLOSE | TCPF_LISTEN)) {
846 ret = -EINVAL;
847 break;
849 sk->sk_tskey = tcp_sk(sk)->snd_una;
850 } else {
851 sk->sk_tskey = 0;
854 sk->sk_tsflags = val;
855 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
856 sock_enable_timestamp(sk,
857 SOCK_TIMESTAMPING_RX_SOFTWARE);
858 else
859 sock_disable_timestamp(sk,
860 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
861 break;
863 case SO_RCVLOWAT:
864 if (val < 0)
865 val = INT_MAX;
866 sk->sk_rcvlowat = val ? : 1;
867 break;
869 case SO_RCVTIMEO:
870 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
871 break;
873 case SO_SNDTIMEO:
874 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
875 break;
877 case SO_ATTACH_FILTER:
878 ret = -EINVAL;
879 if (optlen == sizeof(struct sock_fprog)) {
880 struct sock_fprog fprog;
882 ret = -EFAULT;
883 if (copy_from_user(&fprog, optval, sizeof(fprog)))
884 break;
886 ret = sk_attach_filter(&fprog, sk);
888 break;
890 case SO_ATTACH_BPF:
891 ret = -EINVAL;
892 if (optlen == sizeof(u32)) {
893 u32 ufd;
895 ret = -EFAULT;
896 if (copy_from_user(&ufd, optval, sizeof(ufd)))
897 break;
899 ret = sk_attach_bpf(ufd, sk);
901 break;
903 case SO_ATTACH_REUSEPORT_CBPF:
904 ret = -EINVAL;
905 if (optlen == sizeof(struct sock_fprog)) {
906 struct sock_fprog fprog;
908 ret = -EFAULT;
909 if (copy_from_user(&fprog, optval, sizeof(fprog)))
910 break;
912 ret = sk_reuseport_attach_filter(&fprog, sk);
914 break;
916 case SO_ATTACH_REUSEPORT_EBPF:
917 ret = -EINVAL;
918 if (optlen == sizeof(u32)) {
919 u32 ufd;
921 ret = -EFAULT;
922 if (copy_from_user(&ufd, optval, sizeof(ufd)))
923 break;
925 ret = sk_reuseport_attach_bpf(ufd, sk);
927 break;
929 case SO_DETACH_FILTER:
930 ret = sk_detach_filter(sk);
931 break;
933 case SO_LOCK_FILTER:
934 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
935 ret = -EPERM;
936 else
937 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
938 break;
940 case SO_PASSSEC:
941 if (valbool)
942 set_bit(SOCK_PASSSEC, &sock->flags);
943 else
944 clear_bit(SOCK_PASSSEC, &sock->flags);
945 break;
946 case SO_MARK:
947 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
948 ret = -EPERM;
949 else
950 sk->sk_mark = val;
951 break;
953 case SO_RXQ_OVFL:
954 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
955 break;
957 case SO_WIFI_STATUS:
958 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
959 break;
961 case SO_PEEK_OFF:
962 if (sock->ops->set_peek_off)
963 ret = sock->ops->set_peek_off(sk, val);
964 else
965 ret = -EOPNOTSUPP;
966 break;
968 case SO_NOFCS:
969 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
970 break;
972 case SO_SELECT_ERR_QUEUE:
973 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
974 break;
976 #ifdef CONFIG_NET_RX_BUSY_POLL
977 case SO_BUSY_POLL:
978 /* allow unprivileged users to decrease the value */
979 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
980 ret = -EPERM;
981 else {
982 if (val < 0)
983 ret = -EINVAL;
984 else
985 sk->sk_ll_usec = val;
987 break;
988 #endif
990 case SO_MAX_PACING_RATE:
991 sk->sk_max_pacing_rate = val;
992 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
993 sk->sk_max_pacing_rate);
994 break;
996 case SO_INCOMING_CPU:
997 sk->sk_incoming_cpu = val;
998 break;
1000 case SO_CNX_ADVICE:
1001 if (val == 1)
1002 dst_negative_advice(sk);
1003 break;
1004 default:
1005 ret = -ENOPROTOOPT;
1006 break;
1008 release_sock(sk);
1009 return ret;
1011 EXPORT_SYMBOL(sock_setsockopt);
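/* Userspace view of the SO_RCVBUF/SO_SNDBUF doubling above (illustrative):
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *
 * reads back got == 131072: the request is clamped to net.core.rmem_max and
 * then doubled so that struct sk_buff overhead is covered as well as
 * payload, and getsockopt() reports the value actually in use.
 */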
1014 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1015 struct ucred *ucred)
1017 ucred->pid = pid_vnr(pid);
1018 ucred->uid = ucred->gid = -1;
1019 if (cred) {
1020 struct user_namespace *current_ns = current_user_ns();
1022 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1023 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1027 int sock_getsockopt(struct socket *sock, int level, int optname,
1028 char __user *optval, int __user *optlen)
1030 struct sock *sk = sock->sk;
1032 union {
1033 int val;
1034 struct linger ling;
1035 struct timeval tm;
1036 } v;
1038 int lv = sizeof(int);
1039 int len;
1041 if (get_user(len, optlen))
1042 return -EFAULT;
1043 if (len < 0)
1044 return -EINVAL;
1046 memset(&v, 0, sizeof(v));
1048 switch (optname) {
1049 case SO_DEBUG:
1050 v.val = sock_flag(sk, SOCK_DBG);
1051 break;
1053 case SO_DONTROUTE:
1054 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1055 break;
1057 case SO_BROADCAST:
1058 v.val = sock_flag(sk, SOCK_BROADCAST);
1059 break;
1061 case SO_SNDBUF:
1062 v.val = sk->sk_sndbuf;
1063 break;
1065 case SO_RCVBUF:
1066 v.val = sk->sk_rcvbuf;
1067 break;
1069 case SO_REUSEADDR:
1070 v.val = sk->sk_reuse;
1071 break;
1073 case SO_REUSEPORT:
1074 v.val = sk->sk_reuseport;
1075 break;
1077 case SO_KEEPALIVE:
1078 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1079 break;
1081 case SO_TYPE:
1082 v.val = sk->sk_type;
1083 break;
1085 case SO_PROTOCOL:
1086 v.val = sk->sk_protocol;
1087 break;
1089 case SO_DOMAIN:
1090 v.val = sk->sk_family;
1091 break;
1093 case SO_ERROR:
1094 v.val = -sock_error(sk);
1095 if (v.val == 0)
1096 v.val = xchg(&sk->sk_err_soft, 0);
1097 break;
1099 case SO_OOBINLINE:
1100 v.val = sock_flag(sk, SOCK_URGINLINE);
1101 break;
1103 case SO_NO_CHECK:
1104 v.val = sk->sk_no_check_tx;
1105 break;
1107 case SO_PRIORITY:
1108 v.val = sk->sk_priority;
1109 break;
1111 case SO_LINGER:
1112 lv = sizeof(v.ling);
1113 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1114 v.ling.l_linger = sk->sk_lingertime / HZ;
1115 break;
1117 case SO_BSDCOMPAT:
1118 sock_warn_obsolete_bsdism("getsockopt");
1119 break;
1121 case SO_TIMESTAMP:
1122 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1123 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1124 break;
1126 case SO_TIMESTAMPNS:
1127 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1128 break;
1130 case SO_TIMESTAMPING:
1131 v.val = sk->sk_tsflags;
1132 break;
1134 case SO_RCVTIMEO:
1135 lv = sizeof(struct timeval);
1136 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1137 v.tm.tv_sec = 0;
1138 v.tm.tv_usec = 0;
1139 } else {
1140 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1141 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1143 break;
1145 case SO_SNDTIMEO:
1146 lv = sizeof(struct timeval);
1147 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1148 v.tm.tv_sec = 0;
1149 v.tm.tv_usec = 0;
1150 } else {
1151 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1152 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1154 break;
1156 case SO_RCVLOWAT:
1157 v.val = sk->sk_rcvlowat;
1158 break;
1160 case SO_SNDLOWAT:
1161 v.val = 1;
1162 break;
1164 case SO_PASSCRED:
1165 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1166 break;
1168 case SO_PEERCRED:
1170 struct ucred peercred;
1171 if (len > sizeof(peercred))
1172 len = sizeof(peercred);
1173 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1174 if (copy_to_user(optval, &peercred, len))
1175 return -EFAULT;
1176 goto lenout;
1179 case SO_PEERNAME:
1181 char address[128];
1183 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1184 return -ENOTCONN;
1185 if (lv < len)
1186 return -EINVAL;
1187 if (copy_to_user(optval, address, len))
1188 return -EFAULT;
1189 goto lenout;
1192 /* Dubious BSD thing... Probably nobody even uses it, but
1193 * the UNIX standard wants it for whatever reason... -DaveM
1195 case SO_ACCEPTCONN:
1196 v.val = sk->sk_state == TCP_LISTEN;
1197 break;
1199 case SO_PASSSEC:
1200 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1201 break;
1203 case SO_PEERSEC:
1204 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1206 case SO_MARK:
1207 v.val = sk->sk_mark;
1208 break;
1210 case SO_RXQ_OVFL:
1211 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1212 break;
1214 case SO_WIFI_STATUS:
1215 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1216 break;
1218 case SO_PEEK_OFF:
1219 if (!sock->ops->set_peek_off)
1220 return -EOPNOTSUPP;
1222 v.val = sk->sk_peek_off;
1223 break;
1224 case SO_NOFCS:
1225 v.val = sock_flag(sk, SOCK_NOFCS);
1226 break;
1228 case SO_BINDTODEVICE:
1229 return sock_getbindtodevice(sk, optval, optlen, len);
1231 case SO_GET_FILTER:
1232 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1233 if (len < 0)
1234 return len;
1236 goto lenout;
1238 case SO_LOCK_FILTER:
1239 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1240 break;
1242 case SO_BPF_EXTENSIONS:
1243 v.val = bpf_tell_extensions();
1244 break;
1246 case SO_SELECT_ERR_QUEUE:
1247 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1248 break;
1250 #ifdef CONFIG_NET_RX_BUSY_POLL
1251 case SO_BUSY_POLL:
1252 v.val = sk->sk_ll_usec;
1253 break;
1254 #endif
1256 case SO_MAX_PACING_RATE:
1257 v.val = sk->sk_max_pacing_rate;
1258 break;
1260 case SO_INCOMING_CPU:
1261 v.val = sk->sk_incoming_cpu;
1262 break;
1264 default:
1265 /* We implement the SO_SNDLOWAT etc to not be settable
1266 * (1003.1g 7).
1268 return -ENOPROTOOPT;
1271 if (len > lv)
1272 len = lv;
1273 if (copy_to_user(optval, &v, len))
1274 return -EFAULT;
1275 lenout:
1276 if (put_user(len, optlen))
1277 return -EFAULT;
1278 return 0;
1282 * Initialize an sk_lock.
1284 * (We also register the sk_lock with the lock validator.)
1286 static inline void sock_lock_init(struct sock *sk)
1288 sock_lock_init_class_and_name(sk,
1289 af_family_slock_key_strings[sk->sk_family],
1290 af_family_slock_keys + sk->sk_family,
1291 af_family_key_strings[sk->sk_family],
1292 af_family_keys + sk->sk_family);
1296 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1297 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1298 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1300 static void sock_copy(struct sock *nsk, const struct sock *osk)
1302 #ifdef CONFIG_SECURITY_NETWORK
1303 void *sptr = nsk->sk_security;
1304 #endif
1305 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1307 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1308 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1310 #ifdef CONFIG_SECURITY_NETWORK
1311 nsk->sk_security = sptr;
1312 security_sk_clone(osk, nsk);
1313 #endif
1316 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1317 int family)
1319 struct sock *sk;
1320 struct kmem_cache *slab;
1322 slab = prot->slab;
1323 if (slab != NULL) {
1324 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1325 if (!sk)
1326 return sk;
1327 if (priority & __GFP_ZERO)
1328 sk_prot_clear_nulls(sk, prot->obj_size);
1329 } else
1330 sk = kmalloc(prot->obj_size, priority);
1332 if (sk != NULL) {
1333 kmemcheck_annotate_bitfield(sk, flags);
1335 if (security_sk_alloc(sk, family, priority))
1336 goto out_free;
1338 if (!try_module_get(prot->owner))
1339 goto out_free_sec;
1340 sk_tx_queue_clear(sk);
1343 return sk;
1345 out_free_sec:
1346 security_sk_free(sk);
1347 out_free:
1348 if (slab != NULL)
1349 kmem_cache_free(slab, sk);
1350 else
1351 kfree(sk);
1352 return NULL;
1355 static void sk_prot_free(struct proto *prot, struct sock *sk)
1357 struct kmem_cache *slab;
1358 struct module *owner;
1360 owner = prot->owner;
1361 slab = prot->slab;
1363 cgroup_sk_free(&sk->sk_cgrp_data);
1364 mem_cgroup_sk_free(sk);
1365 security_sk_free(sk);
1366 if (slab != NULL)
1367 kmem_cache_free(slab, sk);
1368 else
1369 kfree(sk);
1370 module_put(owner);
1374 * sk_alloc - All socket objects are allocated here
1375 * @net: the applicable net namespace
1376 * @family: protocol family
1377 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1378 * @prot: struct proto associated with this new sock instance
1379 * @kern: is this to be a kernel socket?
1381 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1382 struct proto *prot, int kern)
1384 struct sock *sk;
1386 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1387 if (sk) {
1388 sk->sk_family = family;
1390 * See comment in struct sock definition to understand
1391 * why we need sk_prot_creator -acme
1393 sk->sk_prot = sk->sk_prot_creator = prot;
1394 sock_lock_init(sk);
1395 sk->sk_net_refcnt = kern ? 0 : 1;
1396 if (likely(sk->sk_net_refcnt))
1397 get_net(net);
1398 sock_net_set(sk, net);
1399 atomic_set(&sk->sk_wmem_alloc, 1);
1401 mem_cgroup_sk_alloc(sk);
1402 cgroup_sk_alloc(&sk->sk_cgrp_data);
1403 sock_update_classid(&sk->sk_cgrp_data);
1404 sock_update_netprioidx(&sk->sk_cgrp_data);
1407 return sk;
1409 EXPORT_SYMBOL(sk_alloc);
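/* Illustrative sketch (hypothetical address family, not part of this file):
 * a protocol's ->create() handler typically pairs sk_alloc() with
 * sock_init_data(), which attaches the sock to the struct socket and fills
 * in the generic defaults:
 *
 *	static int hypothetical_create(struct net *net, struct socket *sock,
 *				       int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &hypothetical_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 *
 * Kernel-internal sockets (kern != 0) do not take a reference on the network
 * namespace, as arranged via sk_net_refcnt above.
 */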
1411 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1412 * grace period. This is the case for UDP sockets and TCP listeners.
1414 static void __sk_destruct(struct rcu_head *head)
1416 struct sock *sk = container_of(head, struct sock, sk_rcu);
1417 struct sk_filter *filter;
1419 if (sk->sk_destruct)
1420 sk->sk_destruct(sk);
1422 filter = rcu_dereference_check(sk->sk_filter,
1423 atomic_read(&sk->sk_wmem_alloc) == 0);
1424 if (filter) {
1425 sk_filter_uncharge(sk, filter);
1426 RCU_INIT_POINTER(sk->sk_filter, NULL);
1428 if (rcu_access_pointer(sk->sk_reuseport_cb))
1429 reuseport_detach_sock(sk);
1431 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1433 if (atomic_read(&sk->sk_omem_alloc))
1434 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1435 __func__, atomic_read(&sk->sk_omem_alloc));
1437 if (sk->sk_frag.page) {
1438 put_page(sk->sk_frag.page);
1439 sk->sk_frag.page = NULL;
1442 if (sk->sk_peer_cred)
1443 put_cred(sk->sk_peer_cred);
1444 put_pid(sk->sk_peer_pid);
1445 if (likely(sk->sk_net_refcnt))
1446 put_net(sock_net(sk));
1447 sk_prot_free(sk->sk_prot_creator, sk);
1450 void sk_destruct(struct sock *sk)
1452 if (sock_flag(sk, SOCK_RCU_FREE))
1453 call_rcu(&sk->sk_rcu, __sk_destruct);
1454 else
1455 __sk_destruct(&sk->sk_rcu);
1458 static void __sk_free(struct sock *sk)
1460 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1461 sock_diag_broadcast_destroy(sk);
1462 else
1463 sk_destruct(sk);
1466 void sk_free(struct sock *sk)
1469 * We subtract one from sk_wmem_alloc and can know if
1470 * some packets are still in some tx queue.
1471 * If not null, sock_wfree() will call __sk_free(sk) later
1473 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1474 __sk_free(sk);
1476 EXPORT_SYMBOL(sk_free);
1479 * sk_clone_lock - clone a socket, and lock its clone
1480 * @sk: the socket to clone
1481 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1483 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1485 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1487 struct sock *newsk;
1488 bool is_charged = true;
1490 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1491 if (newsk != NULL) {
1492 struct sk_filter *filter;
1494 sock_copy(newsk, sk);
1496 newsk->sk_prot_creator = sk->sk_prot;
1498 /* SANITY */
1499 if (likely(newsk->sk_net_refcnt))
1500 get_net(sock_net(newsk));
1501 sk_node_init(&newsk->sk_node);
1502 sock_lock_init(newsk);
1503 bh_lock_sock(newsk);
1504 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1505 newsk->sk_backlog.len = 0;
1507 atomic_set(&newsk->sk_rmem_alloc, 0);
1509 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1511 atomic_set(&newsk->sk_wmem_alloc, 1);
1512 atomic_set(&newsk->sk_omem_alloc, 0);
1513 skb_queue_head_init(&newsk->sk_receive_queue);
1514 skb_queue_head_init(&newsk->sk_write_queue);
1516 rwlock_init(&newsk->sk_callback_lock);
1517 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1518 af_callback_keys + newsk->sk_family,
1519 af_family_clock_key_strings[newsk->sk_family]);
1521 newsk->sk_dst_cache = NULL;
1522 newsk->sk_wmem_queued = 0;
1523 newsk->sk_forward_alloc = 0;
1524 atomic_set(&newsk->sk_drops, 0);
1525 newsk->sk_send_head = NULL;
1526 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1528 sock_reset_flag(newsk, SOCK_DONE);
1529 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1530 skb_queue_head_init(&newsk->sk_error_queue);
1532 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1533 if (filter != NULL)
1534 /* though it's an empty new sock, the charging may fail
1535 * if sysctl_optmem_max was changed between creation of
1536 * original socket and cloning
1538 is_charged = sk_filter_charge(newsk, filter);
1540 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1541 /* We need to make sure that we don't uncharge the new
1542 * socket if we couldn't charge it in the first place
1543 * as otherwise we uncharge the parent's filter.
1545 if (!is_charged)
1546 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1547 /* It is still raw copy of parent, so invalidate
1548 * destructor and make plain sk_free() */
1549 newsk->sk_destruct = NULL;
1550 bh_unlock_sock(newsk);
1551 sk_free(newsk);
1552 newsk = NULL;
1553 goto out;
1555 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1557 newsk->sk_err = 0;
1558 newsk->sk_err_soft = 0;
1559 newsk->sk_priority = 0;
1560 newsk->sk_incoming_cpu = raw_smp_processor_id();
1561 atomic64_set(&newsk->sk_cookie, 0);
1563 mem_cgroup_sk_alloc(newsk);
1565 * Before updating sk_refcnt, we must commit prior changes to memory
1566 * (Documentation/RCU/rculist_nulls.txt for details)
1568 smp_wmb();
1569 atomic_set(&newsk->sk_refcnt, 2);
1572 * Increment the counter in the same struct proto as the master
1573 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1574 * is the same as sk->sk_prot->socks, as this field was copied
1575 * with memcpy).
1577 * This _changes_ the previous behaviour, where
1578 * tcp_create_openreq_child always was incrementing the
1579 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1580 * to be taken into account in all callers. -acme
1582 sk_refcnt_debug_inc(newsk);
1583 sk_set_socket(newsk, NULL);
1584 newsk->sk_wq = NULL;
1586 if (newsk->sk_prot->sockets_allocated)
1587 sk_sockets_allocated_inc(newsk);
1589 if (sock_needs_netstamp(sk) &&
1590 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1591 net_enable_timestamp();
1593 out:
1594 return newsk;
1596 EXPORT_SYMBOL_GPL(sk_clone_lock);
1598 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1600 u32 max_segs = 1;
1602 sk_dst_set(sk, dst);
1603 sk->sk_route_caps = dst->dev->features;
1604 if (sk->sk_route_caps & NETIF_F_GSO)
1605 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1606 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1607 if (sk_can_gso(sk)) {
1608 if (dst->header_len) {
1609 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1610 } else {
1611 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1612 sk->sk_gso_max_size = dst->dev->gso_max_size;
1613 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1616 sk->sk_gso_max_segs = max_segs;
1618 EXPORT_SYMBOL_GPL(sk_setup_caps);
1621 * Simple resource managers for sockets.
1626 * Write buffer destructor automatically called from kfree_skb.
1628 void sock_wfree(struct sk_buff *skb)
1630 struct sock *sk = skb->sk;
1631 unsigned int len = skb->truesize;
1633 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1635 * Keep a reference on sk_wmem_alloc, this will be released
1636 * after sk_write_space() call
1638 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1639 sk->sk_write_space(sk);
1640 len = 1;
1643 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1644 * could not do because of in-flight packets
1646 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1647 __sk_free(sk);
1649 EXPORT_SYMBOL(sock_wfree);
1651 /* This variant of sock_wfree() is used by TCP,
1652 * since it sets SOCK_USE_WRITE_QUEUE.
1654 void __sock_wfree(struct sk_buff *skb)
1656 struct sock *sk = skb->sk;
1658 if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1659 __sk_free(sk);
1662 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1664 skb_orphan(skb);
1665 skb->sk = sk;
1666 #ifdef CONFIG_INET
1667 if (unlikely(!sk_fullsock(sk))) {
1668 skb->destructor = sock_edemux;
1669 sock_hold(sk);
1670 return;
1672 #endif
1673 skb->destructor = sock_wfree;
1674 skb_set_hash_from_sk(skb, sk);
1676 * We used to take a refcount on sk, but following operation
1677 * is enough to guarantee sk_free() won't free this sock until
1678 * all in-flight packets are completed
1680 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1682 EXPORT_SYMBOL(skb_set_owner_w);
1684 /* This helper is used by netem, as it can hold packets in its
1685 * delay queue. We want to allow the owner socket to send more
1686 * packets, as if they were already TX completed by a typical driver.
1687 * But we also want to keep skb->sk set because some packet schedulers
1688 * rely on it (sch_fq for example).
1690 void skb_orphan_partial(struct sk_buff *skb)
1692 if (skb_is_tcp_pure_ack(skb))
1693 return;
1695 if (skb->destructor == sock_wfree
1696 #ifdef CONFIG_INET
1697 || skb->destructor == tcp_wfree
1698 #endif
1700 struct sock *sk = skb->sk;
1702 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1703 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1704 skb->destructor = sock_efree;
1706 } else {
1707 skb_orphan(skb);
1710 EXPORT_SYMBOL(skb_orphan_partial);
1713 * Read buffer destructor automatically called from kfree_skb.
1715 void sock_rfree(struct sk_buff *skb)
1717 struct sock *sk = skb->sk;
1718 unsigned int len = skb->truesize;
1720 atomic_sub(len, &sk->sk_rmem_alloc);
1721 sk_mem_uncharge(sk, len);
1723 EXPORT_SYMBOL(sock_rfree);
1726 * Buffer destructor for skbs that are not used directly in read or write
1727 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1729 void sock_efree(struct sk_buff *skb)
1731 sock_put(skb->sk);
1733 EXPORT_SYMBOL(sock_efree);
1735 kuid_t sock_i_uid(struct sock *sk)
1737 kuid_t uid;
1739 read_lock_bh(&sk->sk_callback_lock);
1740 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1741 read_unlock_bh(&sk->sk_callback_lock);
1742 return uid;
1744 EXPORT_SYMBOL(sock_i_uid);
1746 unsigned long sock_i_ino(struct sock *sk)
1748 unsigned long ino;
1750 read_lock_bh(&sk->sk_callback_lock);
1751 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1752 read_unlock_bh(&sk->sk_callback_lock);
1753 return ino;
1755 EXPORT_SYMBOL(sock_i_ino);
1758 * Allocate a skb from the socket's send buffer.
1760 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1761 gfp_t priority)
1763 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1764 struct sk_buff *skb = alloc_skb(size, priority);
1765 if (skb) {
1766 skb_set_owner_w(skb, sk);
1767 return skb;
1770 return NULL;
1772 EXPORT_SYMBOL(sock_wmalloc);
1775 * Allocate a memory block from the socket's option memory buffer.
1777 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1779 if ((unsigned int)size <= sysctl_optmem_max &&
1780 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1781 void *mem;
1782 /* First do the add, to avoid the race if kmalloc
1783 * might sleep.
1785 atomic_add(size, &sk->sk_omem_alloc);
1786 mem = kmalloc(size, priority);
1787 if (mem)
1788 return mem;
1789 atomic_sub(size, &sk->sk_omem_alloc);
1791 return NULL;
1793 EXPORT_SYMBOL(sock_kmalloc);
1795 /* Free an option memory block. Note, we actually want the inline
1796 * here as this allows gcc to detect the nullify and fold away the
1797 * condition entirely.
1799 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1800 const bool nullify)
1802 if (WARN_ON_ONCE(!mem))
1803 return;
1804 if (nullify)
1805 kzfree(mem);
1806 else
1807 kfree(mem);
1808 atomic_sub(size, &sk->sk_omem_alloc);
1811 void sock_kfree_s(struct sock *sk, void *mem, int size)
1813 __sock_kfree_s(sk, mem, size, false);
1815 EXPORT_SYMBOL(sock_kfree_s);
1817 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1819 __sock_kfree_s(sk, mem, size, true);
1821 EXPORT_SYMBOL(sock_kzfree_s);
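/* Illustrative sketch (hypothetical option handler, not part of this file):
 * memory obtained from sock_kmalloc() must be returned with sock_kfree_s()
 * (or sock_kzfree_s() for sensitive data) using the same size, so that
 * sk_omem_alloc stays balanced against sysctl_optmem_max:
 *
 *	struct hypothetical_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */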
1823 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1824 I think, these locks should be removed for datagram sockets.
1826 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1828 DEFINE_WAIT(wait);
1830 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1831 for (;;) {
1832 if (!timeo)
1833 break;
1834 if (signal_pending(current))
1835 break;
1836 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1837 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1838 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1839 break;
1840 if (sk->sk_shutdown & SEND_SHUTDOWN)
1841 break;
1842 if (sk->sk_err)
1843 break;
1844 timeo = schedule_timeout(timeo);
1846 finish_wait(sk_sleep(sk), &wait);
1847 return timeo;
1852 * Generic send/receive buffer handlers
1855 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1856 unsigned long data_len, int noblock,
1857 int *errcode, int max_page_order)
1859 struct sk_buff *skb;
1860 long timeo;
1861 int err;
1863 timeo = sock_sndtimeo(sk, noblock);
1864 for (;;) {
1865 err = sock_error(sk);
1866 if (err != 0)
1867 goto failure;
1869 err = -EPIPE;
1870 if (sk->sk_shutdown & SEND_SHUTDOWN)
1871 goto failure;
1873 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1874 break;
1876 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1877 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1878 err = -EAGAIN;
1879 if (!timeo)
1880 goto failure;
1881 if (signal_pending(current))
1882 goto interrupted;
1883 timeo = sock_wait_for_wmem(sk, timeo);
1885 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1886 errcode, sk->sk_allocation);
1887 if (skb)
1888 skb_set_owner_w(skb, sk);
1889 return skb;
1891 interrupted:
1892 err = sock_intr_errno(timeo);
1893 failure:
1894 *errcode = err;
1895 return NULL;
1897 EXPORT_SYMBOL(sock_alloc_send_pskb);
1899 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1900 int noblock, int *errcode)
1902 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1904 EXPORT_SYMBOL(sock_alloc_send_skb);
1906 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1907 struct sockcm_cookie *sockc)
1909 u32 tsflags;
1911 switch (cmsg->cmsg_type) {
1912 case SO_MARK:
1913 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1914 return -EPERM;
1915 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1916 return -EINVAL;
1917 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1918 break;
1919 case SO_TIMESTAMPING:
1920 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1921 return -EINVAL;
1923 tsflags = *(u32 *)CMSG_DATA(cmsg);
1924 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1925 return -EINVAL;
1927 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1928 sockc->tsflags |= tsflags;
1929 break;
1930 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1931 case SCM_RIGHTS:
1932 case SCM_CREDENTIALS:
1933 break;
1934 default:
1935 return -EINVAL;
1937 return 0;
1939 EXPORT_SYMBOL(__sock_cmsg_send);
1941 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1942 struct sockcm_cookie *sockc)
1944 struct cmsghdr *cmsg;
1945 int ret;
1947 for_each_cmsghdr(cmsg, msg) {
1948 if (!CMSG_OK(msg, cmsg))
1949 return -EINVAL;
1950 if (cmsg->cmsg_level != SOL_SOCKET)
1951 continue;
1952 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1953 if (ret)
1954 return ret;
1956 return 0;
1958 EXPORT_SYMBOL(sock_cmsg_send);
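/* Userspace view (illustrative): per-call metadata arrives here as
 * SOL_SOCKET control messages on sendmsg(). For example, setting a
 * per-packet mark (CAP_NET_ADMIN required, as checked above):
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
 *	struct msghdr msg = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *	uint32_t mark = 42;
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(mark));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
 *	then fill msg_iov/msg_name and call sendmsg(fd, &msg, 0)
 *
 * __sock_cmsg_send() rejects any cmsg_len other than CMSG_LEN(sizeof(u32))
 * for SO_MARK and SO_TIMESTAMPING.
 */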
1960 /* On 32bit arches, an skb frag is limited to 2^15 */
1961 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1964 * skb_page_frag_refill - check that a page_frag contains enough room
1965 * @sz: minimum size of the fragment we want to get
1966 * @pfrag: pointer to page_frag
1967 * @gfp: priority for memory allocation
1969 * Note: While this allocator tries to use high order pages, there is
1970 * no guarantee that allocations succeed. Therefore, @sz MUST be
1971 * less than or equal to PAGE_SIZE.
1973 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1975 if (pfrag->page) {
1976 if (page_ref_count(pfrag->page) == 1) {
1977 pfrag->offset = 0;
1978 return true;
1980 if (pfrag->offset + sz <= pfrag->size)
1981 return true;
1982 put_page(pfrag->page);
1985 pfrag->offset = 0;
1986 if (SKB_FRAG_PAGE_ORDER) {
1987 /* Avoid direct reclaim but allow kswapd to wake */
1988 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1989 __GFP_COMP | __GFP_NOWARN |
1990 __GFP_NORETRY,
1991 SKB_FRAG_PAGE_ORDER);
1992 if (likely(pfrag->page)) {
1993 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1994 return true;
1997 pfrag->page = alloc_page(gfp);
1998 if (likely(pfrag->page)) {
1999 pfrag->size = PAGE_SIZE;
2000 return true;
2002 return false;
2004 EXPORT_SYMBOL(skb_page_frag_refill);
2006 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2008 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2009 return true;
2011 sk_enter_memory_pressure(sk);
2012 sk_stream_moderate_sndbuf(sk);
2013 return false;
2015 EXPORT_SYMBOL(sk_page_frag_refill);
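/* Illustrative sketch (simplified from typical stream sendmsg paths, not a
 * verbatim copy): a sender copies user data into the per-socket page
 * fragment and attaches it to the skb, advancing the offset by the bytes
 * consumed:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	(copy user bytes to page_address(pfrag->page) + pfrag->offset here)
 *	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
 *			   pfrag->page, pfrag->offset, copy);
 *	get_page(pfrag->page);
 *	pfrag->offset += copy;
 */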
2017 static void __lock_sock(struct sock *sk)
2018 __releases(&sk->sk_lock.slock)
2019 __acquires(&sk->sk_lock.slock)
2021 DEFINE_WAIT(wait);
2023 for (;;) {
2024 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2025 TASK_UNINTERRUPTIBLE);
2026 spin_unlock_bh(&sk->sk_lock.slock);
2027 schedule();
2028 spin_lock_bh(&sk->sk_lock.slock);
2029 if (!sock_owned_by_user(sk))
2030 break;
2032 finish_wait(&sk->sk_lock.wq, &wait);
2035 static void __release_sock(struct sock *sk)
2036 __releases(&sk->sk_lock.slock)
2037 __acquires(&sk->sk_lock.slock)
2039 struct sk_buff *skb, *next;
2041 while ((skb = sk->sk_backlog.head) != NULL) {
2042 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2044 spin_unlock_bh(&sk->sk_lock.slock);
2046 do {
2047 next = skb->next;
2048 prefetch(next);
2049 WARN_ON_ONCE(skb_dst_is_noref(skb));
2050 skb->next = NULL;
2051 sk_backlog_rcv(sk, skb);
2053 cond_resched();
2055 skb = next;
2056 } while (skb != NULL);
2058 spin_lock_bh(&sk->sk_lock.slock);
2062 * Doing the zeroing here guarantees we cannot loop forever
2063 * while a wild producer attempts to flood us.
2065 sk->sk_backlog.len = 0;
2068 void __sk_flush_backlog(struct sock *sk)
2070 spin_lock_bh(&sk->sk_lock.slock);
2071 __release_sock(sk);
2072 spin_unlock_bh(&sk->sk_lock.slock);
2076 * sk_wait_data - wait for data to arrive at sk_receive_queue
2077 * @sk: sock to wait on
2078 * @timeo: for how long
2079 * @skb: last skb seen on sk_receive_queue
2081 * Now socket state including sk->sk_err is changed only under lock,
2082 * hence we may omit checks after joining wait queue.
2083 * We check receive queue before schedule() only as optimization;
2084 * it is very likely that release_sock() added new data.
2086 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2088 int rc;
2089 DEFINE_WAIT(wait);
2091 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2092 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2093 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2094 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2095 finish_wait(sk_sleep(sk), &wait);
2096 return rc;
2098 EXPORT_SYMBOL(sk_wait_data);
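/* Illustrative sketch (hypothetical recvmsg loop, not part of this file):
 * blocking receive paths call sk_wait_data() with the socket lock held and
 * re-check the queue afterwards, since the wait can end on timeout or signal
 * as well as on new data:
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
 *			break;
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */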
/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        struct proto *prot = sk->sk_prot;
        int amt = sk_mem_pages(size);
        long allocated;

        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

        allocated = sk_memory_allocated_add(sk, amt);

        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
            !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
                goto suppress_allocation;

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
                        return 1;

        } else { /* SK_MEM_SEND */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
                                return 1;
                } else if (atomic_read(&sk->sk_wmem_alloc) <
                           prot->sysctl_wmem[0])
                        return 1;
        }

        if (sk_has_memory_pressure(sk)) {
                int alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
                        return 1;
        }

        trace_sock_exceed_buf_limit(sk, prot, allocated);

        /* Alas. Undo changes. */
        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

        sk_memory_allocated_sub(sk, amt);

        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

        return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
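/*
 * Illustrative sketch (not part of the original file): protocols normally
 * reach __sk_mem_schedule() through the sk_rmem_schedule()/sk_wmem_schedule()
 * inlines, charging an skb's truesize before queueing it.  The helper name
 * "example_queue_rcv_skb" is hypothetical.
 */
static int example_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
            (unsigned int)sk->sk_rcvbuf)
                return -ENOMEM;

        /* Rounds the charge up to whole SK_MEM_QUANTUM (page sized) units. */
        if (!sk_rmem_schedule(sk, skb, skb->truesize))
                return -ENOBUFS;

        skb_set_owner_r(skb, sk);       /* accounts truesize in sk_rmem_alloc */
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);
        return 0;
}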
/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
        amount >>= SK_MEM_QUANTUM_SHIFT;
        sk_memory_allocated_sub(sk, amount);
        sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;

        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

        if (sk_under_memory_pressure(sk) &&
            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
                sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
{
        if (val < 0)
                return -EINVAL;

        sk->sk_peek_off = val;
        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
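/*
 * Illustrative sketch (not part of the original file): once SO_PEEK_OFF is
 * set, a datagram receive path consumes the offset via sk_peek_offset() to
 * skip data that was already peeked, in the spirit of what the datagram
 * helpers do.  "example_peek_skb" is hypothetical; the caller is assumed to
 * hold sk_receive_queue.lock.
 */
static struct sk_buff *example_peek_skb(struct sock *sk, int flags)
{
        int off = sk_peek_offset(sk, flags);   /* 0 unless MSG_PEEK and sk_peek_off >= 0 */
        struct sk_buff *skb;

        skb_queue_walk(&sk->sk_receive_queue, skb) {
                if (off < skb->len)
                        return skb;     /* first skb not yet fully peeked */
                off -= skb->len;
        }
        return NULL;
}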
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
        return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
                       char __user *optval, unsigned int optlen)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
                       char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
        ssize_t res;
        struct msghdr msg = {.msg_flags = flags};
        struct kvec iov;
        char *kaddr = kmap(page);

        iov.iov_base = kaddr + offset;
        iov.iov_len = size;
        res = kernel_sendmsg(sock, &msg, &iov, 1, size);
        kunmap(page);
        return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
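/*
 * Illustrative sketch (not part of the original file): a minimal datagram
 * family could fill the operations it does not implement with the
 * sock_no_*() stubs above.  "example_dgram_ops" and the example_* handlers
 * are hypothetical placeholders for the protocol's own code.
 */
static const struct proto_ops example_dgram_ops = {
        .family         = PF_UNSPEC,            /* hypothetical family */
        .owner          = THIS_MODULE,
        .release        = example_release,      /* protocol specific */
        .bind           = example_bind,         /* protocol specific */
        .connect        = sock_no_connect,      /* unsupported -> -EOPNOTSUPP */
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = example_getname,      /* protocol specific */
        .poll           = datagram_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = sock_no_setsockopt,
        .getsockopt     = sock_no_getsockopt,
        .sendmsg        = example_sendmsg,      /* protocol specific */
        .recvmsg        = example_recvmsg,      /* protocol specific */
        .mmap           = sock_no_mmap,
        .sendpage       = sock_no_sendpage,
};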
/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, POLLERR);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                                                POLLRDNORM | POLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                wq = rcu_dereference(sk->sk_wq);
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
                                                        POLLWRNORM | POLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
                        sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }

        rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}
void sk_send_sigurg(struct sock *sk)
{
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
                        sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (!mod_timer(timer, expires))
                sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
        if (del_timer(timer))
                __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
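/*
 * Illustrative sketch (not part of the original file): sk_reset_timer()
 * takes a socket reference while the timer is pending, so the handler must
 * drop it with sock_put() when it fires; sk_stop_timer() drops it instead
 * when the timer is cancelled.  "example_timer_fire"/"example_arm_timer"
 * are hypothetical.
 */
static void example_timer_fire(unsigned long data)
{
        struct sock *sk = (struct sock *)data;

        bh_lock_sock(sk);
        /* ... protocol specific timeout handling ... */
        bh_unlock_sock(sk);
        sock_put(sk);           /* pairs with sock_hold() in sk_reset_timer() */
}

static void example_arm_timer(struct sock *sk, unsigned long delay)
{
        sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}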
void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        sk->sk_send_head = NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation = GFP_KERNEL;
        sk->sk_rcvbuf = sysctl_rmem_default;
        sk->sk_sndbuf = sysctl_wmem_default;
        sk->sk_state = TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        if (sock) {
                sk->sk_type = sock->type;
                sk->sk_wq = sock->wq;
                sock->sk = sk;
        } else
                sk->sk_wq = NULL;

        rwlock_init(&sk->sk_callback_lock);
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                                   af_callback_keys + sk->sk_family,
                                   af_family_clock_key_strings[sk->sk_family]);

        sk->sk_state_change = sock_def_wakeup;
        sk->sk_data_ready = sock_def_readable;
        sk->sk_write_space = sock_def_write_space;
        sk->sk_error_report = sock_def_error_report;
        sk->sk_destruct = sock_def_destruct;

        sk->sk_frag.page = NULL;
        sk->sk_frag.offset = 0;
        sk->sk_peek_off = -1;

        sk->sk_peer_pid = NULL;
        sk->sk_peer_cred = NULL;
        sk->sk_write_pending = 0;
        sk->sk_rcvlowat = 1;
        sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id = 0;
        sk->sk_ll_usec = sysctl_net_busy_read;
#endif

        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
        sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        atomic_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning: release_cb() might need to release sk ownership,
         * i.e. call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
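/*
 * Illustrative sketch (not part of the original file): the usual
 * process-context pattern around the socket lock.  While the lock is owned,
 * softirq input lands in sk_backlog and is replayed by __release_sock()
 * from release_sock().  "example_set_sndbuf" is hypothetical and simplified
 * compared to sock_setsockopt()'s real SO_SNDBUF handling.
 */
static void example_set_sndbuf(struct sock *sk, int val)
{
        lock_sock(sk);          /* may sleep; see lock_sock_nested() above */
        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        sk->sk_sndbuf = max_t(int, val, SOCK_MIN_SNDBUF);
        release_sock(sk);       /* replays the backlog and wakes lock waiters */
}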
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sk->sk_lock.owned)
                /*
                 * Note: we must keep BH disabled here; unlock_sock_fast()
                 * pairs this with spin_unlock_bh().
                 */
                return false;

        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
EXPORT_SYMBOL(lock_sock_fast);
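/*
 * Illustrative sketch (not part of the original file): lock_sock_fast()
 * must be paired with unlock_sock_fast(), handing back the returned
 * slow-path flag so the matching unlock (spin_unlock_bh() vs release_sock())
 * is chosen.  "example_rx_queue_len" is hypothetical.
 */
static __u32 example_rx_queue_len(struct sock *sk)
{
        bool slow;
        __u32 len;

        slow = lock_sock_fast(sk);
        len = skb_queue_len(&sk->sk_receive_queue);
        unlock_sock_fast(sk, slow);

        return len;
}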
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
        struct timeval tv;

        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        tv = ktime_to_timeval(sk->sk_stamp);
        if (tv.tv_sec == -1)
                return -ENOENT;
        if (tv.tv_sec == 0) {
                sk->sk_stamp = ktime_get_real();
                tv = ktime_to_timeval(sk->sk_stamp);
        }
        return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
        struct timespec ts;

        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec(sk->sk_stamp);
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                sk->sk_stamp = ktime_get_real();
                ts = ktime_to_timespec(sk->sk_stamp);
        }
        return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
        if (!sock_flag(sk, flag)) {
                unsigned long previous_flags = sk->sk_flags;

                sock_set_flag(sk, flag);
                /*
                 * we just set one of the two flags which require net
                 * time stamping, but time stamping might have been on
                 * already because of the other one
                 */
                if (sock_needs_netstamp(sk) &&
                    !(previous_flags & SK_FLAGS_TIMESTAMP))
                        net_enable_timestamp();
        }
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb;
        int copied, err;

        err = -EAGAIN;
        skb = sock_dequeue_err_skb(sk);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
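/*
 * Illustrative sketch (not part of the original file): a protocol's recvmsg
 * handler can service MSG_ERRQUEUE reads by delegating to
 * sock_recv_errqueue() with the cmsg level/type it wants reported.
 * "example_err_recvmsg" and the SOL_IP/IP_RECVERR choice are hypothetical
 * for this sketch.
 */
static int example_err_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                               int noblock, int flags, int *addr_len)
{
        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

        /* ... normal receive path ... */
        return -EAGAIN;
}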
/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt != NULL)
                return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err;

        err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
                                   flags & ~MSG_DONTWAIT, &addr_len);
        if (err >= 0)
                msg->msg_namelen = addr_len;
        return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt != NULL)
                return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                                      optval, optlen);
        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket, but the net layer still does.
         *
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs
         * the receiver and did a hash table lookup before we unhashed the
         * socket. They will reach the receive queue and will be purged by
         * the socket destructor.
         *
         * Also we still have packets pending on the receive queue and
         * probably our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.inuse = alloc_percpu(struct prot_inuse);
        return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
        if (register_pernet_subsys(&net_inuse_ops))
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu(prot_inuse, cpu).val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx != PROTO_INUSE_NR - 1)
                clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
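/*
 * Illustrative sketch (not part of the original file): protocols update the
 * per-cpu counters behind /proc/net/protocols from their hash/unhash
 * callbacks, typically while the lookup-table lock is held.
 * "example_hash"/"example_unhash" are hypothetical.
 */
static int example_hash(struct sock *sk)
{
        /* ... insert sk into the protocol's lookup structures ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        return 0;
}

static void example_unhash(struct sock *sk)
{
        /* ... remove sk from the protocol's lookup structures ... */
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}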
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (!rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
        kmem_cache_destroy(rsk_prot->slab);
        rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *rsk_prot = prot->rsk_prot;

        if (!rsk_prot)
                return 0;

        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                        prot->name);
        if (!rsk_prot->slab_name)
                return -ENOMEM;

        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
                                           prot->slab_flags, NULL);

        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }
        return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | prot->slab_flags,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (prot->twsk_prot != NULL) {
                        prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

                        if (prot->twsk_prot->twsk_slab_name == NULL)
                                goto out_free_request_sock_slab;

                        prot->twsk_prot->twsk_slab =
                                kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
                                                  0,
                                                  prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
                }
        }

        mutex_lock(&proto_list_mutex);
        list_add(&prot->node, &proto_list);
        assign_proto_idx(prot);
        mutex_unlock(&proto_list_mutex);
        return 0;

out_free_timewait_sock_slab_name:
        kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
        req_prot_cleanup(prot->rsk_prot);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
out:
        return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);

        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(prot->twsk_prot->twsk_slab_name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}
EXPORT_SYMBOL(proto_unregister);
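/*
 * Illustrative sketch (not part of the original file): a protocol module
 * registers its struct proto once at init time and unregisters it on exit.
 * "example_proto" and the init/exit functions are hypothetical; obj_size is
 * normally the size of a protocol-private structure that embeds struct sock.
 */
static struct proto example_proto = {
        .name           = "EXAMPLE",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),
};

static int __init example_proto_init(void)
{
        /* alloc_slab != 0: back sk_alloc() with a dedicated kmem_cache */
        return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
        proto_unregister(&example_proto);
}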
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
        return proto->memory_pressure != NULL ?
               proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
}

static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &proto_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
        if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
                return -ENOMEM;

        return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */