ALSA: seq: Use bool for snd_seq_queue internal flags
[linux/fpc-iii.git] / net / core / sock.c
blobd468ffb5a31c681b1e569413187a16afb34bc7d0
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
121 #include <asm/uaccess.h>
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
139 #include <trace/events/sock.h>
141 #include <net/tcp.h>
142 #include <net/busy_poll.h>
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
153 * Test to see if the opener of the socket had when the socket was
154 * created and the current process has the capability @cap in the user
155 * namespace @user_ns.
157 bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
163 EXPORT_SYMBOL(sk_ns_capable);
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
170 * Test to see if the opener of the socket had when the socket was
171 * created and the current process has the capability @cap in all user
172 * namespaces.
174 bool sk_capable(const struct sock *sk, int cap)
176 return sk_ns_capable(sk, &init_user_ns, cap);
178 EXPORT_SYMBOL(sk_capable);
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
185 * Test to see if the opener of the socket had when the socket was created
186 * and the current process has the capability @cap over the network namespace
187 * the socket is a member of.
189 bool sk_net_capable(const struct sock *sk, int cap)
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 EXPORT_SYMBOL(sk_net_capable);
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family:
199 static struct lock_class_key af_family_keys[AF_MAX];
200 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 * Make lock validator output more readable. (we pre-construct these
204 * strings build-time, so that runtime initialization of socket
205 * locks is fast):
207 static const char *const af_family_key_strings[AF_MAX+1] = {
208 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
209 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
210 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
211 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
212 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
213 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
214 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
215 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
216 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
217 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
218 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
219 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
220 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
221 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
222 "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
225 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
226 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
227 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
228 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
229 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
230 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
231 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
232 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
233 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
234 "slock-27" , "slock-28" , "slock-AF_CAN" ,
235 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
236 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
237 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
238 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
239 "slock-AF_QIPCRTR", "slock-AF_MAX"
241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
243 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
244 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
245 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
246 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
247 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
248 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
249 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
250 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
251 "clock-27" , "clock-28" , "clock-AF_CAN" ,
252 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
253 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
254 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
255 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
256 "clock-AF_QIPCRTR", "clock-AF_MAX"
260 * sk_callback_lock locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
263 static struct lock_class_key af_callback_keys[AF_MAX];
265 /* Take into consideration the size of the struct sk_buff overhead in the
266 * determination of these values, since that is non-constant across
267 * platforms. This makes socket queueing behavior and performance
268 * not depend upon such differences.
270 #define _SK_MEM_PACKETS 256
271 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
272 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
273 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
275 /* Run time adjustable parameters. */
276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
277 EXPORT_SYMBOL(sysctl_wmem_max);
278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
279 EXPORT_SYMBOL(sysctl_rmem_max);
280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
283 /* Maximal space eaten by iovec or ancillary data plus some space */
284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
285 EXPORT_SYMBOL(sysctl_optmem_max);
287 int sysctl_tstamp_allow_data __read_mostly = 1;
289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290 EXPORT_SYMBOL_GPL(memalloc_socks);
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298 * to meet the requirements
300 void sk_set_memalloc(struct sock *sk)
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
304 static_key_slow_inc(&memalloc_socks);
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 void sk_clear_memalloc(struct sock *sk)
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
312 static_key_slow_dec(&memalloc_socks);
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 sk_mem_reclaim(sk);
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327 int ret;
328 unsigned long pflags = current->flags;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 current->flags |= PF_MEMALLOC;
334 ret = sk->sk_backlog_rcv(sk, skb);
335 tsk_restore_flags(current, pflags, PF_MEMALLOC);
337 return ret;
339 EXPORT_SYMBOL(__sk_backlog_rcv);
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
343 struct timeval tv;
345 if (optlen < sizeof(tv))
346 return -EINVAL;
347 if (copy_from_user(&tv, optval, sizeof(tv)))
348 return -EFAULT;
349 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 return -EDOM;
352 if (tv.tv_sec < 0) {
353 static int warned __read_mostly;
355 *timeo_p = 0;
356 if (warned < 10 && net_ratelimit()) {
357 warned++;
358 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 __func__, current->comm, task_pid_nr(current));
361 return 0;
363 *timeo_p = MAX_SCHEDULE_TIMEOUT;
364 if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 return 0;
366 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 return 0;
371 static void sock_warn_obsolete_bsdism(const char *name)
373 static int warned;
374 static char warncomm[TASK_COMM_LEN];
375 if (strcmp(warncomm, current->comm) && warned < 5) {
376 strcpy(warncomm, current->comm);
377 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 warncomm, name);
379 warned++;
383 static bool sock_needs_netstamp(const struct sock *sk)
385 switch (sk->sk_family) {
386 case AF_UNSPEC:
387 case AF_UNIX:
388 return false;
389 default:
390 return true;
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
396 if (sk->sk_flags & flags) {
397 sk->sk_flags &= ~flags;
398 if (sock_needs_netstamp(sk) &&
399 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 net_disable_timestamp();
405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
407 unsigned long flags;
408 struct sk_buff_head *list = &sk->sk_receive_queue;
410 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
411 atomic_inc(&sk->sk_drops);
412 trace_sock_rcvqueue_full(sk, skb);
413 return -ENOMEM;
416 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
417 atomic_inc(&sk->sk_drops);
418 return -ENOBUFS;
421 skb->dev = NULL;
422 skb_set_owner_r(skb, sk);
424 /* we escape from rcu protected region, make sure we dont leak
425 * a norefcounted dst
427 skb_dst_force(skb);
429 spin_lock_irqsave(&list->lock, flags);
430 sock_skb_set_dropcount(sk, skb);
431 __skb_queue_tail(list, skb);
432 spin_unlock_irqrestore(&list->lock, flags);
434 if (!sock_flag(sk, SOCK_DEAD))
435 sk->sk_data_ready(sk);
436 return 0;
438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
442 int err;
444 err = sk_filter(sk, skb);
445 if (err)
446 return err;
448 return __sock_queue_rcv_skb(sk, skb);
450 EXPORT_SYMBOL(sock_queue_rcv_skb);
452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
453 const int nested, unsigned int trim_cap, bool refcounted)
455 int rc = NET_RX_SUCCESS;
457 if (sk_filter_trim_cap(sk, skb, trim_cap))
458 goto discard_and_relse;
460 skb->dev = NULL;
462 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
463 atomic_inc(&sk->sk_drops);
464 goto discard_and_relse;
466 if (nested)
467 bh_lock_sock_nested(sk);
468 else
469 bh_lock_sock(sk);
470 if (!sock_owned_by_user(sk)) {
472 * trylock + unlock semantics:
474 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
476 rc = sk_backlog_rcv(sk, skb);
478 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
479 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
480 bh_unlock_sock(sk);
481 atomic_inc(&sk->sk_drops);
482 goto discard_and_relse;
485 bh_unlock_sock(sk);
486 out:
487 if (refcounted)
488 sock_put(sk);
489 return rc;
490 discard_and_relse:
491 kfree_skb(skb);
492 goto out;
494 EXPORT_SYMBOL(__sk_receive_skb);
496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
498 struct dst_entry *dst = __sk_dst_get(sk);
500 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501 sk_tx_queue_clear(sk);
502 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
503 dst_release(dst);
504 return NULL;
507 return dst;
509 EXPORT_SYMBOL(__sk_dst_check);
511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
513 struct dst_entry *dst = sk_dst_get(sk);
515 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
516 sk_dst_reset(sk);
517 dst_release(dst);
518 return NULL;
521 return dst;
523 EXPORT_SYMBOL(sk_dst_check);
525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
526 int optlen)
528 int ret = -ENOPROTOOPT;
529 #ifdef CONFIG_NETDEVICES
530 struct net *net = sock_net(sk);
531 char devname[IFNAMSIZ];
532 int index;
534 /* Sorry... */
535 ret = -EPERM;
536 if (!ns_capable(net->user_ns, CAP_NET_RAW))
537 goto out;
539 ret = -EINVAL;
540 if (optlen < 0)
541 goto out;
543 /* Bind this socket to a particular device like "eth0",
544 * as specified in the passed interface name. If the
545 * name is "" or the option length is zero the socket
546 * is not bound.
548 if (optlen > IFNAMSIZ - 1)
549 optlen = IFNAMSIZ - 1;
550 memset(devname, 0, sizeof(devname));
552 ret = -EFAULT;
553 if (copy_from_user(devname, optval, optlen))
554 goto out;
556 index = 0;
557 if (devname[0] != '\0') {
558 struct net_device *dev;
560 rcu_read_lock();
561 dev = dev_get_by_name_rcu(net, devname);
562 if (dev)
563 index = dev->ifindex;
564 rcu_read_unlock();
565 ret = -ENODEV;
566 if (!dev)
567 goto out;
570 lock_sock(sk);
571 sk->sk_bound_dev_if = index;
572 sk_dst_reset(sk);
573 release_sock(sk);
575 ret = 0;
577 out:
578 #endif
580 return ret;
583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
584 int __user *optlen, int len)
586 int ret = -ENOPROTOOPT;
587 #ifdef CONFIG_NETDEVICES
588 struct net *net = sock_net(sk);
589 char devname[IFNAMSIZ];
591 if (sk->sk_bound_dev_if == 0) {
592 len = 0;
593 goto zero;
596 ret = -EINVAL;
597 if (len < IFNAMSIZ)
598 goto out;
600 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
601 if (ret)
602 goto out;
604 len = strlen(devname) + 1;
606 ret = -EFAULT;
607 if (copy_to_user(optval, devname, len))
608 goto out;
610 zero:
611 ret = -EFAULT;
612 if (put_user(len, optlen))
613 goto out;
615 ret = 0;
617 out:
618 #endif
620 return ret;
623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
625 if (valbool)
626 sock_set_flag(sk, bit);
627 else
628 sock_reset_flag(sk, bit);
631 bool sk_mc_loop(struct sock *sk)
633 if (dev_recursion_level())
634 return false;
635 if (!sk)
636 return true;
637 switch (sk->sk_family) {
638 case AF_INET:
639 return inet_sk(sk)->mc_loop;
640 #if IS_ENABLED(CONFIG_IPV6)
641 case AF_INET6:
642 return inet6_sk(sk)->mc_loop;
643 #endif
645 WARN_ON(1);
646 return true;
648 EXPORT_SYMBOL(sk_mc_loop);
651 * This is meant for all protocols to use and covers goings on
652 * at the socket level. Everything here is generic.
655 int sock_setsockopt(struct socket *sock, int level, int optname,
656 char __user *optval, unsigned int optlen)
658 struct sock *sk = sock->sk;
659 int val;
660 int valbool;
661 struct linger ling;
662 int ret = 0;
665 * Options without arguments
668 if (optname == SO_BINDTODEVICE)
669 return sock_setbindtodevice(sk, optval, optlen);
671 if (optlen < sizeof(int))
672 return -EINVAL;
674 if (get_user(val, (int __user *)optval))
675 return -EFAULT;
677 valbool = val ? 1 : 0;
679 lock_sock(sk);
681 switch (optname) {
682 case SO_DEBUG:
683 if (val && !capable(CAP_NET_ADMIN))
684 ret = -EACCES;
685 else
686 sock_valbool_flag(sk, SOCK_DBG, valbool);
687 break;
688 case SO_REUSEADDR:
689 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
690 break;
691 case SO_REUSEPORT:
692 sk->sk_reuseport = valbool;
693 break;
694 case SO_TYPE:
695 case SO_PROTOCOL:
696 case SO_DOMAIN:
697 case SO_ERROR:
698 ret = -ENOPROTOOPT;
699 break;
700 case SO_DONTROUTE:
701 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
702 sk_dst_reset(sk);
703 break;
704 case SO_BROADCAST:
705 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
706 break;
707 case SO_SNDBUF:
708 /* Don't error on this BSD doesn't and if you think
709 * about it this is right. Otherwise apps have to
710 * play 'guess the biggest size' games. RCVBUF/SNDBUF
711 * are treated in BSD as hints
713 val = min_t(u32, val, sysctl_wmem_max);
714 set_sndbuf:
715 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
716 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
717 /* Wake up sending tasks if we upped the value. */
718 sk->sk_write_space(sk);
719 break;
721 case SO_SNDBUFFORCE:
722 if (!capable(CAP_NET_ADMIN)) {
723 ret = -EPERM;
724 break;
726 goto set_sndbuf;
728 case SO_RCVBUF:
729 /* Don't error on this BSD doesn't and if you think
730 * about it this is right. Otherwise apps have to
731 * play 'guess the biggest size' games. RCVBUF/SNDBUF
732 * are treated in BSD as hints
734 val = min_t(u32, val, sysctl_rmem_max);
735 set_rcvbuf:
736 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
738 * We double it on the way in to account for
739 * "struct sk_buff" etc. overhead. Applications
740 * assume that the SO_RCVBUF setting they make will
741 * allow that much actual data to be received on that
742 * socket.
744 * Applications are unaware that "struct sk_buff" and
745 * other overheads allocate from the receive buffer
746 * during socket buffer allocation.
748 * And after considering the possible alternatives,
749 * returning the value we actually used in getsockopt
750 * is the most desirable behavior.
752 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
753 break;
755 case SO_RCVBUFFORCE:
756 if (!capable(CAP_NET_ADMIN)) {
757 ret = -EPERM;
758 break;
760 goto set_rcvbuf;
762 case SO_KEEPALIVE:
763 #ifdef CONFIG_INET
764 if (sk->sk_protocol == IPPROTO_TCP &&
765 sk->sk_type == SOCK_STREAM)
766 tcp_set_keepalive(sk, valbool);
767 #endif
768 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
769 break;
771 case SO_OOBINLINE:
772 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
773 break;
775 case SO_NO_CHECK:
776 sk->sk_no_check_tx = valbool;
777 break;
779 case SO_PRIORITY:
780 if ((val >= 0 && val <= 6) ||
781 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
782 sk->sk_priority = val;
783 else
784 ret = -EPERM;
785 break;
787 case SO_LINGER:
788 if (optlen < sizeof(ling)) {
789 ret = -EINVAL; /* 1003.1g */
790 break;
792 if (copy_from_user(&ling, optval, sizeof(ling))) {
793 ret = -EFAULT;
794 break;
796 if (!ling.l_onoff)
797 sock_reset_flag(sk, SOCK_LINGER);
798 else {
799 #if (BITS_PER_LONG == 32)
800 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
801 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
802 else
803 #endif
804 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
805 sock_set_flag(sk, SOCK_LINGER);
807 break;
809 case SO_BSDCOMPAT:
810 sock_warn_obsolete_bsdism("setsockopt");
811 break;
813 case SO_PASSCRED:
814 if (valbool)
815 set_bit(SOCK_PASSCRED, &sock->flags);
816 else
817 clear_bit(SOCK_PASSCRED, &sock->flags);
818 break;
820 case SO_TIMESTAMP:
821 case SO_TIMESTAMPNS:
822 if (valbool) {
823 if (optname == SO_TIMESTAMP)
824 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
825 else
826 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
827 sock_set_flag(sk, SOCK_RCVTSTAMP);
828 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 } else {
830 sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833 break;
835 case SO_TIMESTAMPING:
836 if (val & ~SOF_TIMESTAMPING_MASK) {
837 ret = -EINVAL;
838 break;
841 if (val & SOF_TIMESTAMPING_OPT_ID &&
842 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
843 if (sk->sk_protocol == IPPROTO_TCP &&
844 sk->sk_type == SOCK_STREAM) {
845 if ((1 << sk->sk_state) &
846 (TCPF_CLOSE | TCPF_LISTEN)) {
847 ret = -EINVAL;
848 break;
850 sk->sk_tskey = tcp_sk(sk)->snd_una;
851 } else {
852 sk->sk_tskey = 0;
855 sk->sk_tsflags = val;
856 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
857 sock_enable_timestamp(sk,
858 SOCK_TIMESTAMPING_RX_SOFTWARE);
859 else
860 sock_disable_timestamp(sk,
861 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
862 break;
864 case SO_RCVLOWAT:
865 if (val < 0)
866 val = INT_MAX;
867 sk->sk_rcvlowat = val ? : 1;
868 break;
870 case SO_RCVTIMEO:
871 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
872 break;
874 case SO_SNDTIMEO:
875 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
876 break;
878 case SO_ATTACH_FILTER:
879 ret = -EINVAL;
880 if (optlen == sizeof(struct sock_fprog)) {
881 struct sock_fprog fprog;
883 ret = -EFAULT;
884 if (copy_from_user(&fprog, optval, sizeof(fprog)))
885 break;
887 ret = sk_attach_filter(&fprog, sk);
889 break;
891 case SO_ATTACH_BPF:
892 ret = -EINVAL;
893 if (optlen == sizeof(u32)) {
894 u32 ufd;
896 ret = -EFAULT;
897 if (copy_from_user(&ufd, optval, sizeof(ufd)))
898 break;
900 ret = sk_attach_bpf(ufd, sk);
902 break;
904 case SO_ATTACH_REUSEPORT_CBPF:
905 ret = -EINVAL;
906 if (optlen == sizeof(struct sock_fprog)) {
907 struct sock_fprog fprog;
909 ret = -EFAULT;
910 if (copy_from_user(&fprog, optval, sizeof(fprog)))
911 break;
913 ret = sk_reuseport_attach_filter(&fprog, sk);
915 break;
917 case SO_ATTACH_REUSEPORT_EBPF:
918 ret = -EINVAL;
919 if (optlen == sizeof(u32)) {
920 u32 ufd;
922 ret = -EFAULT;
923 if (copy_from_user(&ufd, optval, sizeof(ufd)))
924 break;
926 ret = sk_reuseport_attach_bpf(ufd, sk);
928 break;
930 case SO_DETACH_FILTER:
931 ret = sk_detach_filter(sk);
932 break;
934 case SO_LOCK_FILTER:
935 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
936 ret = -EPERM;
937 else
938 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
939 break;
941 case SO_PASSSEC:
942 if (valbool)
943 set_bit(SOCK_PASSSEC, &sock->flags);
944 else
945 clear_bit(SOCK_PASSSEC, &sock->flags);
946 break;
947 case SO_MARK:
948 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
949 ret = -EPERM;
950 else
951 sk->sk_mark = val;
952 break;
954 case SO_RXQ_OVFL:
955 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
956 break;
958 case SO_WIFI_STATUS:
959 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
960 break;
962 case SO_PEEK_OFF:
963 if (sock->ops->set_peek_off)
964 ret = sock->ops->set_peek_off(sk, val);
965 else
966 ret = -EOPNOTSUPP;
967 break;
969 case SO_NOFCS:
970 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
971 break;
973 case SO_SELECT_ERR_QUEUE:
974 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
975 break;
977 #ifdef CONFIG_NET_RX_BUSY_POLL
978 case SO_BUSY_POLL:
979 /* allow unprivileged users to decrease the value */
980 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
981 ret = -EPERM;
982 else {
983 if (val < 0)
984 ret = -EINVAL;
985 else
986 sk->sk_ll_usec = val;
988 break;
989 #endif
991 case SO_MAX_PACING_RATE:
992 sk->sk_max_pacing_rate = val;
993 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
994 sk->sk_max_pacing_rate);
995 break;
997 case SO_INCOMING_CPU:
998 sk->sk_incoming_cpu = val;
999 break;
1001 case SO_CNX_ADVICE:
1002 if (val == 1)
1003 dst_negative_advice(sk);
1004 break;
1005 default:
1006 ret = -ENOPROTOOPT;
1007 break;
1009 release_sock(sk);
1010 return ret;
1012 EXPORT_SYMBOL(sock_setsockopt);
1015 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1016 struct ucred *ucred)
1018 ucred->pid = pid_vnr(pid);
1019 ucred->uid = ucred->gid = -1;
1020 if (cred) {
1021 struct user_namespace *current_ns = current_user_ns();
1023 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1024 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1028 int sock_getsockopt(struct socket *sock, int level, int optname,
1029 char __user *optval, int __user *optlen)
1031 struct sock *sk = sock->sk;
1033 union {
1034 int val;
1035 struct linger ling;
1036 struct timeval tm;
1037 } v;
1039 int lv = sizeof(int);
1040 int len;
1042 if (get_user(len, optlen))
1043 return -EFAULT;
1044 if (len < 0)
1045 return -EINVAL;
1047 memset(&v, 0, sizeof(v));
1049 switch (optname) {
1050 case SO_DEBUG:
1051 v.val = sock_flag(sk, SOCK_DBG);
1052 break;
1054 case SO_DONTROUTE:
1055 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1056 break;
1058 case SO_BROADCAST:
1059 v.val = sock_flag(sk, SOCK_BROADCAST);
1060 break;
1062 case SO_SNDBUF:
1063 v.val = sk->sk_sndbuf;
1064 break;
1066 case SO_RCVBUF:
1067 v.val = sk->sk_rcvbuf;
1068 break;
1070 case SO_REUSEADDR:
1071 v.val = sk->sk_reuse;
1072 break;
1074 case SO_REUSEPORT:
1075 v.val = sk->sk_reuseport;
1076 break;
1078 case SO_KEEPALIVE:
1079 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1080 break;
1082 case SO_TYPE:
1083 v.val = sk->sk_type;
1084 break;
1086 case SO_PROTOCOL:
1087 v.val = sk->sk_protocol;
1088 break;
1090 case SO_DOMAIN:
1091 v.val = sk->sk_family;
1092 break;
1094 case SO_ERROR:
1095 v.val = -sock_error(sk);
1096 if (v.val == 0)
1097 v.val = xchg(&sk->sk_err_soft, 0);
1098 break;
1100 case SO_OOBINLINE:
1101 v.val = sock_flag(sk, SOCK_URGINLINE);
1102 break;
1104 case SO_NO_CHECK:
1105 v.val = sk->sk_no_check_tx;
1106 break;
1108 case SO_PRIORITY:
1109 v.val = sk->sk_priority;
1110 break;
1112 case SO_LINGER:
1113 lv = sizeof(v.ling);
1114 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1115 v.ling.l_linger = sk->sk_lingertime / HZ;
1116 break;
1118 case SO_BSDCOMPAT:
1119 sock_warn_obsolete_bsdism("getsockopt");
1120 break;
1122 case SO_TIMESTAMP:
1123 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1124 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1125 break;
1127 case SO_TIMESTAMPNS:
1128 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1129 break;
1131 case SO_TIMESTAMPING:
1132 v.val = sk->sk_tsflags;
1133 break;
1135 case SO_RCVTIMEO:
1136 lv = sizeof(struct timeval);
1137 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1138 v.tm.tv_sec = 0;
1139 v.tm.tv_usec = 0;
1140 } else {
1141 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1142 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1144 break;
1146 case SO_SNDTIMEO:
1147 lv = sizeof(struct timeval);
1148 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1149 v.tm.tv_sec = 0;
1150 v.tm.tv_usec = 0;
1151 } else {
1152 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1153 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1155 break;
1157 case SO_RCVLOWAT:
1158 v.val = sk->sk_rcvlowat;
1159 break;
1161 case SO_SNDLOWAT:
1162 v.val = 1;
1163 break;
1165 case SO_PASSCRED:
1166 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1167 break;
1169 case SO_PEERCRED:
1171 struct ucred peercred;
1172 if (len > sizeof(peercred))
1173 len = sizeof(peercred);
1174 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1175 if (copy_to_user(optval, &peercred, len))
1176 return -EFAULT;
1177 goto lenout;
1180 case SO_PEERNAME:
1182 char address[128];
1184 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1185 return -ENOTCONN;
1186 if (lv < len)
1187 return -EINVAL;
1188 if (copy_to_user(optval, address, len))
1189 return -EFAULT;
1190 goto lenout;
1193 /* Dubious BSD thing... Probably nobody even uses it, but
1194 * the UNIX standard wants it for whatever reason... -DaveM
1196 case SO_ACCEPTCONN:
1197 v.val = sk->sk_state == TCP_LISTEN;
1198 break;
1200 case SO_PASSSEC:
1201 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1202 break;
1204 case SO_PEERSEC:
1205 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1207 case SO_MARK:
1208 v.val = sk->sk_mark;
1209 break;
1211 case SO_RXQ_OVFL:
1212 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1213 break;
1215 case SO_WIFI_STATUS:
1216 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1217 break;
1219 case SO_PEEK_OFF:
1220 if (!sock->ops->set_peek_off)
1221 return -EOPNOTSUPP;
1223 v.val = sk->sk_peek_off;
1224 break;
1225 case SO_NOFCS:
1226 v.val = sock_flag(sk, SOCK_NOFCS);
1227 break;
1229 case SO_BINDTODEVICE:
1230 return sock_getbindtodevice(sk, optval, optlen, len);
1232 case SO_GET_FILTER:
1233 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1234 if (len < 0)
1235 return len;
1237 goto lenout;
1239 case SO_LOCK_FILTER:
1240 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1241 break;
1243 case SO_BPF_EXTENSIONS:
1244 v.val = bpf_tell_extensions();
1245 break;
1247 case SO_SELECT_ERR_QUEUE:
1248 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1249 break;
1251 #ifdef CONFIG_NET_RX_BUSY_POLL
1252 case SO_BUSY_POLL:
1253 v.val = sk->sk_ll_usec;
1254 break;
1255 #endif
1257 case SO_MAX_PACING_RATE:
1258 v.val = sk->sk_max_pacing_rate;
1259 break;
1261 case SO_INCOMING_CPU:
1262 v.val = sk->sk_incoming_cpu;
1263 break;
1265 default:
1266 /* We implement the SO_SNDLOWAT etc to not be settable
1267 * (1003.1g 7).
1269 return -ENOPROTOOPT;
1272 if (len > lv)
1273 len = lv;
1274 if (copy_to_user(optval, &v, len))
1275 return -EFAULT;
1276 lenout:
1277 if (put_user(len, optlen))
1278 return -EFAULT;
1279 return 0;
1283 * Initialize an sk_lock.
1285 * (We also register the sk_lock with the lock validator.)
1287 static inline void sock_lock_init(struct sock *sk)
1289 sock_lock_init_class_and_name(sk,
1290 af_family_slock_key_strings[sk->sk_family],
1291 af_family_slock_keys + sk->sk_family,
1292 af_family_key_strings[sk->sk_family],
1293 af_family_keys + sk->sk_family);
1297 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1298 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1299 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1301 static void sock_copy(struct sock *nsk, const struct sock *osk)
1303 #ifdef CONFIG_SECURITY_NETWORK
1304 void *sptr = nsk->sk_security;
1305 #endif
1306 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1308 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1309 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1311 #ifdef CONFIG_SECURITY_NETWORK
1312 nsk->sk_security = sptr;
1313 security_sk_clone(osk, nsk);
1314 #endif
1317 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1318 int family)
1320 struct sock *sk;
1321 struct kmem_cache *slab;
1323 slab = prot->slab;
1324 if (slab != NULL) {
1325 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1326 if (!sk)
1327 return sk;
1328 if (priority & __GFP_ZERO)
1329 sk_prot_clear_nulls(sk, prot->obj_size);
1330 } else
1331 sk = kmalloc(prot->obj_size, priority);
1333 if (sk != NULL) {
1334 kmemcheck_annotate_bitfield(sk, flags);
1336 if (security_sk_alloc(sk, family, priority))
1337 goto out_free;
1339 if (!try_module_get(prot->owner))
1340 goto out_free_sec;
1341 sk_tx_queue_clear(sk);
1344 return sk;
1346 out_free_sec:
1347 security_sk_free(sk);
1348 out_free:
1349 if (slab != NULL)
1350 kmem_cache_free(slab, sk);
1351 else
1352 kfree(sk);
1353 return NULL;
1356 static void sk_prot_free(struct proto *prot, struct sock *sk)
1358 struct kmem_cache *slab;
1359 struct module *owner;
1361 owner = prot->owner;
1362 slab = prot->slab;
1364 cgroup_sk_free(&sk->sk_cgrp_data);
1365 mem_cgroup_sk_free(sk);
1366 security_sk_free(sk);
1367 if (slab != NULL)
1368 kmem_cache_free(slab, sk);
1369 else
1370 kfree(sk);
1371 module_put(owner);
1375 * sk_alloc - All socket objects are allocated here
1376 * @net: the applicable net namespace
1377 * @family: protocol family
1378 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1379 * @prot: struct proto associated with this new sock instance
1380 * @kern: is this to be a kernel socket?
1382 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1383 struct proto *prot, int kern)
1385 struct sock *sk;
1387 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1388 if (sk) {
1389 sk->sk_family = family;
1391 * See comment in struct sock definition to understand
1392 * why we need sk_prot_creator -acme
1394 sk->sk_prot = sk->sk_prot_creator = prot;
1395 sock_lock_init(sk);
1396 sk->sk_net_refcnt = kern ? 0 : 1;
1397 if (likely(sk->sk_net_refcnt))
1398 get_net(net);
1399 sock_net_set(sk, net);
1400 atomic_set(&sk->sk_wmem_alloc, 1);
1402 mem_cgroup_sk_alloc(sk);
1403 cgroup_sk_alloc(&sk->sk_cgrp_data);
1404 sock_update_classid(&sk->sk_cgrp_data);
1405 sock_update_netprioidx(&sk->sk_cgrp_data);
1406 sk_tx_queue_clear(sk);
1409 return sk;
1411 EXPORT_SYMBOL(sk_alloc);
1413 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1414 * grace period. This is the case for UDP sockets and TCP listeners.
1416 static void __sk_destruct(struct rcu_head *head)
1418 struct sock *sk = container_of(head, struct sock, sk_rcu);
1419 struct sk_filter *filter;
1421 if (sk->sk_destruct)
1422 sk->sk_destruct(sk);
1424 filter = rcu_dereference_check(sk->sk_filter,
1425 atomic_read(&sk->sk_wmem_alloc) == 0);
1426 if (filter) {
1427 sk_filter_uncharge(sk, filter);
1428 RCU_INIT_POINTER(sk->sk_filter, NULL);
1431 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1433 if (atomic_read(&sk->sk_omem_alloc))
1434 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1435 __func__, atomic_read(&sk->sk_omem_alloc));
1437 if (sk->sk_frag.page) {
1438 put_page(sk->sk_frag.page);
1439 sk->sk_frag.page = NULL;
1442 if (sk->sk_peer_cred)
1443 put_cred(sk->sk_peer_cred);
1444 put_pid(sk->sk_peer_pid);
1445 if (likely(sk->sk_net_refcnt))
1446 put_net(sock_net(sk));
1447 sk_prot_free(sk->sk_prot_creator, sk);
1450 void sk_destruct(struct sock *sk)
1452 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1454 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1455 reuseport_detach_sock(sk);
1456 use_call_rcu = true;
1459 if (use_call_rcu)
1460 call_rcu(&sk->sk_rcu, __sk_destruct);
1461 else
1462 __sk_destruct(&sk->sk_rcu);
1465 static void __sk_free(struct sock *sk)
1467 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1468 sock_diag_broadcast_destroy(sk);
1469 else
1470 sk_destruct(sk);
1473 void sk_free(struct sock *sk)
1476 * We subtract one from sk_wmem_alloc and can know if
1477 * some packets are still in some tx queue.
1478 * If not null, sock_wfree() will call __sk_free(sk) later
1480 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1481 __sk_free(sk);
1483 EXPORT_SYMBOL(sk_free);
1486 * sk_clone_lock - clone a socket, and lock its clone
1487 * @sk: the socket to clone
1488 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 struct sock *newsk;
1495 bool is_charged = true;
1497 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1498 if (newsk != NULL) {
1499 struct sk_filter *filter;
1501 sock_copy(newsk, sk);
1503 newsk->sk_prot_creator = sk->sk_prot;
1505 /* SANITY */
1506 if (likely(newsk->sk_net_refcnt))
1507 get_net(sock_net(newsk));
1508 sk_node_init(&newsk->sk_node);
1509 sock_lock_init(newsk);
1510 bh_lock_sock(newsk);
1511 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1512 newsk->sk_backlog.len = 0;
1514 atomic_set(&newsk->sk_rmem_alloc, 0);
1516 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1518 atomic_set(&newsk->sk_wmem_alloc, 1);
1519 atomic_set(&newsk->sk_omem_alloc, 0);
1520 skb_queue_head_init(&newsk->sk_receive_queue);
1521 skb_queue_head_init(&newsk->sk_write_queue);
1523 rwlock_init(&newsk->sk_callback_lock);
1524 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1525 af_callback_keys + newsk->sk_family,
1526 af_family_clock_key_strings[newsk->sk_family]);
1528 newsk->sk_dst_cache = NULL;
1529 newsk->sk_wmem_queued = 0;
1530 newsk->sk_forward_alloc = 0;
1531 atomic_set(&newsk->sk_drops, 0);
1532 newsk->sk_send_head = NULL;
1533 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1535 sock_reset_flag(newsk, SOCK_DONE);
1536 cgroup_sk_clone(&newsk->sk_cgrp_data);
1537 skb_queue_head_init(&newsk->sk_error_queue);
1539 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1540 if (filter != NULL)
1541 /* though it's an empty new sock, the charging may fail
1542 * if sysctl_optmem_max was changed between creation of
1543 * original socket and cloning
1545 is_charged = sk_filter_charge(newsk, filter);
1547 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1548 /* We need to make sure that we don't uncharge the new
1549 * socket if we couldn't charge it in the first place
1550 * as otherwise we uncharge the parent's filter.
1552 if (!is_charged)
1553 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1554 /* It is still raw copy of parent, so invalidate
1555 * destructor and make plain sk_free() */
1556 newsk->sk_destruct = NULL;
1557 bh_unlock_sock(newsk);
1558 sk_free(newsk);
1559 newsk = NULL;
1560 goto out;
1562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1564 newsk->sk_err = 0;
1565 newsk->sk_err_soft = 0;
1566 newsk->sk_priority = 0;
1567 newsk->sk_incoming_cpu = raw_smp_processor_id();
1568 atomic64_set(&newsk->sk_cookie, 0);
1570 mem_cgroup_sk_alloc(newsk);
1572 * Before updating sk_refcnt, we must commit prior changes to memory
1573 * (Documentation/RCU/rculist_nulls.txt for details)
1575 smp_wmb();
1576 atomic_set(&newsk->sk_refcnt, 2);
1579 * Increment the counter in the same struct proto as the master
1580 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1581 * is the same as sk->sk_prot->socks, as this field was copied
1582 * with memcpy).
1584 * This _changes_ the previous behaviour, where
1585 * tcp_create_openreq_child always was incrementing the
1586 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1587 * to be taken into account in all callers. -acme
1589 sk_refcnt_debug_inc(newsk);
1590 sk_set_socket(newsk, NULL);
1591 sk_tx_queue_clear(newsk);
1592 newsk->sk_wq = NULL;
1594 if (newsk->sk_prot->sockets_allocated)
1595 sk_sockets_allocated_inc(newsk);
1597 if (sock_needs_netstamp(sk) &&
1598 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1599 net_enable_timestamp();
1601 out:
1602 return newsk;
1604 EXPORT_SYMBOL_GPL(sk_clone_lock);
1606 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1608 u32 max_segs = 1;
1610 sk_dst_set(sk, dst);
1611 sk->sk_route_caps = dst->dev->features;
1612 if (sk->sk_route_caps & NETIF_F_GSO)
1613 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1614 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1615 if (sk_can_gso(sk)) {
1616 if (dst->header_len) {
1617 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1618 } else {
1619 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1620 sk->sk_gso_max_size = dst->dev->gso_max_size;
1621 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1624 sk->sk_gso_max_segs = max_segs;
1626 EXPORT_SYMBOL_GPL(sk_setup_caps);
1629 * Simple resource managers for sockets.
1634 * Write buffer destructor automatically called from kfree_skb.
1636 void sock_wfree(struct sk_buff *skb)
1638 struct sock *sk = skb->sk;
1639 unsigned int len = skb->truesize;
1641 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1643 * Keep a reference on sk_wmem_alloc, this will be released
1644 * after sk_write_space() call
1646 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1647 sk->sk_write_space(sk);
1648 len = 1;
1651 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1652 * could not do because of in-flight packets
1654 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1655 __sk_free(sk);
1657 EXPORT_SYMBOL(sock_wfree);
1659 /* This variant of sock_wfree() is used by TCP,
1660 * since it sets SOCK_USE_WRITE_QUEUE.
1662 void __sock_wfree(struct sk_buff *skb)
1664 struct sock *sk = skb->sk;
1666 if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1667 __sk_free(sk);
1670 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1672 skb_orphan(skb);
1673 skb->sk = sk;
1674 #ifdef CONFIG_INET
1675 if (unlikely(!sk_fullsock(sk))) {
1676 skb->destructor = sock_edemux;
1677 sock_hold(sk);
1678 return;
1680 #endif
1681 skb->destructor = sock_wfree;
1682 skb_set_hash_from_sk(skb, sk);
1684 * We used to take a refcount on sk, but following operation
1685 * is enough to guarantee sk_free() wont free this sock until
1686 * all in-flight packets are completed
1688 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1690 EXPORT_SYMBOL(skb_set_owner_w);
1692 /* This helper is used by netem, as it can hold packets in its
1693 * delay queue. We want to allow the owner socket to send more
1694 * packets, as if they were already TX completed by a typical driver.
1695 * But we also want to keep skb->sk set because some packet schedulers
1696 * rely on it (sch_fq for example).
1698 void skb_orphan_partial(struct sk_buff *skb)
1700 if (skb_is_tcp_pure_ack(skb))
1701 return;
1703 if (skb->destructor == sock_wfree
1704 #ifdef CONFIG_INET
1705 || skb->destructor == tcp_wfree
1706 #endif
1708 struct sock *sk = skb->sk;
1710 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1711 atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1712 skb->destructor = sock_efree;
1714 } else {
1715 skb_orphan(skb);
1718 EXPORT_SYMBOL(skb_orphan_partial);
1721 * Read buffer destructor automatically called from kfree_skb.
1723 void sock_rfree(struct sk_buff *skb)
1725 struct sock *sk = skb->sk;
1726 unsigned int len = skb->truesize;
1728 atomic_sub(len, &sk->sk_rmem_alloc);
1729 sk_mem_uncharge(sk, len);
1731 EXPORT_SYMBOL(sock_rfree);
1734 * Buffer destructor for skbs that are not used directly in read or write
1735 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1737 void sock_efree(struct sk_buff *skb)
1739 sock_put(skb->sk);
1741 EXPORT_SYMBOL(sock_efree);
1743 kuid_t sock_i_uid(struct sock *sk)
1745 kuid_t uid;
1747 read_lock_bh(&sk->sk_callback_lock);
1748 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1749 read_unlock_bh(&sk->sk_callback_lock);
1750 return uid;
1752 EXPORT_SYMBOL(sock_i_uid);
1754 unsigned long sock_i_ino(struct sock *sk)
1756 unsigned long ino;
1758 read_lock_bh(&sk->sk_callback_lock);
1759 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1760 read_unlock_bh(&sk->sk_callback_lock);
1761 return ino;
1763 EXPORT_SYMBOL(sock_i_ino);
1766 * Allocate a skb from the socket's send buffer.
1768 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1769 gfp_t priority)
1771 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1772 struct sk_buff *skb = alloc_skb(size, priority);
1773 if (skb) {
1774 skb_set_owner_w(skb, sk);
1775 return skb;
1778 return NULL;
1780 EXPORT_SYMBOL(sock_wmalloc);
1783 * Allocate a memory block from the socket's option memory buffer.
1785 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1787 if ((unsigned int)size <= sysctl_optmem_max &&
1788 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1789 void *mem;
1790 /* First do the add, to avoid the race if kmalloc
1791 * might sleep.
1793 atomic_add(size, &sk->sk_omem_alloc);
1794 mem = kmalloc(size, priority);
1795 if (mem)
1796 return mem;
1797 atomic_sub(size, &sk->sk_omem_alloc);
1799 return NULL;
1801 EXPORT_SYMBOL(sock_kmalloc);
1803 /* Free an option memory block. Note, we actually want the inline
1804 * here as this allows gcc to detect the nullify and fold away the
1805 * condition entirely.
1807 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1808 const bool nullify)
1810 if (WARN_ON_ONCE(!mem))
1811 return;
1812 if (nullify)
1813 kzfree(mem);
1814 else
1815 kfree(mem);
1816 atomic_sub(size, &sk->sk_omem_alloc);
1819 void sock_kfree_s(struct sock *sk, void *mem, int size)
1821 __sock_kfree_s(sk, mem, size, false);
1823 EXPORT_SYMBOL(sock_kfree_s);
1825 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1827 __sock_kfree_s(sk, mem, size, true);
1829 EXPORT_SYMBOL(sock_kzfree_s);
1831 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1832 I think, these locks should be removed for datagram sockets.
1834 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1836 DEFINE_WAIT(wait);
1838 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1839 for (;;) {
1840 if (!timeo)
1841 break;
1842 if (signal_pending(current))
1843 break;
1844 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1845 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1846 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1847 break;
1848 if (sk->sk_shutdown & SEND_SHUTDOWN)
1849 break;
1850 if (sk->sk_err)
1851 break;
1852 timeo = schedule_timeout(timeo);
1854 finish_wait(sk_sleep(sk), &wait);
1855 return timeo;
1860 * Generic send/receive buffer handlers
1863 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1864 unsigned long data_len, int noblock,
1865 int *errcode, int max_page_order)
1867 struct sk_buff *skb;
1868 long timeo;
1869 int err;
1871 timeo = sock_sndtimeo(sk, noblock);
1872 for (;;) {
1873 err = sock_error(sk);
1874 if (err != 0)
1875 goto failure;
1877 err = -EPIPE;
1878 if (sk->sk_shutdown & SEND_SHUTDOWN)
1879 goto failure;
1881 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1882 break;
1884 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1885 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1886 err = -EAGAIN;
1887 if (!timeo)
1888 goto failure;
1889 if (signal_pending(current))
1890 goto interrupted;
1891 timeo = sock_wait_for_wmem(sk, timeo);
1893 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1894 errcode, sk->sk_allocation);
1895 if (skb)
1896 skb_set_owner_w(skb, sk);
1897 return skb;
1899 interrupted:
1900 err = sock_intr_errno(timeo);
1901 failure:
1902 *errcode = err;
1903 return NULL;
1905 EXPORT_SYMBOL(sock_alloc_send_pskb);
1907 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1908 int noblock, int *errcode)
1910 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1912 EXPORT_SYMBOL(sock_alloc_send_skb);
1914 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1915 struct sockcm_cookie *sockc)
1917 u32 tsflags;
1919 switch (cmsg->cmsg_type) {
1920 case SO_MARK:
1921 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1922 return -EPERM;
1923 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1924 return -EINVAL;
1925 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1926 break;
1927 case SO_TIMESTAMPING:
1928 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1929 return -EINVAL;
1931 tsflags = *(u32 *)CMSG_DATA(cmsg);
1932 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1933 return -EINVAL;
1935 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1936 sockc->tsflags |= tsflags;
1937 break;
1938 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1939 case SCM_RIGHTS:
1940 case SCM_CREDENTIALS:
1941 break;
1942 default:
1943 return -EINVAL;
1945 return 0;
1947 EXPORT_SYMBOL(__sock_cmsg_send);
1949 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1950 struct sockcm_cookie *sockc)
1952 struct cmsghdr *cmsg;
1953 int ret;
1955 for_each_cmsghdr(cmsg, msg) {
1956 if (!CMSG_OK(msg, cmsg))
1957 return -EINVAL;
1958 if (cmsg->cmsg_level != SOL_SOCKET)
1959 continue;
1960 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1961 if (ret)
1962 return ret;
1964 return 0;
1966 EXPORT_SYMBOL(sock_cmsg_send);
1968 /* On 32bit arches, an skb frag is limited to 2^15 */
1969 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1972 * skb_page_frag_refill - check that a page_frag contains enough room
1973 * @sz: minimum size of the fragment we want to get
1974 * @pfrag: pointer to page_frag
1975 * @gfp: priority for memory allocation
1977 * Note: While this allocator tries to use high order pages, there is
1978 * no guarantee that allocations succeed. Therefore, @sz MUST be
1979 * less or equal than PAGE_SIZE.
1981 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1983 if (pfrag->page) {
1984 if (page_ref_count(pfrag->page) == 1) {
1985 pfrag->offset = 0;
1986 return true;
1988 if (pfrag->offset + sz <= pfrag->size)
1989 return true;
1990 put_page(pfrag->page);
1993 pfrag->offset = 0;
1994 if (SKB_FRAG_PAGE_ORDER) {
1995 /* Avoid direct reclaim but allow kswapd to wake */
1996 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1997 __GFP_COMP | __GFP_NOWARN |
1998 __GFP_NORETRY,
1999 SKB_FRAG_PAGE_ORDER);
2000 if (likely(pfrag->page)) {
2001 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2002 return true;
2005 pfrag->page = alloc_page(gfp);
2006 if (likely(pfrag->page)) {
2007 pfrag->size = PAGE_SIZE;
2008 return true;
2010 return false;
2012 EXPORT_SYMBOL(skb_page_frag_refill);
2014 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2016 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2017 return true;
2019 sk_enter_memory_pressure(sk);
2020 sk_stream_moderate_sndbuf(sk);
2021 return false;
2023 EXPORT_SYMBOL(sk_page_frag_refill);
2025 static void __lock_sock(struct sock *sk)
2026 __releases(&sk->sk_lock.slock)
2027 __acquires(&sk->sk_lock.slock)
2029 DEFINE_WAIT(wait);
2031 for (;;) {
2032 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2033 TASK_UNINTERRUPTIBLE);
2034 spin_unlock_bh(&sk->sk_lock.slock);
2035 schedule();
2036 spin_lock_bh(&sk->sk_lock.slock);
2037 if (!sock_owned_by_user(sk))
2038 break;
2040 finish_wait(&sk->sk_lock.wq, &wait);
2043 static void __release_sock(struct sock *sk)
2044 __releases(&sk->sk_lock.slock)
2045 __acquires(&sk->sk_lock.slock)
2047 struct sk_buff *skb, *next;
2049 while ((skb = sk->sk_backlog.head) != NULL) {
2050 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2052 spin_unlock_bh(&sk->sk_lock.slock);
2054 do {
2055 next = skb->next;
2056 prefetch(next);
2057 WARN_ON_ONCE(skb_dst_is_noref(skb));
2058 skb->next = NULL;
2059 sk_backlog_rcv(sk, skb);
2061 cond_resched();
2063 skb = next;
2064 } while (skb != NULL);
2066 spin_lock_bh(&sk->sk_lock.slock);
2070 * Doing the zeroing here guarantee we can not loop forever
2071 * while a wild producer attempts to flood us.
2073 sk->sk_backlog.len = 0;
2076 void __sk_flush_backlog(struct sock *sk)
2078 spin_lock_bh(&sk->sk_lock.slock);
2079 __release_sock(sk);
2080 spin_unlock_bh(&sk->sk_lock.slock);
2084 * sk_wait_data - wait for data to arrive at sk_receive_queue
2085 * @sk: sock to wait on
2086 * @timeo: for how long
2087 * @skb: last skb seen on sk_receive_queue
2089 * Now socket state including sk->sk_err is changed only under lock,
2090 * hence we may omit checks after joining wait queue.
2091 * We check receive queue before schedule() only as optimization;
2092 * it is very likely that release_sock() added new data.
2094 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2096 int rc;
2097 DEFINE_WAIT(wait);
2099 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2100 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2101 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2102 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2103 finish_wait(sk_sleep(sk), &wait);
2104 return rc;
2106 EXPORT_SYMBOL(sk_wait_data);
2109 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2110 * @sk: socket
2111 * @size: memory size to allocate
2112 * @kind: allocation type
2114 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2115 * rmem allocation. This function assumes that protocols which have
2116 * memory_pressure use sk_wmem_queued as write buffer accounting.
2118 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2120 struct proto *prot = sk->sk_prot;
2121 int amt = sk_mem_pages(size);
2122 long allocated;
2124 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2126 allocated = sk_memory_allocated_add(sk, amt);
2128 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2129 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2130 goto suppress_allocation;
2132 /* Under limit. */
2133 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2134 sk_leave_memory_pressure(sk);
2135 return 1;
2138 /* Under pressure. */
2139 if (allocated > sk_prot_mem_limits(sk, 1))
2140 sk_enter_memory_pressure(sk);
2142 /* Over hard limit. */
2143 if (allocated > sk_prot_mem_limits(sk, 2))
2144 goto suppress_allocation;
2146 /* guarantee minimum buffer size under pressure */
2147 if (kind == SK_MEM_RECV) {
2148 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2149 return 1;
2151 } else { /* SK_MEM_SEND */
2152 if (sk->sk_type == SOCK_STREAM) {
2153 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2154 return 1;
2155 } else if (atomic_read(&sk->sk_wmem_alloc) <
2156 prot->sysctl_wmem[0])
2157 return 1;
2160 if (sk_has_memory_pressure(sk)) {
2161 u64 alloc;
2163 if (!sk_under_memory_pressure(sk))
2164 return 1;
2165 alloc = sk_sockets_allocated_read_positive(sk);
2166 if (sk_prot_mem_limits(sk, 2) > alloc *
2167 sk_mem_pages(sk->sk_wmem_queued +
2168 atomic_read(&sk->sk_rmem_alloc) +
2169 sk->sk_forward_alloc))
2170 return 1;
2173 suppress_allocation:
2175 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2176 sk_stream_moderate_sndbuf(sk);
2178 /* Fail only if socket is _under_ its sndbuf.
2179 * In this case we cannot block, so that we have to fail.
2181 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2182 return 1;
2185 trace_sock_exceed_buf_limit(sk, prot, allocated);
2187 /* Alas. Undo changes. */
2188 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2190 sk_memory_allocated_sub(sk, amt);
2192 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2193 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2195 return 0;
2197 EXPORT_SYMBOL(__sk_mem_schedule);
2200 * __sk_mem_reclaim - reclaim memory_allocated
2201 * @sk: socket
2202 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2204 void __sk_mem_reclaim(struct sock *sk, int amount)
2206 amount >>= SK_MEM_QUANTUM_SHIFT;
2207 sk_memory_allocated_sub(sk, amount);
2208 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2210 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2211 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2213 if (sk_under_memory_pressure(sk) &&
2214 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2215 sk_leave_memory_pressure(sk);
2217 EXPORT_SYMBOL(__sk_mem_reclaim);
2219 int sk_set_peek_off(struct sock *sk, int val)
2221 if (val < 0)
2222 return -EINVAL;
2224 sk->sk_peek_off = val;
2225 return 0;
2227 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2230 * Set of default routines for initialising struct proto_ops when
2231 * the protocol does not support a particular function. In certain
2232 * cases where it makes no sense for a protocol to have a "do nothing"
2233 * function, some default processing is provided.
2236 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2238 return -EOPNOTSUPP;
2240 EXPORT_SYMBOL(sock_no_bind);
2242 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2243 int len, int flags)
2245 return -EOPNOTSUPP;
2247 EXPORT_SYMBOL(sock_no_connect);
2249 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2251 return -EOPNOTSUPP;
2253 EXPORT_SYMBOL(sock_no_socketpair);
2255 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2257 return -EOPNOTSUPP;
2259 EXPORT_SYMBOL(sock_no_accept);
2261 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2262 int *len, int peer)
2264 return -EOPNOTSUPP;
2266 EXPORT_SYMBOL(sock_no_getname);
2268 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2270 return 0;
2272 EXPORT_SYMBOL(sock_no_poll);
2274 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2276 return -EOPNOTSUPP;
2278 EXPORT_SYMBOL(sock_no_ioctl);
2280 int sock_no_listen(struct socket *sock, int backlog)
2282 return -EOPNOTSUPP;
2284 EXPORT_SYMBOL(sock_no_listen);
2286 int sock_no_shutdown(struct socket *sock, int how)
2288 return -EOPNOTSUPP;
2290 EXPORT_SYMBOL(sock_no_shutdown);
2292 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2293 char __user *optval, unsigned int optlen)
2295 return -EOPNOTSUPP;
2297 EXPORT_SYMBOL(sock_no_setsockopt);
2299 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2300 char __user *optval, int __user *optlen)
2302 return -EOPNOTSUPP;
2304 EXPORT_SYMBOL(sock_no_getsockopt);
2306 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2308 return -EOPNOTSUPP;
2310 EXPORT_SYMBOL(sock_no_sendmsg);
2312 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2313 int flags)
2315 return -EOPNOTSUPP;
2317 EXPORT_SYMBOL(sock_no_recvmsg);
2319 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2321 /* Mirror missing mmap method error code */
2322 return -ENODEV;
2324 EXPORT_SYMBOL(sock_no_mmap);
2327 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2328 * various sock-based usage counts.
2330 void __receive_sock(struct file *file)
2332 struct socket *sock;
2333 int error;
2336 * The resulting value of "error" is ignored here since we only
2337 * need to take action when the file is a socket and testing
2338 * "sock" for NULL is sufficient.
2340 sock = sock_from_file(file, &error);
2341 if (sock) {
2342 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2343 sock_update_classid(&sock->sk->sk_cgrp_data);
2347 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2349 ssize_t res;
2350 struct msghdr msg = {.msg_flags = flags};
2351 struct kvec iov;
2352 char *kaddr = kmap(page);
2353 iov.iov_base = kaddr + offset;
2354 iov.iov_len = size;
2355 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2356 kunmap(page);
2357 return res;
2359 EXPORT_SYMBOL(sock_no_sendpage);
2362 * Default Socket Callbacks
2365 static void sock_def_wakeup(struct sock *sk)
2367 struct socket_wq *wq;
2369 rcu_read_lock();
2370 wq = rcu_dereference(sk->sk_wq);
2371 if (skwq_has_sleeper(wq))
2372 wake_up_interruptible_all(&wq->wait);
2373 rcu_read_unlock();
2376 static void sock_def_error_report(struct sock *sk)
2378 struct socket_wq *wq;
2380 rcu_read_lock();
2381 wq = rcu_dereference(sk->sk_wq);
2382 if (skwq_has_sleeper(wq))
2383 wake_up_interruptible_poll(&wq->wait, POLLERR);
2384 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2385 rcu_read_unlock();
2388 static void sock_def_readable(struct sock *sk)
2390 struct socket_wq *wq;
2392 rcu_read_lock();
2393 wq = rcu_dereference(sk->sk_wq);
2394 if (skwq_has_sleeper(wq))
2395 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2396 POLLRDNORM | POLLRDBAND);
2397 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2398 rcu_read_unlock();
2401 static void sock_def_write_space(struct sock *sk)
2403 struct socket_wq *wq;
2405 rcu_read_lock();
2407 /* Do not wake up a writer until he can make "significant"
2408 * progress. --DaveM
2410 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2411 wq = rcu_dereference(sk->sk_wq);
2412 if (skwq_has_sleeper(wq))
2413 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2414 POLLWRNORM | POLLWRBAND);
2416 /* Should agree with poll, otherwise some programs break */
2417 if (sock_writeable(sk))
2418 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2421 rcu_read_unlock();
2424 static void sock_def_destruct(struct sock *sk)
2428 void sk_send_sigurg(struct sock *sk)
2430 if (sk->sk_socket && sk->sk_socket->file)
2431 if (send_sigurg(&sk->sk_socket->file->f_owner))
2432 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2434 EXPORT_SYMBOL(sk_send_sigurg);
2436 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2437 unsigned long expires)
2439 if (!mod_timer(timer, expires))
2440 sock_hold(sk);
2442 EXPORT_SYMBOL(sk_reset_timer);
2444 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2446 if (del_timer(timer))
2447 __sock_put(sk);
2449 EXPORT_SYMBOL(sk_stop_timer);
2451 void sock_init_data(struct socket *sock, struct sock *sk)
2453 skb_queue_head_init(&sk->sk_receive_queue);
2454 skb_queue_head_init(&sk->sk_write_queue);
2455 skb_queue_head_init(&sk->sk_error_queue);
2457 sk->sk_send_head = NULL;
2459 init_timer(&sk->sk_timer);
2461 sk->sk_allocation = GFP_KERNEL;
2462 sk->sk_rcvbuf = sysctl_rmem_default;
2463 sk->sk_sndbuf = sysctl_wmem_default;
2464 sk->sk_state = TCP_CLOSE;
2465 sk_set_socket(sk, sock);
2467 sock_set_flag(sk, SOCK_ZAPPED);
2469 if (sock) {
2470 sk->sk_type = sock->type;
2471 sk->sk_wq = sock->wq;
2472 sock->sk = sk;
2473 } else
2474 sk->sk_wq = NULL;
2476 rwlock_init(&sk->sk_callback_lock);
2477 lockdep_set_class_and_name(&sk->sk_callback_lock,
2478 af_callback_keys + sk->sk_family,
2479 af_family_clock_key_strings[sk->sk_family]);
2481 sk->sk_state_change = sock_def_wakeup;
2482 sk->sk_data_ready = sock_def_readable;
2483 sk->sk_write_space = sock_def_write_space;
2484 sk->sk_error_report = sock_def_error_report;
2485 sk->sk_destruct = sock_def_destruct;
2487 sk->sk_frag.page = NULL;
2488 sk->sk_frag.offset = 0;
2489 sk->sk_peek_off = -1;
2491 sk->sk_peer_pid = NULL;
2492 sk->sk_peer_cred = NULL;
2493 sk->sk_write_pending = 0;
2494 sk->sk_rcvlowat = 1;
2495 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2496 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2498 sk->sk_stamp = ktime_set(-1L, 0);
2499 #if BITS_PER_LONG==32
2500 seqlock_init(&sk->sk_stamp_seq);
2501 #endif
2503 #ifdef CONFIG_NET_RX_BUSY_POLL
2504 sk->sk_napi_id = 0;
2505 sk->sk_ll_usec = sysctl_net_busy_read;
2506 #endif
2508 sk->sk_max_pacing_rate = ~0U;
2509 sk->sk_pacing_rate = ~0U;
2510 sk->sk_incoming_cpu = -1;
2512 * Before updating sk_refcnt, we must commit prior changes to memory
2513 * (Documentation/RCU/rculist_nulls.txt for details)
2515 smp_wmb();
2516 atomic_set(&sk->sk_refcnt, 1);
2517 atomic_set(&sk->sk_drops, 0);
2519 EXPORT_SYMBOL(sock_init_data);
2521 void lock_sock_nested(struct sock *sk, int subclass)
2523 might_sleep();
2524 spin_lock_bh(&sk->sk_lock.slock);
2525 if (sk->sk_lock.owned)
2526 __lock_sock(sk);
2527 sk->sk_lock.owned = 1;
2528 spin_unlock(&sk->sk_lock.slock);
2530 * The sk_lock has mutex_lock() semantics here:
2532 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2533 local_bh_enable();
2535 EXPORT_SYMBOL(lock_sock_nested);
2537 void release_sock(struct sock *sk)
2539 spin_lock_bh(&sk->sk_lock.slock);
2540 if (sk->sk_backlog.tail)
2541 __release_sock(sk);
2543 /* Warning : release_cb() might need to release sk ownership,
2544 * ie call sock_release_ownership(sk) before us.
2546 if (sk->sk_prot->release_cb)
2547 sk->sk_prot->release_cb(sk);
2549 sock_release_ownership(sk);
2550 if (waitqueue_active(&sk->sk_lock.wq))
2551 wake_up(&sk->sk_lock.wq);
2552 spin_unlock_bh(&sk->sk_lock.slock);
2554 EXPORT_SYMBOL(release_sock);
2557 * lock_sock_fast - fast version of lock_sock
2558 * @sk: socket
2560 * This version should be used for very small section, where process wont block
2561 * return false if fast path is taken
2562 * sk_lock.slock locked, owned = 0, BH disabled
2563 * return true if slow path is taken
2564 * sk_lock.slock unlocked, owned = 1, BH enabled
2566 bool lock_sock_fast(struct sock *sk)
2568 might_sleep();
2569 spin_lock_bh(&sk->sk_lock.slock);
2571 if (!sk->sk_lock.owned)
2573 * Note : We must disable BH
2575 return false;
2577 __lock_sock(sk);
2578 sk->sk_lock.owned = 1;
2579 spin_unlock(&sk->sk_lock.slock);
2581 * The sk_lock has mutex_lock() semantics here:
2583 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2584 local_bh_enable();
2585 return true;
2587 EXPORT_SYMBOL(lock_sock_fast);
2589 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2591 struct timeval tv;
2592 if (!sock_flag(sk, SOCK_TIMESTAMP))
2593 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2594 tv = ktime_to_timeval(sk->sk_stamp);
2595 if (tv.tv_sec == -1)
2596 return -ENOENT;
2597 if (tv.tv_sec == 0) {
2598 sk->sk_stamp = ktime_get_real();
2599 tv = ktime_to_timeval(sk->sk_stamp);
2601 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2603 EXPORT_SYMBOL(sock_get_timestamp);
2605 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2607 struct timespec ts;
2608 if (!sock_flag(sk, SOCK_TIMESTAMP))
2609 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2610 ts = ktime_to_timespec(sk->sk_stamp);
2611 if (ts.tv_sec == -1)
2612 return -ENOENT;
2613 if (ts.tv_sec == 0) {
2614 sk->sk_stamp = ktime_get_real();
2615 ts = ktime_to_timespec(sk->sk_stamp);
2617 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2619 EXPORT_SYMBOL(sock_get_timestampns);
2621 void sock_enable_timestamp(struct sock *sk, int flag)
2623 if (!sock_flag(sk, flag)) {
2624 unsigned long previous_flags = sk->sk_flags;
2626 sock_set_flag(sk, flag);
2628 * we just set one of the two flags which require net
2629 * time stamping, but time stamping might have been on
2630 * already because of the other one
2632 if (sock_needs_netstamp(sk) &&
2633 !(previous_flags & SK_FLAGS_TIMESTAMP))
2634 net_enable_timestamp();
2638 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2639 int level, int type)
2641 struct sock_exterr_skb *serr;
2642 struct sk_buff *skb;
2643 int copied, err;
2645 err = -EAGAIN;
2646 skb = sock_dequeue_err_skb(sk);
2647 if (skb == NULL)
2648 goto out;
2650 copied = skb->len;
2651 if (copied > len) {
2652 msg->msg_flags |= MSG_TRUNC;
2653 copied = len;
2655 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2656 if (err)
2657 goto out_free_skb;
2659 sock_recv_timestamp(msg, sk, skb);
2661 serr = SKB_EXT_ERR(skb);
2662 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2664 msg->msg_flags |= MSG_ERRQUEUE;
2665 err = copied;
2667 out_free_skb:
2668 kfree_skb(skb);
2669 out:
2670 return err;
2672 EXPORT_SYMBOL(sock_recv_errqueue);
2675 * Get a socket option on an socket.
2677 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2678 * asynchronous errors should be reported by getsockopt. We assume
2679 * this means if you specify SO_ERROR (otherwise whats the point of it).
2681 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2682 char __user *optval, int __user *optlen)
2684 struct sock *sk = sock->sk;
2686 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2688 EXPORT_SYMBOL(sock_common_getsockopt);
2690 #ifdef CONFIG_COMPAT
2691 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2692 char __user *optval, int __user *optlen)
2694 struct sock *sk = sock->sk;
2696 if (sk->sk_prot->compat_getsockopt != NULL)
2697 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2698 optval, optlen);
2699 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2701 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2702 #endif
2704 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2705 int flags)
2707 struct sock *sk = sock->sk;
2708 int addr_len = 0;
2709 int err;
2711 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2712 flags & ~MSG_DONTWAIT, &addr_len);
2713 if (err >= 0)
2714 msg->msg_namelen = addr_len;
2715 return err;
2717 EXPORT_SYMBOL(sock_common_recvmsg);
2720 * Set socket options on an inet socket.
2722 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2723 char __user *optval, unsigned int optlen)
2725 struct sock *sk = sock->sk;
2727 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2729 EXPORT_SYMBOL(sock_common_setsockopt);
2731 #ifdef CONFIG_COMPAT
2732 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2733 char __user *optval, unsigned int optlen)
2735 struct sock *sk = sock->sk;
2737 if (sk->sk_prot->compat_setsockopt != NULL)
2738 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2739 optval, optlen);
2740 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2742 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2743 #endif
2745 void sk_common_release(struct sock *sk)
2747 if (sk->sk_prot->destroy)
2748 sk->sk_prot->destroy(sk);
2751 * Observation: when sock_common_release is called, processes have
2752 * no access to socket. But net still has.
2753 * Step one, detach it from networking:
2755 * A. Remove from hash tables.
2758 sk->sk_prot->unhash(sk);
2761 * In this point socket cannot receive new packets, but it is possible
2762 * that some packets are in flight because some CPU runs receiver and
2763 * did hash table lookup before we unhashed socket. They will achieve
2764 * receive queue and will be purged by socket destructor.
2766 * Also we still have packets pending on receive queue and probably,
2767 * our own packets waiting in device queues. sock_destroy will drain
2768 * receive queue, but transmitted packets will delay socket destruction
2769 * until the last reference will be released.
2772 sock_orphan(sk);
2774 xfrm_sk_free_policy(sk);
2776 sk_refcnt_debug_release(sk);
2778 sock_put(sk);
2780 EXPORT_SYMBOL(sk_common_release);
2782 #ifdef CONFIG_PROC_FS
2783 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2784 struct prot_inuse {
2785 int val[PROTO_INUSE_NR];
2788 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2790 #ifdef CONFIG_NET_NS
2791 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2793 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2795 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2797 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2799 int cpu, idx = prot->inuse_idx;
2800 int res = 0;
2802 for_each_possible_cpu(cpu)
2803 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2805 return res >= 0 ? res : 0;
2807 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2809 static int __net_init sock_inuse_init_net(struct net *net)
2811 net->core.inuse = alloc_percpu(struct prot_inuse);
2812 return net->core.inuse ? 0 : -ENOMEM;
2815 static void __net_exit sock_inuse_exit_net(struct net *net)
2817 free_percpu(net->core.inuse);
2820 static struct pernet_operations net_inuse_ops = {
2821 .init = sock_inuse_init_net,
2822 .exit = sock_inuse_exit_net,
2825 static __init int net_inuse_init(void)
2827 if (register_pernet_subsys(&net_inuse_ops))
2828 panic("Cannot initialize net inuse counters");
2830 return 0;
2833 core_initcall(net_inuse_init);
2834 #else
2835 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2837 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2839 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2841 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2843 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2845 int cpu, idx = prot->inuse_idx;
2846 int res = 0;
2848 for_each_possible_cpu(cpu)
2849 res += per_cpu(prot_inuse, cpu).val[idx];
2851 return res >= 0 ? res : 0;
2853 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2854 #endif
2856 static void assign_proto_idx(struct proto *prot)
2858 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2860 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2861 pr_err("PROTO_INUSE_NR exhausted\n");
2862 return;
2865 set_bit(prot->inuse_idx, proto_inuse_idx);
2868 static void release_proto_idx(struct proto *prot)
2870 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2871 clear_bit(prot->inuse_idx, proto_inuse_idx);
2873 #else
2874 static inline void assign_proto_idx(struct proto *prot)
2878 static inline void release_proto_idx(struct proto *prot)
2881 #endif
2883 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2885 if (!rsk_prot)
2886 return;
2887 kfree(rsk_prot->slab_name);
2888 rsk_prot->slab_name = NULL;
2889 kmem_cache_destroy(rsk_prot->slab);
2890 rsk_prot->slab = NULL;
2893 static int req_prot_init(const struct proto *prot)
2895 struct request_sock_ops *rsk_prot = prot->rsk_prot;
2897 if (!rsk_prot)
2898 return 0;
2900 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2901 prot->name);
2902 if (!rsk_prot->slab_name)
2903 return -ENOMEM;
2905 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2906 rsk_prot->obj_size, 0,
2907 prot->slab_flags, NULL);
2909 if (!rsk_prot->slab) {
2910 pr_crit("%s: Can't create request sock SLAB cache!\n",
2911 prot->name);
2912 return -ENOMEM;
2914 return 0;
2917 int proto_register(struct proto *prot, int alloc_slab)
2919 if (alloc_slab) {
2920 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2921 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2922 NULL);
2924 if (prot->slab == NULL) {
2925 pr_crit("%s: Can't create sock SLAB cache!\n",
2926 prot->name);
2927 goto out;
2930 if (req_prot_init(prot))
2931 goto out_free_request_sock_slab;
2933 if (prot->twsk_prot != NULL) {
2934 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2936 if (prot->twsk_prot->twsk_slab_name == NULL)
2937 goto out_free_request_sock_slab;
2939 prot->twsk_prot->twsk_slab =
2940 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2941 prot->twsk_prot->twsk_obj_size,
2943 prot->slab_flags,
2944 NULL);
2945 if (prot->twsk_prot->twsk_slab == NULL)
2946 goto out_free_timewait_sock_slab_name;
2950 mutex_lock(&proto_list_mutex);
2951 list_add(&prot->node, &proto_list);
2952 assign_proto_idx(prot);
2953 mutex_unlock(&proto_list_mutex);
2954 return 0;
2956 out_free_timewait_sock_slab_name:
2957 kfree(prot->twsk_prot->twsk_slab_name);
2958 out_free_request_sock_slab:
2959 req_prot_cleanup(prot->rsk_prot);
2961 kmem_cache_destroy(prot->slab);
2962 prot->slab = NULL;
2963 out:
2964 return -ENOBUFS;
2966 EXPORT_SYMBOL(proto_register);
2968 void proto_unregister(struct proto *prot)
2970 mutex_lock(&proto_list_mutex);
2971 release_proto_idx(prot);
2972 list_del(&prot->node);
2973 mutex_unlock(&proto_list_mutex);
2975 kmem_cache_destroy(prot->slab);
2976 prot->slab = NULL;
2978 req_prot_cleanup(prot->rsk_prot);
2980 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2981 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2982 kfree(prot->twsk_prot->twsk_slab_name);
2983 prot->twsk_prot->twsk_slab = NULL;
2986 EXPORT_SYMBOL(proto_unregister);
2988 #ifdef CONFIG_PROC_FS
2989 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2990 __acquires(proto_list_mutex)
2992 mutex_lock(&proto_list_mutex);
2993 return seq_list_start_head(&proto_list, *pos);
2996 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2998 return seq_list_next(v, &proto_list, pos);
3001 static void proto_seq_stop(struct seq_file *seq, void *v)
3002 __releases(proto_list_mutex)
3004 mutex_unlock(&proto_list_mutex);
3007 static char proto_method_implemented(const void *method)
3009 return method == NULL ? 'n' : 'y';
3011 static long sock_prot_memory_allocated(struct proto *proto)
3013 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3016 static char *sock_prot_memory_pressure(struct proto *proto)
3018 return proto->memory_pressure != NULL ?
3019 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3022 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3025 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3026 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3027 proto->name,
3028 proto->obj_size,
3029 sock_prot_inuse_get(seq_file_net(seq), proto),
3030 sock_prot_memory_allocated(proto),
3031 sock_prot_memory_pressure(proto),
3032 proto->max_header,
3033 proto->slab == NULL ? "no" : "yes",
3034 module_name(proto->owner),
3035 proto_method_implemented(proto->close),
3036 proto_method_implemented(proto->connect),
3037 proto_method_implemented(proto->disconnect),
3038 proto_method_implemented(proto->accept),
3039 proto_method_implemented(proto->ioctl),
3040 proto_method_implemented(proto->init),
3041 proto_method_implemented(proto->destroy),
3042 proto_method_implemented(proto->shutdown),
3043 proto_method_implemented(proto->setsockopt),
3044 proto_method_implemented(proto->getsockopt),
3045 proto_method_implemented(proto->sendmsg),
3046 proto_method_implemented(proto->recvmsg),
3047 proto_method_implemented(proto->sendpage),
3048 proto_method_implemented(proto->bind),
3049 proto_method_implemented(proto->backlog_rcv),
3050 proto_method_implemented(proto->hash),
3051 proto_method_implemented(proto->unhash),
3052 proto_method_implemented(proto->get_port),
3053 proto_method_implemented(proto->enter_memory_pressure));
3056 static int proto_seq_show(struct seq_file *seq, void *v)
3058 if (v == &proto_list)
3059 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3060 "protocol",
3061 "size",
3062 "sockets",
3063 "memory",
3064 "press",
3065 "maxhdr",
3066 "slab",
3067 "module",
3068 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3069 else
3070 proto_seq_printf(seq, list_entry(v, struct proto, node));
3071 return 0;
3074 static const struct seq_operations proto_seq_ops = {
3075 .start = proto_seq_start,
3076 .next = proto_seq_next,
3077 .stop = proto_seq_stop,
3078 .show = proto_seq_show,
3081 static int proto_seq_open(struct inode *inode, struct file *file)
3083 return seq_open_net(inode, file, &proto_seq_ops,
3084 sizeof(struct seq_net_private));
3087 static const struct file_operations proto_seq_fops = {
3088 .owner = THIS_MODULE,
3089 .open = proto_seq_open,
3090 .read = seq_read,
3091 .llseek = seq_lseek,
3092 .release = seq_release_net,
3095 static __net_init int proto_init_net(struct net *net)
3097 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3098 return -ENOMEM;
3100 return 0;
3103 static __net_exit void proto_exit_net(struct net *net)
3105 remove_proc_entry("protocols", net->proc_net);
3109 static __net_initdata struct pernet_operations proto_net_ops = {
3110 .init = proto_init_net,
3111 .exit = proto_exit_net,
3114 static int __init proto_init(void)
3116 return register_pernet_subsys(&proto_net_ops);
3119 subsys_initcall(proto_init);
3121 #endif /* PROC_FS */