Linux 3.12.28
[linux/fpc-iii.git] / net / core / sock.c
blob f9ec2f5be1c09bbe85d0e56fd95f66b915ec3745
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
121 #include <asm/uaccess.h>
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
135 #include <linux/filter.h>
137 #include <trace/events/sock.h>
139 #ifdef CONFIG_INET
140 #include <net/tcp.h>
141 #endif
143 #include <net/busy_poll.h>
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
154 * Test to see if the opener of the socket had the capability @cap when
155 * the socket was created and the current process has the capability @cap
156 * in the user namespace @user_ns.
158 bool sk_ns_capable(const struct sock *sk,
159 struct user_namespace *user_ns, int cap)
161 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 ns_capable(user_ns, cap);
164 EXPORT_SYMBOL(sk_ns_capable);
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
171 * Test to see if the opener of the socket had the capability @cap when
172 * the socket was created and the current process has the capability @cap
173 * in all user namespaces.
175 bool sk_capable(const struct sock *sk, int cap)
177 return sk_ns_capable(sk, &init_user_ns, cap);
179 EXPORT_SYMBOL(sk_capable);
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
186 * Test to see if the opener of the socket had the capability @cap when the
187 * socket was created and the current process has the capability @cap over
188 * the network namespace the socket is a member of.
190 bool sk_net_capable(const struct sock *sk, int cap)
192 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 EXPORT_SYMBOL(sk_net_capable);
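
/*
 * Illustrative sketch (editor-added, not part of this file): a protocol's
 * setsockopt handler might gate a privileged option on sk_net_capable()
 * roughly as below. The example_ helper name and the use of sk_priority
 * as the guarded field are assumptions for illustration only.
 */
static int example_set_privileged_opt(struct sock *sk, int val)
{
	/* Require CAP_NET_ADMIN in the socket's network namespace. */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;	/* stand-in for a privileged per-socket field */
	return 0;
}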
197 #ifdef CONFIG_MEMCG_KMEM
198 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
200 struct proto *proto;
201 int ret = 0;
203 mutex_lock(&proto_list_mutex);
204 list_for_each_entry(proto, &proto_list, node) {
205 if (proto->init_cgroup) {
206 ret = proto->init_cgroup(memcg, ss);
207 if (ret)
208 goto out;
212 mutex_unlock(&proto_list_mutex);
213 return ret;
214 out:
215 list_for_each_entry_continue_reverse(proto, &proto_list, node)
216 if (proto->destroy_cgroup)
217 proto->destroy_cgroup(memcg);
218 mutex_unlock(&proto_list_mutex);
219 return ret;
222 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
224 struct proto *proto;
226 mutex_lock(&proto_list_mutex);
227 list_for_each_entry_reverse(proto, &proto_list, node)
228 if (proto->destroy_cgroup)
229 proto->destroy_cgroup(memcg);
230 mutex_unlock(&proto_list_mutex);
232 #endif
235 * Each address family might have different locking rules, so we have
236 * one slock key per address family:
238 static struct lock_class_key af_family_keys[AF_MAX];
239 static struct lock_class_key af_family_slock_keys[AF_MAX];
241 #if defined(CONFIG_MEMCG_KMEM)
242 struct static_key memcg_socket_limit_enabled;
243 EXPORT_SYMBOL(memcg_socket_limit_enabled);
244 #endif
247 * Make lock validator output more readable. (we pre-construct these
248 * strings build-time, so that runtime initialization of socket
249 * locks is fast):
251 static const char *const af_family_key_strings[AF_MAX+1] = {
252 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
253 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
254 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
255 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
256 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
257 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
258 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
259 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
260 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
261 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
262 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
263 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
264 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
265 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
267 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
269 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
270 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
271 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
272 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
273 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
274 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
275 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
276 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
277 "slock-27" , "slock-28" , "slock-AF_CAN" ,
278 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
279 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
280 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
281 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
283 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
285 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
286 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
287 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
288 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
289 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
290 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
291 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
292 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
293 "clock-27" , "clock-28" , "clock-AF_CAN" ,
294 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
295 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
296 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
297 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
301 * sk_callback_lock locking rules are per-address-family,
302 * so split the lock classes by using a per-AF key:
304 static struct lock_class_key af_callback_keys[AF_MAX];
306 /* Take into consideration the size of the struct sk_buff overhead in the
307 * determination of these values, since that is non-constant across
308 * platforms. This makes socket queueing behavior and performance
309 * not depend upon such differences.
311 #define _SK_MEM_PACKETS 256
312 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
313 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
316 /* Run time adjustable parameters. */
317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318 EXPORT_SYMBOL(sysctl_wmem_max);
319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320 EXPORT_SYMBOL(sysctl_rmem_max);
321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
324 /* Maximal space eaten by iovec or ancillary data plus some space */
325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326 EXPORT_SYMBOL(sysctl_optmem_max);
328 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
329 EXPORT_SYMBOL_GPL(memalloc_socks);
332 * sk_set_memalloc - sets %SOCK_MEMALLOC
333 * @sk: socket to set it on
335 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
336 * It's the responsibility of the admin to adjust min_free_kbytes
337 * to meet the requirements.
339 void sk_set_memalloc(struct sock *sk)
341 sock_set_flag(sk, SOCK_MEMALLOC);
342 sk->sk_allocation |= __GFP_MEMALLOC;
343 static_key_slow_inc(&memalloc_socks);
345 EXPORT_SYMBOL_GPL(sk_set_memalloc);
347 void sk_clear_memalloc(struct sock *sk)
349 sock_reset_flag(sk, SOCK_MEMALLOC);
350 sk->sk_allocation &= ~__GFP_MEMALLOC;
351 static_key_slow_dec(&memalloc_socks);
354 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
355 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
356 * it has rmem allocations there is a risk that the user of the
357 * socket cannot make forward progress due to exceeding the rmem
358 * limits. By rights, sk_clear_memalloc() should only be called
359 * on sockets being torn down but warn and reset the accounting if
360 * that assumption breaks.
362 if (WARN_ON(sk->sk_forward_alloc))
363 sk_mem_reclaim(sk);
365 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
369 int ret;
370 unsigned long pflags = current->flags;
372 /* these should have been dropped before queueing */
373 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
375 current->flags |= PF_MEMALLOC;
376 ret = sk->sk_backlog_rcv(sk, skb);
377 tsk_restore_flags(current, pflags, PF_MEMALLOC);
379 return ret;
381 EXPORT_SYMBOL(__sk_backlog_rcv);
383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
385 struct timeval tv;
387 if (optlen < sizeof(tv))
388 return -EINVAL;
389 if (copy_from_user(&tv, optval, sizeof(tv)))
390 return -EFAULT;
391 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392 return -EDOM;
394 if (tv.tv_sec < 0) {
395 static int warned __read_mostly;
397 *timeo_p = 0;
398 if (warned < 10 && net_ratelimit()) {
399 warned++;
400 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401 __func__, current->comm, task_pid_nr(current));
403 return 0;
405 *timeo_p = MAX_SCHEDULE_TIMEOUT;
406 if (tv.tv_sec == 0 && tv.tv_usec == 0)
407 return 0;
408 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410 return 0;
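
/*
 * Userspace view of the timeout format parsed above (editor-added sketch):
 * SO_RCVTIMEO and SO_SNDTIMEO take a struct timeval, and an all-zero
 * timeval means "block forever" (the timeout becomes MAX_SCHEDULE_TIMEOUT).
 * Error handling trimmed; the helper name is hypothetical.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };	/* 5 second timeout */

	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}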
413 static void sock_warn_obsolete_bsdism(const char *name)
415 static int warned;
416 static char warncomm[TASK_COMM_LEN];
417 if (strcmp(warncomm, current->comm) && warned < 5) {
418 strcpy(warncomm, current->comm);
419 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420 warncomm, name);
421 warned++;
425 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
429 if (sk->sk_flags & flags) {
430 sk->sk_flags &= ~flags;
431 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
432 net_disable_timestamp();
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 int err;
440 int skb_len;
441 unsigned long flags;
442 struct sk_buff_head *list = &sk->sk_receive_queue;
444 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
445 atomic_inc(&sk->sk_drops);
446 trace_sock_rcvqueue_full(sk, skb);
447 return -ENOMEM;
450 err = sk_filter(sk, skb);
451 if (err)
452 return err;
454 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
455 atomic_inc(&sk->sk_drops);
456 return -ENOBUFS;
459 skb->dev = NULL;
460 skb_set_owner_r(skb, sk);
462 /* Cache the SKB length before we tack it onto the receive
463 * queue. Once it is added it no longer belongs to us and
464 * may be freed by other threads of control pulling packets
465 * from the queue.
467 skb_len = skb->len;
469 /* We escape from the RCU-protected region here; make sure we don't leak
470 * a non-refcounted dst
472 skb_dst_force(skb);
474 spin_lock_irqsave(&list->lock, flags);
475 skb->dropcount = atomic_read(&sk->sk_drops);
476 __skb_queue_tail(list, skb);
477 spin_unlock_irqrestore(&list->lock, flags);
479 if (!sock_flag(sk, SOCK_DEAD))
480 sk->sk_data_ready(sk, skb_len);
481 return 0;
483 EXPORT_SYMBOL(sock_queue_rcv_skb);
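
/*
 * Illustrative caller (editor-added sketch, not from this file): a protocol
 * receive path hands a fully built skb to sock_queue_rcv_skb() and must free
 * the skb itself when queueing fails, since ownership is only transferred on
 * success. The example_ name is hypothetical.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0) {
		kfree_skb(skb);		/* queueing failed; the skb is still ours */
		return err;
	}
	return 0;
}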
485 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
487 int rc = NET_RX_SUCCESS;
489 if (sk_filter(sk, skb))
490 goto discard_and_relse;
492 skb->dev = NULL;
494 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
495 atomic_inc(&sk->sk_drops);
496 goto discard_and_relse;
498 if (nested)
499 bh_lock_sock_nested(sk);
500 else
501 bh_lock_sock(sk);
502 if (!sock_owned_by_user(sk)) {
504 * trylock + unlock semantics:
506 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508 rc = sk_backlog_rcv(sk, skb);
510 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
511 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
512 bh_unlock_sock(sk);
513 atomic_inc(&sk->sk_drops);
514 goto discard_and_relse;
517 bh_unlock_sock(sk);
518 out:
519 sock_put(sk);
520 return rc;
521 discard_and_relse:
522 kfree_skb(skb);
523 goto out;
525 EXPORT_SYMBOL(sk_receive_skb);
527 void sk_reset_txq(struct sock *sk)
529 sk_tx_queue_clear(sk);
531 EXPORT_SYMBOL(sk_reset_txq);
533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
535 struct dst_entry *dst = __sk_dst_get(sk);
537 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 sk_tx_queue_clear(sk);
539 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
540 dst_release(dst);
541 return NULL;
544 return dst;
546 EXPORT_SYMBOL(__sk_dst_check);
548 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
550 struct dst_entry *dst = sk_dst_get(sk);
552 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
553 sk_dst_reset(sk);
554 dst_release(dst);
555 return NULL;
558 return dst;
560 EXPORT_SYMBOL(sk_dst_check);
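
/*
 * Typical use of the dst-cache helpers above (editor-added sketch): a
 * transmit path, called with the socket locked, revalidates the cached
 * route and falls back to a fresh lookup when the cached entry has become
 * obsolete. The example_ helper is hypothetical and the lookup itself is
 * left as a comment.
 */
static struct dst_entry *example_get_route(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (!dst) {
		/* No usable cached entry: the protocol would perform a
		 * route lookup here and install the result with
		 * __sk_dst_set().
		 */
	}
	return dst;
}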
562 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
563 int optlen)
565 int ret = -ENOPROTOOPT;
566 #ifdef CONFIG_NETDEVICES
567 struct net *net = sock_net(sk);
568 char devname[IFNAMSIZ];
569 int index;
571 /* Sorry... */
572 ret = -EPERM;
573 if (!ns_capable(net->user_ns, CAP_NET_RAW))
574 goto out;
576 ret = -EINVAL;
577 if (optlen < 0)
578 goto out;
580 /* Bind this socket to a particular device like "eth0",
581 * as specified in the passed interface name. If the
582 * name is "" or the option length is zero the socket
583 * is not bound.
585 if (optlen > IFNAMSIZ - 1)
586 optlen = IFNAMSIZ - 1;
587 memset(devname, 0, sizeof(devname));
589 ret = -EFAULT;
590 if (copy_from_user(devname, optval, optlen))
591 goto out;
593 index = 0;
594 if (devname[0] != '\0') {
595 struct net_device *dev;
597 rcu_read_lock();
598 dev = dev_get_by_name_rcu(net, devname);
599 if (dev)
600 index = dev->ifindex;
601 rcu_read_unlock();
602 ret = -ENODEV;
603 if (!dev)
604 goto out;
607 lock_sock(sk);
608 sk->sk_bound_dev_if = index;
609 sk_dst_reset(sk);
610 release_sock(sk);
612 ret = 0;
614 out:
615 #endif
617 return ret;
620 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
621 int __user *optlen, int len)
623 int ret = -ENOPROTOOPT;
624 #ifdef CONFIG_NETDEVICES
625 struct net *net = sock_net(sk);
626 char devname[IFNAMSIZ];
628 if (sk->sk_bound_dev_if == 0) {
629 len = 0;
630 goto zero;
633 ret = -EINVAL;
634 if (len < IFNAMSIZ)
635 goto out;
637 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
638 if (ret)
639 goto out;
641 len = strlen(devname) + 1;
643 ret = -EFAULT;
644 if (copy_to_user(optval, devname, len))
645 goto out;
647 zero:
648 ret = -EFAULT;
649 if (put_user(len, optlen))
650 goto out;
652 ret = 0;
654 out:
655 #endif
657 return ret;
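
/*
 * Userspace counterpart of the two helpers above (editor-added sketch):
 * binding a socket to an interface with SO_BINDTODEVICE requires
 * CAP_NET_RAW in the socket's network namespace, and an empty name with
 * zero length removes the binding. The helper name is hypothetical.
 */
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_device(int fd, const char *ifname)
{
	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname) + 1);
}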
660 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
662 if (valbool)
663 sock_set_flag(sk, bit);
664 else
665 sock_reset_flag(sk, bit);
669 * This is meant for all protocols to use and covers goings on
670 * at the socket level. Everything here is generic.
673 int sock_setsockopt(struct socket *sock, int level, int optname,
674 char __user *optval, unsigned int optlen)
676 struct sock *sk = sock->sk;
677 int val;
678 int valbool;
679 struct linger ling;
680 int ret = 0;
683 * Options without arguments
686 if (optname == SO_BINDTODEVICE)
687 return sock_setbindtodevice(sk, optval, optlen);
689 if (optlen < sizeof(int))
690 return -EINVAL;
692 if (get_user(val, (int __user *)optval))
693 return -EFAULT;
695 valbool = val ? 1 : 0;
697 lock_sock(sk);
699 switch (optname) {
700 case SO_DEBUG:
701 if (val && !capable(CAP_NET_ADMIN))
702 ret = -EACCES;
703 else
704 sock_valbool_flag(sk, SOCK_DBG, valbool);
705 break;
706 case SO_REUSEADDR:
707 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
708 break;
709 case SO_REUSEPORT:
710 sk->sk_reuseport = valbool;
711 break;
712 case SO_TYPE:
713 case SO_PROTOCOL:
714 case SO_DOMAIN:
715 case SO_ERROR:
716 ret = -ENOPROTOOPT;
717 break;
718 case SO_DONTROUTE:
719 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
720 break;
721 case SO_BROADCAST:
722 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
723 break;
724 case SO_SNDBUF:
725 /* Don't error on this; BSD doesn't, and if you think
726 * about it, this is right. Otherwise apps have to
727 * play 'guess the biggest size' games. RCVBUF/SNDBUF
728 * are treated in BSD as hints
730 val = min_t(u32, val, sysctl_wmem_max);
731 set_sndbuf:
732 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
733 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
734 /* Wake up sending tasks if we upped the value. */
735 sk->sk_write_space(sk);
736 break;
738 case SO_SNDBUFFORCE:
739 if (!capable(CAP_NET_ADMIN)) {
740 ret = -EPERM;
741 break;
743 goto set_sndbuf;
745 case SO_RCVBUF:
746 /* Don't error on this; BSD doesn't, and if you think
747 * about it, this is right. Otherwise apps have to
748 * play 'guess the biggest size' games. RCVBUF/SNDBUF
749 * are treated in BSD as hints
751 val = min_t(u32, val, sysctl_rmem_max);
752 set_rcvbuf:
753 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
755 * We double it on the way in to account for
756 * "struct sk_buff" etc. overhead. Applications
757 * assume that the SO_RCVBUF setting they make will
758 * allow that much actual data to be received on that
759 * socket.
761 * Applications are unaware that "struct sk_buff" and
762 * other overheads allocate from the receive buffer
763 * during socket buffer allocation.
765 * And after considering the possible alternatives,
766 * returning the value we actually used in getsockopt
767 * is the most desirable behavior.
769 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
770 break;
772 case SO_RCVBUFFORCE:
773 if (!capable(CAP_NET_ADMIN)) {
774 ret = -EPERM;
775 break;
777 goto set_rcvbuf;
779 case SO_KEEPALIVE:
780 #ifdef CONFIG_INET
781 if (sk->sk_protocol == IPPROTO_TCP &&
782 sk->sk_type == SOCK_STREAM)
783 tcp_set_keepalive(sk, valbool);
784 #endif
785 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
786 break;
788 case SO_OOBINLINE:
789 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
790 break;
792 case SO_NO_CHECK:
793 sk->sk_no_check = valbool;
794 break;
796 case SO_PRIORITY:
797 if ((val >= 0 && val <= 6) ||
798 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
799 sk->sk_priority = val;
800 else
801 ret = -EPERM;
802 break;
804 case SO_LINGER:
805 if (optlen < sizeof(ling)) {
806 ret = -EINVAL; /* 1003.1g */
807 break;
809 if (copy_from_user(&ling, optval, sizeof(ling))) {
810 ret = -EFAULT;
811 break;
813 if (!ling.l_onoff)
814 sock_reset_flag(sk, SOCK_LINGER);
815 else {
816 #if (BITS_PER_LONG == 32)
817 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
818 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
819 else
820 #endif
821 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
822 sock_set_flag(sk, SOCK_LINGER);
824 break;
826 case SO_BSDCOMPAT:
827 sock_warn_obsolete_bsdism("setsockopt");
828 break;
830 case SO_PASSCRED:
831 if (valbool)
832 set_bit(SOCK_PASSCRED, &sock->flags);
833 else
834 clear_bit(SOCK_PASSCRED, &sock->flags);
835 break;
837 case SO_TIMESTAMP:
838 case SO_TIMESTAMPNS:
839 if (valbool) {
840 if (optname == SO_TIMESTAMP)
841 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
842 else
843 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
844 sock_set_flag(sk, SOCK_RCVTSTAMP);
845 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
846 } else {
847 sock_reset_flag(sk, SOCK_RCVTSTAMP);
848 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
850 break;
852 case SO_TIMESTAMPING:
853 if (val & ~SOF_TIMESTAMPING_MASK) {
854 ret = -EINVAL;
855 break;
857 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
858 val & SOF_TIMESTAMPING_TX_HARDWARE);
859 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
860 val & SOF_TIMESTAMPING_TX_SOFTWARE);
861 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
862 val & SOF_TIMESTAMPING_RX_HARDWARE);
863 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
864 sock_enable_timestamp(sk,
865 SOCK_TIMESTAMPING_RX_SOFTWARE);
866 else
867 sock_disable_timestamp(sk,
868 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
869 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
870 val & SOF_TIMESTAMPING_SOFTWARE);
871 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
872 val & SOF_TIMESTAMPING_SYS_HARDWARE);
873 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
874 val & SOF_TIMESTAMPING_RAW_HARDWARE);
875 break;
877 case SO_RCVLOWAT:
878 if (val < 0)
879 val = INT_MAX;
880 sk->sk_rcvlowat = val ? : 1;
881 break;
883 case SO_RCVTIMEO:
884 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
885 break;
887 case SO_SNDTIMEO:
888 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
889 break;
891 case SO_ATTACH_FILTER:
892 ret = -EINVAL;
893 if (optlen == sizeof(struct sock_fprog)) {
894 struct sock_fprog fprog;
896 ret = -EFAULT;
897 if (copy_from_user(&fprog, optval, sizeof(fprog)))
898 break;
900 ret = sk_attach_filter(&fprog, sk);
902 break;
904 case SO_DETACH_FILTER:
905 ret = sk_detach_filter(sk);
906 break;
908 case SO_LOCK_FILTER:
909 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
910 ret = -EPERM;
911 else
912 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
913 break;
915 case SO_PASSSEC:
916 if (valbool)
917 set_bit(SOCK_PASSSEC, &sock->flags);
918 else
919 clear_bit(SOCK_PASSSEC, &sock->flags);
920 break;
921 case SO_MARK:
922 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
923 ret = -EPERM;
924 else
925 sk->sk_mark = val;
926 break;
928 /* We implement the SO_SNDLOWAT etc to
929 not be settable (1003.1g 5.3) */
930 case SO_RXQ_OVFL:
931 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
932 break;
934 case SO_WIFI_STATUS:
935 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
936 break;
938 case SO_PEEK_OFF:
939 if (sock->ops->set_peek_off)
940 ret = sock->ops->set_peek_off(sk, val);
941 else
942 ret = -EOPNOTSUPP;
943 break;
945 case SO_NOFCS:
946 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
947 break;
949 case SO_SELECT_ERR_QUEUE:
950 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
951 break;
953 #ifdef CONFIG_NET_RX_BUSY_POLL
954 case SO_BUSY_POLL:
955 /* allow unprivileged users to decrease the value */
956 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
957 ret = -EPERM;
958 else {
959 if (val < 0)
960 ret = -EINVAL;
961 else
962 sk->sk_ll_usec = val;
964 break;
965 #endif
966 default:
967 ret = -ENOPROTOOPT;
968 break;
970 release_sock(sk);
971 return ret;
973 EXPORT_SYMBOL(sock_setsockopt);
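
/*
 * Userspace sketch (editor-added) of the SO_RCVBUF behaviour documented in
 * the code above: the kernel doubles the requested value to cover struct
 * sk_buff overhead, so reading the option back returns roughly twice what
 * was set, clamped by sysctl_rmem_max unless SO_RCVBUFFORCE is used. The
 * helper name is hypothetical and error handling is trimmed.
 */
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf_doubling(int fd)
{
	int req = 65536, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	printf("requested %d, effective %d\n", req, got);	/* got is about 2 * req */
}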
976 void cred_to_ucred(struct pid *pid, const struct cred *cred,
977 struct ucred *ucred)
979 ucred->pid = pid_vnr(pid);
980 ucred->uid = ucred->gid = -1;
981 if (cred) {
982 struct user_namespace *current_ns = current_user_ns();
984 ucred->uid = from_kuid_munged(current_ns, cred->euid);
985 ucred->gid = from_kgid_munged(current_ns, cred->egid);
988 EXPORT_SYMBOL_GPL(cred_to_ucred);
990 int sock_getsockopt(struct socket *sock, int level, int optname,
991 char __user *optval, int __user *optlen)
993 struct sock *sk = sock->sk;
995 union {
996 int val;
997 struct linger ling;
998 struct timeval tm;
999 } v;
1001 int lv = sizeof(int);
1002 int len;
1004 if (get_user(len, optlen))
1005 return -EFAULT;
1006 if (len < 0)
1007 return -EINVAL;
1009 memset(&v, 0, sizeof(v));
1011 switch (optname) {
1012 case SO_DEBUG:
1013 v.val = sock_flag(sk, SOCK_DBG);
1014 break;
1016 case SO_DONTROUTE:
1017 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1018 break;
1020 case SO_BROADCAST:
1021 v.val = sock_flag(sk, SOCK_BROADCAST);
1022 break;
1024 case SO_SNDBUF:
1025 v.val = sk->sk_sndbuf;
1026 break;
1028 case SO_RCVBUF:
1029 v.val = sk->sk_rcvbuf;
1030 break;
1032 case SO_REUSEADDR:
1033 v.val = sk->sk_reuse;
1034 break;
1036 case SO_REUSEPORT:
1037 v.val = sk->sk_reuseport;
1038 break;
1040 case SO_KEEPALIVE:
1041 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1042 break;
1044 case SO_TYPE:
1045 v.val = sk->sk_type;
1046 break;
1048 case SO_PROTOCOL:
1049 v.val = sk->sk_protocol;
1050 break;
1052 case SO_DOMAIN:
1053 v.val = sk->sk_family;
1054 break;
1056 case SO_ERROR:
1057 v.val = -sock_error(sk);
1058 if (v.val == 0)
1059 v.val = xchg(&sk->sk_err_soft, 0);
1060 break;
1062 case SO_OOBINLINE:
1063 v.val = sock_flag(sk, SOCK_URGINLINE);
1064 break;
1066 case SO_NO_CHECK:
1067 v.val = sk->sk_no_check;
1068 break;
1070 case SO_PRIORITY:
1071 v.val = sk->sk_priority;
1072 break;
1074 case SO_LINGER:
1075 lv = sizeof(v.ling);
1076 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1077 v.ling.l_linger = sk->sk_lingertime / HZ;
1078 break;
1080 case SO_BSDCOMPAT:
1081 sock_warn_obsolete_bsdism("getsockopt");
1082 break;
1084 case SO_TIMESTAMP:
1085 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1086 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1087 break;
1089 case SO_TIMESTAMPNS:
1090 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1091 break;
1093 case SO_TIMESTAMPING:
1094 v.val = 0;
1095 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1096 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1097 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1098 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1099 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1100 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1101 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1102 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1103 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1104 v.val |= SOF_TIMESTAMPING_SOFTWARE;
1105 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1106 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1107 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1108 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1109 break;
1111 case SO_RCVTIMEO:
1112 lv = sizeof(struct timeval);
1113 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1114 v.tm.tv_sec = 0;
1115 v.tm.tv_usec = 0;
1116 } else {
1117 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1118 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1120 break;
1122 case SO_SNDTIMEO:
1123 lv = sizeof(struct timeval);
1124 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1125 v.tm.tv_sec = 0;
1126 v.tm.tv_usec = 0;
1127 } else {
1128 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1129 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1131 break;
1133 case SO_RCVLOWAT:
1134 v.val = sk->sk_rcvlowat;
1135 break;
1137 case SO_SNDLOWAT:
1138 v.val = 1;
1139 break;
1141 case SO_PASSCRED:
1142 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1143 break;
1145 case SO_PEERCRED:
1147 struct ucred peercred;
1148 if (len > sizeof(peercred))
1149 len = sizeof(peercred);
1150 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1151 if (copy_to_user(optval, &peercred, len))
1152 return -EFAULT;
1153 goto lenout;
1156 case SO_PEERNAME:
1158 char address[128];
1160 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1161 return -ENOTCONN;
1162 if (lv < len)
1163 return -EINVAL;
1164 if (copy_to_user(optval, address, len))
1165 return -EFAULT;
1166 goto lenout;
1169 /* Dubious BSD thing... Probably nobody even uses it, but
1170 * the UNIX standard wants it for whatever reason... -DaveM
1172 case SO_ACCEPTCONN:
1173 v.val = sk->sk_state == TCP_LISTEN;
1174 break;
1176 case SO_PASSSEC:
1177 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1178 break;
1180 case SO_PEERSEC:
1181 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1183 case SO_MARK:
1184 v.val = sk->sk_mark;
1185 break;
1187 case SO_RXQ_OVFL:
1188 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1189 break;
1191 case SO_WIFI_STATUS:
1192 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1193 break;
1195 case SO_PEEK_OFF:
1196 if (!sock->ops->set_peek_off)
1197 return -EOPNOTSUPP;
1199 v.val = sk->sk_peek_off;
1200 break;
1201 case SO_NOFCS:
1202 v.val = sock_flag(sk, SOCK_NOFCS);
1203 break;
1205 case SO_BINDTODEVICE:
1206 return sock_getbindtodevice(sk, optval, optlen, len);
1208 case SO_GET_FILTER:
1209 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1210 if (len < 0)
1211 return len;
1213 goto lenout;
1215 case SO_LOCK_FILTER:
1216 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1217 break;
1219 case SO_SELECT_ERR_QUEUE:
1220 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1221 break;
1223 #ifdef CONFIG_NET_RX_BUSY_POLL
1224 case SO_BUSY_POLL:
1225 v.val = sk->sk_ll_usec;
1226 break;
1227 #endif
1229 default:
1230 return -ENOPROTOOPT;
1233 if (len > lv)
1234 len = lv;
1235 if (copy_to_user(optval, &v, len))
1236 return -EFAULT;
1237 lenout:
1238 if (put_user(len, optlen))
1239 return -EFAULT;
1240 return 0;
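
/*
 * Userspace sketch (editor-added) for the SO_ERROR case handled above:
 * after a non-blocking connect() completes (the socket reports writable),
 * the pending error is fetched and cleared with getsockopt(SO_ERROR).
 * The helper name is hypothetical.
 */
#include <sys/socket.h>

static int example_pending_error(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
		return -1;
	return err;	/* 0 means no pending error */
}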
1244 * Initialize an sk_lock.
1246 * (We also register the sk_lock with the lock validator.)
1248 static inline void sock_lock_init(struct sock *sk)
1250 sock_lock_init_class_and_name(sk,
1251 af_family_slock_key_strings[sk->sk_family],
1252 af_family_slock_keys + sk->sk_family,
1253 af_family_key_strings[sk->sk_family],
1254 af_family_keys + sk->sk_family);
1258 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1259 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1260 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1262 static void sock_copy(struct sock *nsk, const struct sock *osk)
1264 #ifdef CONFIG_SECURITY_NETWORK
1265 void *sptr = nsk->sk_security;
1266 #endif
1267 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1269 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1270 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1272 #ifdef CONFIG_SECURITY_NETWORK
1273 nsk->sk_security = sptr;
1274 security_sk_clone(osk, nsk);
1275 #endif
1278 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1280 unsigned long nulls1, nulls2;
1282 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1283 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1284 if (nulls1 > nulls2)
1285 swap(nulls1, nulls2);
1287 if (nulls1 != 0)
1288 memset((char *)sk, 0, nulls1);
1289 memset((char *)sk + nulls1 + sizeof(void *), 0,
1290 nulls2 - nulls1 - sizeof(void *));
1291 memset((char *)sk + nulls2 + sizeof(void *), 0,
1292 size - nulls2 - sizeof(void *));
1294 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1296 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1297 int family)
1299 struct sock *sk;
1300 struct kmem_cache *slab;
1302 slab = prot->slab;
1303 if (slab != NULL) {
1304 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1305 if (!sk)
1306 return sk;
1307 if (priority & __GFP_ZERO) {
1308 if (prot->clear_sk)
1309 prot->clear_sk(sk, prot->obj_size);
1310 else
1311 sk_prot_clear_nulls(sk, prot->obj_size);
1313 } else
1314 sk = kmalloc(prot->obj_size, priority);
1316 if (sk != NULL) {
1317 kmemcheck_annotate_bitfield(sk, flags);
1319 if (security_sk_alloc(sk, family, priority))
1320 goto out_free;
1322 if (!try_module_get(prot->owner))
1323 goto out_free_sec;
1324 sk_tx_queue_clear(sk);
1327 return sk;
1329 out_free_sec:
1330 security_sk_free(sk);
1331 out_free:
1332 if (slab != NULL)
1333 kmem_cache_free(slab, sk);
1334 else
1335 kfree(sk);
1336 return NULL;
1339 static void sk_prot_free(struct proto *prot, struct sock *sk)
1341 struct kmem_cache *slab;
1342 struct module *owner;
1344 owner = prot->owner;
1345 slab = prot->slab;
1347 security_sk_free(sk);
1348 if (slab != NULL)
1349 kmem_cache_free(slab, sk);
1350 else
1351 kfree(sk);
1352 module_put(owner);
1355 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1356 void sock_update_classid(struct sock *sk)
1358 u32 classid;
1360 classid = task_cls_classid(current);
1361 if (classid != sk->sk_classid)
1362 sk->sk_classid = classid;
1364 EXPORT_SYMBOL(sock_update_classid);
1365 #endif
1367 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1368 void sock_update_netprioidx(struct sock *sk)
1370 if (in_interrupt())
1371 return;
1373 sk->sk_cgrp_prioidx = task_netprioidx(current);
1375 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1376 #endif
1379 * sk_alloc - All socket objects are allocated here
1380 * @net: the applicable net namespace
1381 * @family: protocol family
1382 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1383 * @prot: struct proto associated with this new sock instance
1385 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1386 struct proto *prot)
1388 struct sock *sk;
1390 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1391 if (sk) {
1392 sk->sk_family = family;
1394 * See comment in struct sock definition to understand
1395 * why we need sk_prot_creator -acme
1397 sk->sk_prot = sk->sk_prot_creator = prot;
1398 sock_lock_init(sk);
1399 sock_net_set(sk, get_net(net));
1400 atomic_set(&sk->sk_wmem_alloc, 1);
1402 sock_update_classid(sk);
1403 sock_update_netprioidx(sk);
1406 return sk;
1408 EXPORT_SYMBOL(sk_alloc);
1410 static void __sk_free(struct sock *sk)
1412 struct sk_filter *filter;
1414 if (sk->sk_destruct)
1415 sk->sk_destruct(sk);
1417 filter = rcu_dereference_check(sk->sk_filter,
1418 atomic_read(&sk->sk_wmem_alloc) == 0);
1419 if (filter) {
1420 sk_filter_uncharge(sk, filter);
1421 RCU_INIT_POINTER(sk->sk_filter, NULL);
1424 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1426 if (atomic_read(&sk->sk_omem_alloc))
1427 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1428 __func__, atomic_read(&sk->sk_omem_alloc));
1430 if (sk->sk_peer_cred)
1431 put_cred(sk->sk_peer_cred);
1432 put_pid(sk->sk_peer_pid);
1433 put_net(sock_net(sk));
1434 sk_prot_free(sk->sk_prot_creator, sk);
1437 void sk_free(struct sock *sk)
1440 * We subtract one from sk_wmem_alloc and can know if
1441 * some packets are still in some tx queue.
1442 * If not null, sock_wfree() will call __sk_free(sk) later
1444 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1445 __sk_free(sk);
1447 EXPORT_SYMBOL(sk_free);
1450 * Last sock_put should drop reference to sk->sk_net. It has already
1451 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1452 * is not an option.
1453 * Take reference to a socket to remove it from hash _alive_ and after that
1454 * destroy it in the context of init_net.
1456 void sk_release_kernel(struct sock *sk)
1458 if (sk == NULL || sk->sk_socket == NULL)
1459 return;
1461 sock_hold(sk);
1462 sock_release(sk->sk_socket);
1463 release_net(sock_net(sk));
1464 sock_net_set(sk, get_net(&init_net));
1465 sock_put(sk);
1467 EXPORT_SYMBOL(sk_release_kernel);
1469 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1471 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1472 sock_update_memcg(newsk);
1476 * sk_clone_lock - clone a socket, and lock its clone
1477 * @sk: the socket to clone
1478 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1480 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1482 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1484 struct sock *newsk;
1486 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1487 if (newsk != NULL) {
1488 struct sk_filter *filter;
1490 sock_copy(newsk, sk);
1492 /* SANITY */
1493 get_net(sock_net(newsk));
1494 sk_node_init(&newsk->sk_node);
1495 sock_lock_init(newsk);
1496 bh_lock_sock(newsk);
1497 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1498 newsk->sk_backlog.len = 0;
1500 atomic_set(&newsk->sk_rmem_alloc, 0);
1502 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1504 atomic_set(&newsk->sk_wmem_alloc, 1);
1505 atomic_set(&newsk->sk_omem_alloc, 0);
1506 skb_queue_head_init(&newsk->sk_receive_queue);
1507 skb_queue_head_init(&newsk->sk_write_queue);
1508 #ifdef CONFIG_NET_DMA
1509 skb_queue_head_init(&newsk->sk_async_wait_queue);
1510 #endif
1512 spin_lock_init(&newsk->sk_dst_lock);
1513 rwlock_init(&newsk->sk_callback_lock);
1514 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1515 af_callback_keys + newsk->sk_family,
1516 af_family_clock_key_strings[newsk->sk_family]);
1518 newsk->sk_dst_cache = NULL;
1519 newsk->sk_wmem_queued = 0;
1520 newsk->sk_forward_alloc = 0;
1521 newsk->sk_send_head = NULL;
1522 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1524 sock_reset_flag(newsk, SOCK_DONE);
1525 skb_queue_head_init(&newsk->sk_error_queue);
1527 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1528 if (filter != NULL)
1529 sk_filter_charge(newsk, filter);
1531 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1532 /* It is still raw copy of parent, so invalidate
1533 * destructor and make plain sk_free() */
1534 newsk->sk_destruct = NULL;
1535 bh_unlock_sock(newsk);
1536 sk_free(newsk);
1537 newsk = NULL;
1538 goto out;
1541 newsk->sk_err = 0;
1542 newsk->sk_priority = 0;
1544 * Before updating sk_refcnt, we must commit prior changes to memory
1545 * (Documentation/RCU/rculist_nulls.txt for details)
1547 smp_wmb();
1548 atomic_set(&newsk->sk_refcnt, 2);
1551 * Increment the counter in the same struct proto as the master
1552 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1553 * is the same as sk->sk_prot->socks, as this field was copied
1554 * with memcpy).
1556 * This _changes_ the previous behaviour, where
1557 * tcp_create_openreq_child always was incrementing the
1558 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1559 * to be taken into account in all callers. -acme
1561 sk_refcnt_debug_inc(newsk);
1562 sk_set_socket(newsk, NULL);
1563 newsk->sk_wq = NULL;
1565 sk_update_clone(sk, newsk);
1567 if (newsk->sk_prot->sockets_allocated)
1568 sk_sockets_allocated_inc(newsk);
1570 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1571 net_enable_timestamp();
1573 out:
1574 return newsk;
1576 EXPORT_SYMBOL_GPL(sk_clone_lock);
1578 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1580 __sk_dst_set(sk, dst);
1581 sk->sk_route_caps = dst->dev->features;
1582 if (sk->sk_route_caps & NETIF_F_GSO)
1583 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1584 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1585 if (sk_can_gso(sk)) {
1586 if (dst->header_len) {
1587 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1588 } else {
1589 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1590 sk->sk_gso_max_size = dst->dev->gso_max_size;
1591 sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1595 EXPORT_SYMBOL_GPL(sk_setup_caps);
1598 * Simple resource managers for sockets.
1603 * Write buffer destructor automatically called from kfree_skb.
1605 void sock_wfree(struct sk_buff *skb)
1607 struct sock *sk = skb->sk;
1608 unsigned int len = skb->truesize;
1610 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1612 * Keep a reference on sk_wmem_alloc, this will be released
1613 * after sk_write_space() call
1615 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1616 sk->sk_write_space(sk);
1617 len = 1;
1620 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1621 * could not do because of in-flight packets
1623 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1624 __sk_free(sk);
1626 EXPORT_SYMBOL(sock_wfree);
1628 void skb_orphan_partial(struct sk_buff *skb)
1630 /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1631 * so we do not completely orphan the skb, but transfer all
1632 * accounted bytes but one, to avoid unexpected reorders.
1634 if (skb->destructor == sock_wfree
1635 #ifdef CONFIG_INET
1636 || skb->destructor == tcp_wfree
1637 #endif
1639 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1640 skb->truesize = 1;
1641 } else {
1642 skb_orphan(skb);
1645 EXPORT_SYMBOL(skb_orphan_partial);
1648 * Read buffer destructor automatically called from kfree_skb.
1650 void sock_rfree(struct sk_buff *skb)
1652 struct sock *sk = skb->sk;
1653 unsigned int len = skb->truesize;
1655 atomic_sub(len, &sk->sk_rmem_alloc);
1656 sk_mem_uncharge(sk, len);
1658 EXPORT_SYMBOL(sock_rfree);
1660 void sock_edemux(struct sk_buff *skb)
1662 struct sock *sk = skb->sk;
1664 #ifdef CONFIG_INET
1665 if (sk->sk_state == TCP_TIME_WAIT)
1666 inet_twsk_put(inet_twsk(sk));
1667 else
1668 #endif
1669 sock_put(sk);
1671 EXPORT_SYMBOL(sock_edemux);
1673 kuid_t sock_i_uid(struct sock *sk)
1675 kuid_t uid;
1677 read_lock_bh(&sk->sk_callback_lock);
1678 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1679 read_unlock_bh(&sk->sk_callback_lock);
1680 return uid;
1682 EXPORT_SYMBOL(sock_i_uid);
1684 unsigned long sock_i_ino(struct sock *sk)
1686 unsigned long ino;
1688 read_lock_bh(&sk->sk_callback_lock);
1689 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1690 read_unlock_bh(&sk->sk_callback_lock);
1691 return ino;
1693 EXPORT_SYMBOL(sock_i_ino);
1696 * Allocate a skb from the socket's send buffer.
1698 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1699 gfp_t priority)
1701 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1702 struct sk_buff *skb = alloc_skb(size, priority);
1703 if (skb) {
1704 skb_set_owner_w(skb, sk);
1705 return skb;
1708 return NULL;
1710 EXPORT_SYMBOL(sock_wmalloc);
1713 * Allocate a skb from the socket's receive buffer.
1715 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1716 gfp_t priority)
1718 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1719 struct sk_buff *skb = alloc_skb(size, priority);
1720 if (skb) {
1721 skb_set_owner_r(skb, sk);
1722 return skb;
1725 return NULL;
1729 * Allocate a memory block from the socket's option memory buffer.
1731 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1733 if ((unsigned int)size <= sysctl_optmem_max &&
1734 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1735 void *mem;
1736 /* First do the add, to avoid the race if kmalloc
1737 * might sleep.
1739 atomic_add(size, &sk->sk_omem_alloc);
1740 mem = kmalloc(size, priority);
1741 if (mem)
1742 return mem;
1743 atomic_sub(size, &sk->sk_omem_alloc);
1745 return NULL;
1747 EXPORT_SYMBOL(sock_kmalloc);
1750 * Free an option memory block.
1752 void sock_kfree_s(struct sock *sk, void *mem, int size)
1754 kfree(mem);
1755 atomic_sub(size, &sk->sk_omem_alloc);
1757 EXPORT_SYMBOL(sock_kfree_s);
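
/*
 * Illustrative pairing of the option-memory helpers above (editor-added
 * sketch): per-socket option state is charged against sk_omem_alloc on
 * allocation and must be released with sock_kfree_s() using the same size
 * so the accounting balances. The example_ names are hypothetical.
 */
struct example_opt_state {
	u32 flags;
	u32 value;
};

static struct example_opt_state *example_attach_opt_state(struct sock *sk)
{
	struct example_opt_state *st;

	st = sock_kmalloc(sk, sizeof(*st), GFP_KERNEL);
	if (!st)
		return NULL;
	st->flags = 0;
	st->value = 0;
	return st;
}

static void example_detach_opt_state(struct sock *sk,
				     struct example_opt_state *st)
{
	sock_kfree_s(sk, st, sizeof(*st));	/* same size as the sock_kmalloc() */
}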
1759 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1760 I think these locks should be removed for datagram sockets.
1762 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1764 DEFINE_WAIT(wait);
1766 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1767 for (;;) {
1768 if (!timeo)
1769 break;
1770 if (signal_pending(current))
1771 break;
1772 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1773 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1774 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1775 break;
1776 if (sk->sk_shutdown & SEND_SHUTDOWN)
1777 break;
1778 if (sk->sk_err)
1779 break;
1780 timeo = schedule_timeout(timeo);
1782 finish_wait(sk_sleep(sk), &wait);
1783 return timeo;
1788 * Generic send/receive buffer handlers
1791 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1792 unsigned long data_len, int noblock,
1793 int *errcode, int max_page_order)
1795 struct sk_buff *skb = NULL;
1796 unsigned long chunk;
1797 gfp_t gfp_mask;
1798 long timeo;
1799 int err;
1800 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1801 struct page *page;
1802 int i;
1804 err = -EMSGSIZE;
1805 if (npages > MAX_SKB_FRAGS)
1806 goto failure;
1808 timeo = sock_sndtimeo(sk, noblock);
1809 while (!skb) {
1810 err = sock_error(sk);
1811 if (err != 0)
1812 goto failure;
1814 err = -EPIPE;
1815 if (sk->sk_shutdown & SEND_SHUTDOWN)
1816 goto failure;
1818 if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1819 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1820 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1821 err = -EAGAIN;
1822 if (!timeo)
1823 goto failure;
1824 if (signal_pending(current))
1825 goto interrupted;
1826 timeo = sock_wait_for_wmem(sk, timeo);
1827 continue;
1830 err = -ENOBUFS;
1831 gfp_mask = sk->sk_allocation;
1832 if (gfp_mask & __GFP_WAIT)
1833 gfp_mask |= __GFP_REPEAT;
1835 skb = alloc_skb(header_len, gfp_mask);
1836 if (!skb)
1837 goto failure;
1839 skb->truesize += data_len;
1841 for (i = 0; npages > 0; i++) {
1842 int order = max_page_order;
1844 while (order) {
1845 if (npages >= 1 << order) {
1846 page = alloc_pages(sk->sk_allocation |
1847 __GFP_COMP |
1848 __GFP_NOWARN |
1849 __GFP_NORETRY,
1850 order);
1851 if (page)
1852 goto fill_page;
1854 order--;
1856 page = alloc_page(sk->sk_allocation);
1857 if (!page)
1858 goto failure;
1859 fill_page:
1860 chunk = min_t(unsigned long, data_len,
1861 PAGE_SIZE << order);
1862 skb_fill_page_desc(skb, i, page, 0, chunk);
1863 data_len -= chunk;
1864 npages -= 1 << order;
1868 skb_set_owner_w(skb, sk);
1869 return skb;
1871 interrupted:
1872 err = sock_intr_errno(timeo);
1873 failure:
1874 kfree_skb(skb);
1875 *errcode = err;
1876 return NULL;
1878 EXPORT_SYMBOL(sock_alloc_send_pskb);
1880 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1881 int noblock, int *errcode)
1883 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1885 EXPORT_SYMBOL(sock_alloc_send_skb);
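
/*
 * Sketch of a typical caller (editor-added, not from this file): a datagram
 * sendmsg path allocates a send-buffer-accounted skb, honouring
 * MSG_DONTWAIT, and lets sock_alloc_send_skb() fill in the error code on a
 * NULL return. The example_ helper and the MAX_HEADER headroom choice are
 * illustrative assumptions.
 */
static struct sk_buff *example_alloc_dgram(struct sock *sk, size_t len,
					   int msg_flags, int *errp)
{
	return sock_alloc_send_skb(sk, len + MAX_HEADER,
				   msg_flags & MSG_DONTWAIT, errp);
}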
1887 /* On 32bit arches, an skb frag is limited to 2^15 */
1888 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1890 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1892 int order;
1894 if (pfrag->page) {
1895 if (atomic_read(&pfrag->page->_count) == 1) {
1896 pfrag->offset = 0;
1897 return true;
1899 if (pfrag->offset < pfrag->size)
1900 return true;
1901 put_page(pfrag->page);
1904 /* We restrict high order allocations to users that can afford to wait */
1905 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1907 do {
1908 gfp_t gfp = sk->sk_allocation;
1910 if (order)
1911 gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
1912 pfrag->page = alloc_pages(gfp, order);
1913 if (likely(pfrag->page)) {
1914 pfrag->offset = 0;
1915 pfrag->size = PAGE_SIZE << order;
1916 return true;
1918 } while (--order >= 0);
1920 sk_enter_memory_pressure(sk);
1921 sk_stream_moderate_sndbuf(sk);
1922 return false;
1924 EXPORT_SYMBOL(sk_page_frag_refill);
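
/*
 * Sketch of the usual caller pattern for sk_page_frag_refill() (editor-added,
 * not from this file): a sendmsg path copies payload into the per-socket or
 * per-task page fragment returned by sk_page_frag(), refilling it on demand.
 * The example_ helper is hypothetical and the actual copy is elided.
 */
static int example_append_to_frag(struct sock *sk, unsigned int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* a real caller would wait for memory */

	copy = min_t(unsigned int, copy, pfrag->size - pfrag->offset);
	/* ... copy 'copy' bytes of payload into pfrag->page at
	 * pfrag->offset, then account for them ...
	 */
	pfrag->offset += copy;
	return copy;
}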
1926 static void __lock_sock(struct sock *sk)
1927 __releases(&sk->sk_lock.slock)
1928 __acquires(&sk->sk_lock.slock)
1930 DEFINE_WAIT(wait);
1932 for (;;) {
1933 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1934 TASK_UNINTERRUPTIBLE);
1935 spin_unlock_bh(&sk->sk_lock.slock);
1936 schedule();
1937 spin_lock_bh(&sk->sk_lock.slock);
1938 if (!sock_owned_by_user(sk))
1939 break;
1941 finish_wait(&sk->sk_lock.wq, &wait);
1944 static void __release_sock(struct sock *sk)
1945 __releases(&sk->sk_lock.slock)
1946 __acquires(&sk->sk_lock.slock)
1948 struct sk_buff *skb = sk->sk_backlog.head;
1950 do {
1951 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1952 bh_unlock_sock(sk);
1954 do {
1955 struct sk_buff *next = skb->next;
1957 prefetch(next);
1958 WARN_ON_ONCE(skb_dst_is_noref(skb));
1959 skb->next = NULL;
1960 sk_backlog_rcv(sk, skb);
1963 * We are in process context here with softirqs
1964 * disabled, use cond_resched_softirq() to preempt.
1965 * This is safe to do because we've taken the backlog
1966 * queue private:
1968 cond_resched_softirq();
1970 skb = next;
1971 } while (skb != NULL);
1973 bh_lock_sock(sk);
1974 } while ((skb = sk->sk_backlog.head) != NULL);
1977 * Doing the zeroing here guarantees we cannot loop forever
1978 * while a wild producer attempts to flood us.
1980 sk->sk_backlog.len = 0;
1984 * sk_wait_data - wait for data to arrive at sk_receive_queue
1985 * @sk: sock to wait on
1986 * @timeo: how long to wait (in jiffies)
1988 * Now socket state including sk->sk_err is changed only under lock,
1989 * hence we may omit checks after joining wait queue.
1990 * We check receive queue before schedule() only as optimization;
1991 * it is very likely that release_sock() added new data.
1993 int sk_wait_data(struct sock *sk, long *timeo)
1995 int rc;
1996 DEFINE_WAIT(wait);
1998 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1999 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2000 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
2001 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2002 finish_wait(sk_sleep(sk), &wait);
2003 return rc;
2005 EXPORT_SYMBOL(sk_wait_data);
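
/*
 * Sketch of how a blocking receive path uses sk_wait_data() (editor-added,
 * not from this file): with the socket locked, wait until the receive queue
 * is non-empty, the timeout expires, or a signal is pending. The example_
 * helper is hypothetical; a real caller maps the NULL returns to -EAGAIN or
 * a signal-derived error.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, long *timeo)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!*timeo)
			return NULL;		/* would become -EAGAIN */
		if (signal_pending(current))
			return NULL;		/* would become -EINTR/-ERESTARTSYS */
		sk_wait_data(sk, timeo);
	}
	return skb;
}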
2008 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2009 * @sk: socket
2010 * @size: memory size to allocate
2011 * @kind: allocation type
2013 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2014 * rmem allocation. This function assumes that protocols which have
2015 * memory_pressure use sk_wmem_queued as write buffer accounting.
2017 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2019 struct proto *prot = sk->sk_prot;
2020 int amt = sk_mem_pages(size);
2021 long allocated;
2022 int parent_status = UNDER_LIMIT;
2024 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2026 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2028 /* Under limit. */
2029 if (parent_status == UNDER_LIMIT &&
2030 allocated <= sk_prot_mem_limits(sk, 0)) {
2031 sk_leave_memory_pressure(sk);
2032 return 1;
2035 /* Under pressure. (we or our parents) */
2036 if ((parent_status > SOFT_LIMIT) ||
2037 allocated > sk_prot_mem_limits(sk, 1))
2038 sk_enter_memory_pressure(sk);
2040 /* Over hard limit (we or our parents) */
2041 if ((parent_status == OVER_LIMIT) ||
2042 (allocated > sk_prot_mem_limits(sk, 2)))
2043 goto suppress_allocation;
2045 /* guarantee minimum buffer size under pressure */
2046 if (kind == SK_MEM_RECV) {
2047 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2048 return 1;
2050 } else { /* SK_MEM_SEND */
2051 if (sk->sk_type == SOCK_STREAM) {
2052 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2053 return 1;
2054 } else if (atomic_read(&sk->sk_wmem_alloc) <
2055 prot->sysctl_wmem[0])
2056 return 1;
2059 if (sk_has_memory_pressure(sk)) {
2060 int alloc;
2062 if (!sk_under_memory_pressure(sk))
2063 return 1;
2064 alloc = sk_sockets_allocated_read_positive(sk);
2065 if (sk_prot_mem_limits(sk, 2) > alloc *
2066 sk_mem_pages(sk->sk_wmem_queued +
2067 atomic_read(&sk->sk_rmem_alloc) +
2068 sk->sk_forward_alloc))
2069 return 1;
2072 suppress_allocation:
2074 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2075 sk_stream_moderate_sndbuf(sk);
2077 /* Fail only if socket is _under_ its sndbuf.
2078 * In this case we cannot block, so that we have to fail.
2080 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2081 return 1;
2084 trace_sock_exceed_buf_limit(sk, prot, allocated);
2086 /* Alas. Undo changes. */
2087 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2089 sk_memory_allocated_sub(sk, amt);
2091 return 0;
2093 EXPORT_SYMBOL(__sk_mem_schedule);
2096 * __sk_mem_reclaim - reclaim memory_allocated
2097 * @sk: socket
2099 void __sk_mem_reclaim(struct sock *sk)
2101 sk_memory_allocated_sub(sk,
2102 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2103 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2105 if (sk_under_memory_pressure(sk) &&
2106 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2107 sk_leave_memory_pressure(sk);
2109 EXPORT_SYMBOL(__sk_mem_reclaim);
2113 * Set of default routines for initialising struct proto_ops when
2114 * the protocol does not support a particular function. In certain
2115 * cases where it makes no sense for a protocol to have a "do nothing"
2116 * function, some default processing is provided.
2119 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2121 return -EOPNOTSUPP;
2123 EXPORT_SYMBOL(sock_no_bind);
2125 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2126 int len, int flags)
2128 return -EOPNOTSUPP;
2130 EXPORT_SYMBOL(sock_no_connect);
2132 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2134 return -EOPNOTSUPP;
2136 EXPORT_SYMBOL(sock_no_socketpair);
2138 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2140 return -EOPNOTSUPP;
2142 EXPORT_SYMBOL(sock_no_accept);
2144 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2145 int *len, int peer)
2147 return -EOPNOTSUPP;
2149 EXPORT_SYMBOL(sock_no_getname);
2151 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2153 return 0;
2155 EXPORT_SYMBOL(sock_no_poll);
2157 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2159 return -EOPNOTSUPP;
2161 EXPORT_SYMBOL(sock_no_ioctl);
2163 int sock_no_listen(struct socket *sock, int backlog)
2165 return -EOPNOTSUPP;
2167 EXPORT_SYMBOL(sock_no_listen);
2169 int sock_no_shutdown(struct socket *sock, int how)
2171 return -EOPNOTSUPP;
2173 EXPORT_SYMBOL(sock_no_shutdown);
2175 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2176 char __user *optval, unsigned int optlen)
2178 return -EOPNOTSUPP;
2180 EXPORT_SYMBOL(sock_no_setsockopt);
2182 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2183 char __user *optval, int __user *optlen)
2185 return -EOPNOTSUPP;
2187 EXPORT_SYMBOL(sock_no_getsockopt);
2189 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2190 size_t len)
2192 return -EOPNOTSUPP;
2194 EXPORT_SYMBOL(sock_no_sendmsg);
2196 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2197 size_t len, int flags)
2199 return -EOPNOTSUPP;
2201 EXPORT_SYMBOL(sock_no_recvmsg);
2203 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2205 /* Mirror missing mmap method error code */
2206 return -ENODEV;
2208 EXPORT_SYMBOL(sock_no_mmap);
2210 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2212 ssize_t res;
2213 struct msghdr msg = {.msg_flags = flags};
2214 struct kvec iov;
2215 char *kaddr = kmap(page);
2216 iov.iov_base = kaddr + offset;
2217 iov.iov_len = size;
2218 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2219 kunmap(page);
2220 return res;
2222 EXPORT_SYMBOL(sock_no_sendpage);
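/*
 * These stubs are meant to be plugged into a proto_ops table for the
 * operations a family does not implement. A minimal sketch, with a
 * hypothetical "example_ops" table and placeholder family:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_UNSPEC,
 *		.owner		= THIS_MODULE,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */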
2224 /*
2225 * Default Socket Callbacks
2226 */
2228 static void sock_def_wakeup(struct sock *sk)
2230 struct socket_wq *wq;
2232 rcu_read_lock();
2233 wq = rcu_dereference(sk->sk_wq);
2234 if (wq_has_sleeper(wq))
2235 wake_up_interruptible_all(&wq->wait);
2236 rcu_read_unlock();
2239 static void sock_def_error_report(struct sock *sk)
2241 struct socket_wq *wq;
2243 rcu_read_lock();
2244 wq = rcu_dereference(sk->sk_wq);
2245 if (wq_has_sleeper(wq))
2246 wake_up_interruptible_poll(&wq->wait, POLLERR);
2247 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2248 rcu_read_unlock();
2251 static void sock_def_readable(struct sock *sk, int len)
2253 struct socket_wq *wq;
2255 rcu_read_lock();
2256 wq = rcu_dereference(sk->sk_wq);
2257 if (wq_has_sleeper(wq))
2258 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2259 POLLRDNORM | POLLRDBAND);
2260 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2261 rcu_read_unlock();
2264 static void sock_def_write_space(struct sock *sk)
2266 struct socket_wq *wq;
2268 rcu_read_lock();
2270 /* Do not wake up a writer until he can make "significant"
2271 * progress. --DaveM
2272 */
2273 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2274 wq = rcu_dereference(sk->sk_wq);
2275 if (wq_has_sleeper(wq))
2276 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2277 POLLWRNORM | POLLWRBAND);
2279 /* Should agree with poll, otherwise some programs break */
2280 if (sock_writeable(sk))
2281 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2284 rcu_read_unlock();
2287 static void sock_def_destruct(struct sock *sk)
2289 kfree(sk->sk_protinfo);
2292 void sk_send_sigurg(struct sock *sk)
2294 if (sk->sk_socket && sk->sk_socket->file)
2295 if (send_sigurg(&sk->sk_socket->file->f_owner))
2296 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2298 EXPORT_SYMBOL(sk_send_sigurg);
2300 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2301 unsigned long expires)
2303 if (!mod_timer(timer, expires))
2304 sock_hold(sk);
2306 EXPORT_SYMBOL(sk_reset_timer);
2308 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2310 if (del_timer(timer))
2311 __sock_put(sk);
2313 EXPORT_SYMBOL(sk_stop_timer);
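/*
 * These two helpers keep the socket refcount in step with a pending timer:
 * an armed timer holds a reference, which is dropped when the timer is
 * stopped or when its handler releases it. A sketch of typical protocol
 * usage:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */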
2315 void sock_init_data(struct socket *sock, struct sock *sk)
2317 skb_queue_head_init(&sk->sk_receive_queue);
2318 skb_queue_head_init(&sk->sk_write_queue);
2319 skb_queue_head_init(&sk->sk_error_queue);
2320 #ifdef CONFIG_NET_DMA
2321 skb_queue_head_init(&sk->sk_async_wait_queue);
2322 #endif
2324 sk->sk_send_head = NULL;
2326 init_timer(&sk->sk_timer);
2328 sk->sk_allocation = GFP_KERNEL;
2329 sk->sk_rcvbuf = sysctl_rmem_default;
2330 sk->sk_sndbuf = sysctl_wmem_default;
2331 sk->sk_state = TCP_CLOSE;
2332 sk_set_socket(sk, sock);
2334 sock_set_flag(sk, SOCK_ZAPPED);
2336 if (sock) {
2337 sk->sk_type = sock->type;
2338 sk->sk_wq = sock->wq;
2339 sock->sk = sk;
2340 } else
2341 sk->sk_wq = NULL;
2343 spin_lock_init(&sk->sk_dst_lock);
2344 rwlock_init(&sk->sk_callback_lock);
2345 lockdep_set_class_and_name(&sk->sk_callback_lock,
2346 af_callback_keys + sk->sk_family,
2347 af_family_clock_key_strings[sk->sk_family]);
2349 sk->sk_state_change = sock_def_wakeup;
2350 sk->sk_data_ready = sock_def_readable;
2351 sk->sk_write_space = sock_def_write_space;
2352 sk->sk_error_report = sock_def_error_report;
2353 sk->sk_destruct = sock_def_destruct;
2355 sk->sk_frag.page = NULL;
2356 sk->sk_frag.offset = 0;
2357 sk->sk_peek_off = -1;
2359 sk->sk_peer_pid = NULL;
2360 sk->sk_peer_cred = NULL;
2361 sk->sk_write_pending = 0;
2362 sk->sk_rcvlowat = 1;
2363 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2364 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2366 sk->sk_stamp = ktime_set(-1L, 0);
2368 #ifdef CONFIG_NET_RX_BUSY_POLL
2369 sk->sk_napi_id = 0;
2370 sk->sk_ll_usec = sysctl_net_busy_read;
2371 #endif
2373 sk->sk_pacing_rate = ~0U;
2374 /*
2375 * Before updating sk_refcnt, we must commit prior changes to memory
2376 * (Documentation/RCU/rculist_nulls.txt for details)
2377 */
2378 smp_wmb();
2379 atomic_set(&sk->sk_refcnt, 1);
2380 atomic_set(&sk->sk_drops, 0);
2382 EXPORT_SYMBOL(sock_init_data);
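/*
 * A family's create routine typically allocates the sock and runs
 * sock_init_data() before its protocol-specific setup, optionally
 * overriding the default callbacks installed above. A sketch, where
 * example_prot and example_data_ready are hypothetical:
 *
 *	sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_prot);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready = example_data_ready;
 */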
2384 void lock_sock_nested(struct sock *sk, int subclass)
2386 might_sleep();
2387 spin_lock_bh(&sk->sk_lock.slock);
2388 if (sk->sk_lock.owned)
2389 __lock_sock(sk);
2390 sk->sk_lock.owned = 1;
2391 spin_unlock(&sk->sk_lock.slock);
2392 /*
2393 * The sk_lock has mutex_lock() semantics here:
2394 */
2395 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2396 local_bh_enable();
2398 EXPORT_SYMBOL(lock_sock_nested);
2400 void release_sock(struct sock *sk)
2402 /*
2403 * The sk_lock has mutex_unlock() semantics:
2404 */
2405 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2407 spin_lock_bh(&sk->sk_lock.slock);
2408 if (sk->sk_backlog.tail)
2409 __release_sock(sk);
2411 /* Warning: release_cb() might need to release sk ownership,
2412 * i.e. call sock_release_ownership(sk) before us.
2413 */
2414 if (sk->sk_prot->release_cb)
2415 sk->sk_prot->release_cb(sk);
2417 sock_release_ownership(sk);
2418 if (waitqueue_active(&sk->sk_lock.wq))
2419 wake_up(&sk->sk_lock.wq);
2420 spin_unlock_bh(&sk->sk_lock.slock);
2422 EXPORT_SYMBOL(release_sock);
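/*
 * The usual process-context pattern pairs these as in the sketch below;
 * lock_sock() is the subclass-0 wrapper around lock_sock_nested():
 *
 *	lock_sock(sk);
 *	... modify socket state, possibly sleeping ...
 *	release_sock(sk);
 */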
2424 /**
2425 * lock_sock_fast - fast version of lock_sock
2426 * @sk: socket
2427 *
2428 * This version should be used for very small sections, where the process won't block.
2429 * It returns false if the fast path was taken:
2430 *   sk_lock.slock locked, owned = 0, BH disabled
2431 * It returns true if the slow path was taken:
2432 *   sk_lock.slock unlocked, owned = 1, BH enabled
2433 */
2434 bool lock_sock_fast(struct sock *sk)
2436 might_sleep();
2437 spin_lock_bh(&sk->sk_lock.slock);
2439 if (!sk->sk_lock.owned)
2440 /*
2441 * Note : We must disable BH
2442 */
2443 return false;
2445 __lock_sock(sk);
2446 sk->sk_lock.owned = 1;
2447 spin_unlock(&sk->sk_lock.slock);
2448 /*
2449 * The sk_lock has mutex_lock() semantics here:
2450 */
2451 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2452 local_bh_enable();
2453 return true;
2455 EXPORT_SYMBOL(lock_sock_fast);
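/*
 * lock_sock_fast() is paired with unlock_sock_fast(), which takes the
 * returned value so it can undo whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */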
2457 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2459 struct timeval tv;
2460 if (!sock_flag(sk, SOCK_TIMESTAMP))
2461 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2462 tv = ktime_to_timeval(sk->sk_stamp);
2463 if (tv.tv_sec == -1)
2464 return -ENOENT;
2465 if (tv.tv_sec == 0) {
2466 sk->sk_stamp = ktime_get_real();
2467 tv = ktime_to_timeval(sk->sk_stamp);
2469 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2471 EXPORT_SYMBOL(sock_get_timestamp);
2473 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2475 struct timespec ts;
2476 if (!sock_flag(sk, SOCK_TIMESTAMP))
2477 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2478 ts = ktime_to_timespec(sk->sk_stamp);
2479 if (ts.tv_sec == -1)
2480 return -ENOENT;
2481 if (ts.tv_sec == 0) {
2482 sk->sk_stamp = ktime_get_real();
2483 ts = ktime_to_timespec(sk->sk_stamp);
2485 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2487 EXPORT_SYMBOL(sock_get_timestampns);
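/*
 * These two helpers back the SIOCGSTAMP/SIOCGSTAMPNS ioctls; a family's
 * ioctl handler typically dispatches to them roughly as follows:
 *
 *	case SIOCGSTAMP:
 *		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
 *		break;
 *	case SIOCGSTAMPNS:
 *		err = sock_get_timestampns(sk, (struct timespec __user *)arg);
 *		break;
 */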
2489 void sock_enable_timestamp(struct sock *sk, int flag)
2491 if (!sock_flag(sk, flag)) {
2492 unsigned long previous_flags = sk->sk_flags;
2494 sock_set_flag(sk, flag);
2495 /*
2496 * we just set one of the two flags which require net
2497 * time stamping, but time stamping might have been on
2498 * already because of the other one
2499 */
2500 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2501 net_enable_timestamp();
2505 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2506 int level, int type)
2508 struct sock_exterr_skb *serr;
2509 struct sk_buff *skb, *skb2;
2510 int copied, err;
2512 err = -EAGAIN;
2513 skb = skb_dequeue(&sk->sk_error_queue);
2514 if (skb == NULL)
2515 goto out;
2517 copied = skb->len;
2518 if (copied > len) {
2519 msg->msg_flags |= MSG_TRUNC;
2520 copied = len;
2522 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2523 if (err)
2524 goto out_free_skb;
2526 sock_recv_timestamp(msg, sk, skb);
2528 serr = SKB_EXT_ERR(skb);
2529 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2531 msg->msg_flags |= MSG_ERRQUEUE;
2532 err = copied;
2534 /* Reset and regenerate socket error */
2535 spin_lock_bh(&sk->sk_error_queue.lock);
2536 sk->sk_err = 0;
2537 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2538 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2539 spin_unlock_bh(&sk->sk_error_queue.lock);
2540 sk->sk_error_report(sk);
2541 } else
2542 spin_unlock_bh(&sk->sk_error_queue.lock);
2544 out_free_skb:
2545 kfree_skb(skb);
2546 out:
2547 return err;
2549 EXPORT_SYMBOL(sock_recv_errqueue);
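/*
 * A protocol's recvmsg() implementation hands MSG_ERRQUEUE requests off to
 * this helper with its own cmsg level and type, for example (sketch):
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
 */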
2551 /*
2552 * Get a socket option on a socket.
2553 *
2554 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2555 * asynchronous errors should be reported by getsockopt. We assume
2556 * this means if you specify SO_ERROR (otherwise what's the point of it).
2557 */
2558 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2559 char __user *optval, int __user *optlen)
2561 struct sock *sk = sock->sk;
2563 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2565 EXPORT_SYMBOL(sock_common_getsockopt);
2567 #ifdef CONFIG_COMPAT
2568 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2569 char __user *optval, int __user *optlen)
2571 struct sock *sk = sock->sk;
2573 if (sk->sk_prot->compat_getsockopt != NULL)
2574 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2575 optval, optlen);
2576 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2578 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2579 #endif
2581 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2582 struct msghdr *msg, size_t size, int flags)
2584 struct sock *sk = sock->sk;
2585 int addr_len = 0;
2586 int err;
2588 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2589 flags & ~MSG_DONTWAIT, &addr_len);
2590 if (err >= 0)
2591 msg->msg_namelen = addr_len;
2592 return err;
2594 EXPORT_SYMBOL(sock_common_recvmsg);
2596 /*
2597 * Set socket options on an inet socket.
2598 */
2599 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2600 char __user *optval, unsigned int optlen)
2602 struct sock *sk = sock->sk;
2604 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2606 EXPORT_SYMBOL(sock_common_setsockopt);
2608 #ifdef CONFIG_COMPAT
2609 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2610 char __user *optval, unsigned int optlen)
2612 struct sock *sk = sock->sk;
2614 if (sk->sk_prot->compat_setsockopt != NULL)
2615 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2616 optval, optlen);
2617 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2619 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2620 #endif
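/*
 * The sock_common_*() wrappers let a proto_ops table forward straight to
 * the struct proto handlers, e.g. this illustrative fragment of an ops
 * table:
 *
 *	.setsockopt	= sock_common_setsockopt,
 *	.getsockopt	= sock_common_getsockopt,
 *	.recvmsg	= sock_common_recvmsg,
 */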
2622 void sk_common_release(struct sock *sk)
2624 if (sk->sk_prot->destroy)
2625 sk->sk_prot->destroy(sk);
2627 /*
2628 * Observation: when sk_common_release is called, processes have
2629 * no access to the socket, but the network stack still does.
2630 * Step one: detach it from networking:
2631 *
2632 * A. Remove it from the hash tables.
2633 */
2635 sk->sk_prot->unhash(sk);
2637 /*
2638 * At this point the socket cannot receive new packets, but it is possible
2639 * that some packets are in flight because some CPU runs the receiver and
2640 * did the hash table lookup before we unhashed the socket. They will reach
2641 * the receive queue and will be purged by the socket destructor.
2642 *
2643 * Also we still have packets pending on the receive queue and, probably,
2644 * our own packets waiting in device queues. sock_destroy will drain the
2645 * receive queue, but transmitted packets will delay socket destruction
2646 * until the last reference is released.
2647 */
2649 sock_orphan(sk);
2651 xfrm_sk_free_policy(sk);
2653 sk_refcnt_debug_release(sk);
2655 if (sk->sk_frag.page) {
2656 put_page(sk->sk_frag.page);
2657 sk->sk_frag.page = NULL;
2660 sock_put(sk);
2662 EXPORT_SYMBOL(sk_common_release);
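/*
 * Datagram-style protocols typically funnel their close path here; a
 * minimal sketch, with a hypothetical example_close():
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */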
2664 #ifdef CONFIG_PROC_FS
2665 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2666 struct prot_inuse {
2667 int val[PROTO_INUSE_NR];
2670 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2672 #ifdef CONFIG_NET_NS
2673 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2675 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2677 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2679 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2681 int cpu, idx = prot->inuse_idx;
2682 int res = 0;
2684 for_each_possible_cpu(cpu)
2685 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2687 return res >= 0 ? res : 0;
2689 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2691 static int __net_init sock_inuse_init_net(struct net *net)
2693 net->core.inuse = alloc_percpu(struct prot_inuse);
2694 return net->core.inuse ? 0 : -ENOMEM;
2697 static void __net_exit sock_inuse_exit_net(struct net *net)
2699 free_percpu(net->core.inuse);
2702 static struct pernet_operations net_inuse_ops = {
2703 .init = sock_inuse_init_net,
2704 .exit = sock_inuse_exit_net,
2707 static __init int net_inuse_init(void)
2709 if (register_pernet_subsys(&net_inuse_ops))
2710 panic("Cannot initialize net inuse counters");
2712 return 0;
2715 core_initcall(net_inuse_init);
2716 #else
2717 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2719 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2721 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2723 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2725 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2727 int cpu, idx = prot->inuse_idx;
2728 int res = 0;
2730 for_each_possible_cpu(cpu)
2731 res += per_cpu(prot_inuse, cpu).val[idx];
2733 return res >= 0 ? res : 0;
2735 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2736 #endif
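/*
 * Protocols update these counters from their hash/unhash callbacks,
 * e.g. (sketch):
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 */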
2738 static void assign_proto_idx(struct proto *prot)
2740 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2742 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2743 pr_err("PROTO_INUSE_NR exhausted\n");
2744 return;
2747 set_bit(prot->inuse_idx, proto_inuse_idx);
2750 static void release_proto_idx(struct proto *prot)
2752 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2753 clear_bit(prot->inuse_idx, proto_inuse_idx);
2755 #else
2756 static inline void assign_proto_idx(struct proto *prot)
2760 static inline void release_proto_idx(struct proto *prot)
2763 #endif
2765 int proto_register(struct proto *prot, int alloc_slab)
2767 if (alloc_slab) {
2768 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2769 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2770 NULL);
2772 if (prot->slab == NULL) {
2773 pr_crit("%s: Can't create sock SLAB cache!\n",
2774 prot->name);
2775 goto out;
2778 if (prot->rsk_prot != NULL) {
2779 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2780 if (prot->rsk_prot->slab_name == NULL)
2781 goto out_free_sock_slab;
2783 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2784 prot->rsk_prot->obj_size, 0,
2785 SLAB_HWCACHE_ALIGN, NULL);
2787 if (prot->rsk_prot->slab == NULL) {
2788 pr_crit("%s: Can't create request sock SLAB cache!\n",
2789 prot->name);
2790 goto out_free_request_sock_slab_name;
2794 if (prot->twsk_prot != NULL) {
2795 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2797 if (prot->twsk_prot->twsk_slab_name == NULL)
2798 goto out_free_request_sock_slab;
2800 prot->twsk_prot->twsk_slab =
2801 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2802 prot->twsk_prot->twsk_obj_size,
2804 SLAB_HWCACHE_ALIGN |
2805 prot->slab_flags,
2806 NULL);
2807 if (prot->twsk_prot->twsk_slab == NULL)
2808 goto out_free_timewait_sock_slab_name;
2812 mutex_lock(&proto_list_mutex);
2813 list_add(&prot->node, &proto_list);
2814 assign_proto_idx(prot);
2815 mutex_unlock(&proto_list_mutex);
2816 return 0;
2818 out_free_timewait_sock_slab_name:
2819 kfree(prot->twsk_prot->twsk_slab_name);
2820 out_free_request_sock_slab:
2821 if (prot->rsk_prot && prot->rsk_prot->slab) {
2822 kmem_cache_destroy(prot->rsk_prot->slab);
2823 prot->rsk_prot->slab = NULL;
2825 out_free_request_sock_slab_name:
2826 if (prot->rsk_prot)
2827 kfree(prot->rsk_prot->slab_name);
2828 out_free_sock_slab:
2829 kmem_cache_destroy(prot->slab);
2830 prot->slab = NULL;
2831 out:
2832 return -ENOBUFS;
2834 EXPORT_SYMBOL(proto_register);
2836 void proto_unregister(struct proto *prot)
2838 mutex_lock(&proto_list_mutex);
2839 release_proto_idx(prot);
2840 list_del(&prot->node);
2841 mutex_unlock(&proto_list_mutex);
2843 if (prot->slab != NULL) {
2844 kmem_cache_destroy(prot->slab);
2845 prot->slab = NULL;
2848 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2849 kmem_cache_destroy(prot->rsk_prot->slab);
2850 kfree(prot->rsk_prot->slab_name);
2851 prot->rsk_prot->slab = NULL;
2854 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2855 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2856 kfree(prot->twsk_prot->twsk_slab_name);
2857 prot->twsk_prot->twsk_slab = NULL;
2860 EXPORT_SYMBOL(proto_unregister);
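/*
 * A protocol module pairs these in its init/exit paths; a sketch with a
 * hypothetical example_prot (obj_size must be at least sizeof(struct sock)):
 *
 *	static struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct sock),
 *	};
 *
 *	err = proto_register(&example_prot, 1);
 *	...
 *	proto_unregister(&example_prot);
 */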
2862 #ifdef CONFIG_PROC_FS
2863 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2864 __acquires(proto_list_mutex)
2866 mutex_lock(&proto_list_mutex);
2867 return seq_list_start_head(&proto_list, *pos);
2870 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2872 return seq_list_next(v, &proto_list, pos);
2875 static void proto_seq_stop(struct seq_file *seq, void *v)
2876 __releases(proto_list_mutex)
2878 mutex_unlock(&proto_list_mutex);
2881 static char proto_method_implemented(const void *method)
2883 return method == NULL ? 'n' : 'y';
2885 static long sock_prot_memory_allocated(struct proto *proto)
2887 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2890 static char *sock_prot_memory_pressure(struct proto *proto)
2892 return proto->memory_pressure != NULL ?
2893 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2896 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2899 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2900 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2901 proto->name,
2902 proto->obj_size,
2903 sock_prot_inuse_get(seq_file_net(seq), proto),
2904 sock_prot_memory_allocated(proto),
2905 sock_prot_memory_pressure(proto),
2906 proto->max_header,
2907 proto->slab == NULL ? "no" : "yes",
2908 module_name(proto->owner),
2909 proto_method_implemented(proto->close),
2910 proto_method_implemented(proto->connect),
2911 proto_method_implemented(proto->disconnect),
2912 proto_method_implemented(proto->accept),
2913 proto_method_implemented(proto->ioctl),
2914 proto_method_implemented(proto->init),
2915 proto_method_implemented(proto->destroy),
2916 proto_method_implemented(proto->shutdown),
2917 proto_method_implemented(proto->setsockopt),
2918 proto_method_implemented(proto->getsockopt),
2919 proto_method_implemented(proto->sendmsg),
2920 proto_method_implemented(proto->recvmsg),
2921 proto_method_implemented(proto->sendpage),
2922 proto_method_implemented(proto->bind),
2923 proto_method_implemented(proto->backlog_rcv),
2924 proto_method_implemented(proto->hash),
2925 proto_method_implemented(proto->unhash),
2926 proto_method_implemented(proto->get_port),
2927 proto_method_implemented(proto->enter_memory_pressure));
2930 static int proto_seq_show(struct seq_file *seq, void *v)
2932 if (v == &proto_list)
2933 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2934 "protocol",
2935 "size",
2936 "sockets",
2937 "memory",
2938 "press",
2939 "maxhdr",
2940 "slab",
2941 "module",
2942 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2943 else
2944 proto_seq_printf(seq, list_entry(v, struct proto, node));
2945 return 0;
2948 static const struct seq_operations proto_seq_ops = {
2949 .start = proto_seq_start,
2950 .next = proto_seq_next,
2951 .stop = proto_seq_stop,
2952 .show = proto_seq_show,
2955 static int proto_seq_open(struct inode *inode, struct file *file)
2957 return seq_open_net(inode, file, &proto_seq_ops,
2958 sizeof(struct seq_net_private));
2961 static const struct file_operations proto_seq_fops = {
2962 .owner = THIS_MODULE,
2963 .open = proto_seq_open,
2964 .read = seq_read,
2965 .llseek = seq_lseek,
2966 .release = seq_release_net,
2969 static __net_init int proto_init_net(struct net *net)
2971 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2972 return -ENOMEM;
2974 return 0;
2977 static __net_exit void proto_exit_net(struct net *net)
2979 remove_proc_entry("protocols", net->proc_net);
2983 static __net_initdata struct pernet_operations proto_net_ops = {
2984 .init = proto_init_net,
2985 .exit = proto_exit_net,
2988 static int __init proto_init(void)
2990 return register_pernet_subsys(&proto_net_ops);
2993 subsys_initcall(proto_init);
2995 #endif /* PROC_FS */