net/core/sock.c (linux/fpc-iii.git, blob 4b469e367923b1e626c8043c98ac6cd004701bdc)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
117 #include <asm/uaccess.h>
119 #include <linux/netdevice.h>
120 #include <net/protocol.h>
121 #include <linux/skbuff.h>
122 #include <net/net_namespace.h>
123 #include <net/request_sock.h>
124 #include <net/sock.h>
125 #include <linux/net_tstamp.h>
126 #include <net/xfrm.h>
127 #include <linux/ipsec.h>
128 #include <net/cls_cgroup.h>
129 #include <net/netprio_cgroup.h>
131 #include <linux/filter.h>
133 #include <trace/events/sock.h>
135 #ifdef CONFIG_INET
136 #include <net/tcp.h>
137 #endif
139 static DEFINE_MUTEX(proto_list_mutex);
140 static LIST_HEAD(proto_list);
142 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
143 int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145 struct proto *proto;
146 int ret = 0;
148 mutex_lock(&proto_list_mutex);
149 list_for_each_entry(proto, &proto_list, node) {
150 if (proto->init_cgroup) {
151 ret = proto->init_cgroup(cgrp, ss);
152 if (ret)
153 goto out;
157 mutex_unlock(&proto_list_mutex);
158 return ret;
159 out:
160 list_for_each_entry_continue_reverse(proto, &proto_list, node)
161 if (proto->destroy_cgroup)
162 proto->destroy_cgroup(cgrp);
163 mutex_unlock(&proto_list_mutex);
164 return ret;
167 void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
169 struct proto *proto;
171 mutex_lock(&proto_list_mutex);
172 list_for_each_entry_reverse(proto, &proto_list, node)
173 if (proto->destroy_cgroup)
174 proto->destroy_cgroup(cgrp);
175 mutex_unlock(&proto_list_mutex);
177 #endif
180 * Each address family might have different locking rules, so we have
181 * one slock key per address family:
183 static struct lock_class_key af_family_keys[AF_MAX];
184 static struct lock_class_key af_family_slock_keys[AF_MAX];
186 struct static_key memcg_socket_limit_enabled;
187 EXPORT_SYMBOL(memcg_socket_limit_enabled);
190 * Make lock validator output more readable. (we pre-construct these
191 * strings build-time, so that runtime initialization of socket
192 * locks is fast):
194 static const char *const af_family_key_strings[AF_MAX+1] = {
195 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
196 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
197 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
198 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
199 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
200 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
201 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
202 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
203 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
204 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
205 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
206 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
207 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
208 "sk_lock-AF_NFC" , "sk_lock-AF_MAX"
210 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
211 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
212 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
213 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
214 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
215 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
216 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
217 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
218 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
219 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
220 "slock-27" , "slock-28" , "slock-AF_CAN" ,
221 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
222 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
223 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
224 "slock-AF_NFC" , "slock-AF_MAX"
226 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
227 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
228 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
229 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
230 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
231 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
232 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
233 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
234 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
235 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
236 "clock-27" , "clock-28" , "clock-AF_CAN" ,
237 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
238 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
239 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
240 "clock-AF_NFC" , "clock-AF_MAX"
244 * sk_callback_lock locking rules are per-address-family,
245 * so split the lock classes by using a per-AF key:
247 static struct lock_class_key af_callback_keys[AF_MAX];
249 /* Take into consideration the size of the struct sk_buff overhead in the
250 * determination of these values, since that is non-constant across
251 * platforms. This makes socket queueing behavior and performance
252 * not depend upon such differences.
254 #define _SK_MEM_PACKETS 256
255 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
256 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
257 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
259 /* Run time adjustable parameters. */
260 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
261 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
262 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
263 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
265 /* Maximal space eaten by iovec or ancillary data plus some space */
266 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
267 EXPORT_SYMBOL(sysctl_optmem_max);
269 #if defined(CONFIG_CGROUPS)
270 #if !defined(CONFIG_NET_CLS_CGROUP)
271 int net_cls_subsys_id = -1;
272 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
273 #endif
274 #if !defined(CONFIG_NETPRIO_CGROUP)
275 int net_prio_subsys_id = -1;
276 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
277 #endif
278 #endif
280 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
282 struct timeval tv;
284 if (optlen < sizeof(tv))
285 return -EINVAL;
286 if (copy_from_user(&tv, optval, sizeof(tv)))
287 return -EFAULT;
288 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
289 return -EDOM;
291 if (tv.tv_sec < 0) {
292 static int warned __read_mostly;
294 *timeo_p = 0;
295 if (warned < 10 && net_ratelimit()) {
296 warned++;
297 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
298 "tries to set negative timeout\n",
299 current->comm, task_pid_nr(current));
301 return 0;
303 *timeo_p = MAX_SCHEDULE_TIMEOUT;
304 if (tv.tv_sec == 0 && tv.tv_usec == 0)
305 return 0;
306 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
307 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
308 return 0;
311 static void sock_warn_obsolete_bsdism(const char *name)
313 static int warned;
314 static char warncomm[TASK_COMM_LEN];
315 if (strcmp(warncomm, current->comm) && warned < 5) {
316 strcpy(warncomm, current->comm);
317 printk(KERN_WARNING "process `%s' is using obsolete "
318 "%s SO_BSDCOMPAT\n", warncomm, name);
319 warned++;
323 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
325 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
327 if (sk->sk_flags & flags) {
328 sk->sk_flags &= ~flags;
329 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
330 net_disable_timestamp();
335 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
337 int err;
338 int skb_len;
339 unsigned long flags;
340 struct sk_buff_head *list = &sk->sk_receive_queue;
342 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
343 atomic_inc(&sk->sk_drops);
344 trace_sock_rcvqueue_full(sk, skb);
345 return -ENOMEM;
348 err = sk_filter(sk, skb);
349 if (err)
350 return err;
352 if (!sk_rmem_schedule(sk, skb->truesize)) {
353 atomic_inc(&sk->sk_drops);
354 return -ENOBUFS;
357 skb->dev = NULL;
358 skb_set_owner_r(skb, sk);
360 /* Cache the SKB length before we tack it onto the receive
361 * queue. Once it is added it no longer belongs to us and
362 * may be freed by other threads of control pulling packets
363 * from the queue.
365 skb_len = skb->len;
367 /* we escape from rcu protected region, make sure we dont leak
368 * a norefcounted dst
370 skb_dst_force(skb);
372 spin_lock_irqsave(&list->lock, flags);
373 skb->dropcount = atomic_read(&sk->sk_drops);
374 __skb_queue_tail(list, skb);
375 spin_unlock_irqrestore(&list->lock, flags);
377 if (!sock_flag(sk, SOCK_DEAD))
378 sk->sk_data_ready(sk, skb_len);
379 return 0;
381 EXPORT_SYMBOL(sock_queue_rcv_skb);
383 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
385 int rc = NET_RX_SUCCESS;
387 if (sk_filter(sk, skb))
388 goto discard_and_relse;
390 skb->dev = NULL;
392 if (sk_rcvqueues_full(sk, skb)) {
393 atomic_inc(&sk->sk_drops);
394 goto discard_and_relse;
396 if (nested)
397 bh_lock_sock_nested(sk);
398 else
399 bh_lock_sock(sk);
400 if (!sock_owned_by_user(sk)) {
402 * trylock + unlock semantics:
404 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
406 rc = sk_backlog_rcv(sk, skb);
408 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
409 } else if (sk_add_backlog(sk, skb)) {
410 bh_unlock_sock(sk);
411 atomic_inc(&sk->sk_drops);
412 goto discard_and_relse;
415 bh_unlock_sock(sk);
416 out:
417 sock_put(sk);
418 return rc;
419 discard_and_relse:
420 kfree_skb(skb);
421 goto out;
423 EXPORT_SYMBOL(sk_receive_skb);
425 void sk_reset_txq(struct sock *sk)
427 sk_tx_queue_clear(sk);
429 EXPORT_SYMBOL(sk_reset_txq);
431 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
433 struct dst_entry *dst = __sk_dst_get(sk);
435 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
436 sk_tx_queue_clear(sk);
437 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
438 dst_release(dst);
439 return NULL;
442 return dst;
444 EXPORT_SYMBOL(__sk_dst_check);
446 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
448 struct dst_entry *dst = sk_dst_get(sk);
450 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
451 sk_dst_reset(sk);
452 dst_release(dst);
453 return NULL;
456 return dst;
458 EXPORT_SYMBOL(sk_dst_check);
460 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
462 int ret = -ENOPROTOOPT;
463 #ifdef CONFIG_NETDEVICES
464 struct net *net = sock_net(sk);
465 char devname[IFNAMSIZ];
466 int index;
468 /* Sorry... */
469 ret = -EPERM;
470 if (!capable(CAP_NET_RAW))
471 goto out;
473 ret = -EINVAL;
474 if (optlen < 0)
475 goto out;
477 /* Bind this socket to a particular device like "eth0",
478 * as specified in the passed interface name. If the
479 * name is "" or the option length is zero the socket
480 * is not bound.
482 if (optlen > IFNAMSIZ - 1)
483 optlen = IFNAMSIZ - 1;
484 memset(devname, 0, sizeof(devname));
486 ret = -EFAULT;
487 if (copy_from_user(devname, optval, optlen))
488 goto out;
490 index = 0;
491 if (devname[0] != '\0') {
492 struct net_device *dev;
494 rcu_read_lock();
495 dev = dev_get_by_name_rcu(net, devname);
496 if (dev)
497 index = dev->ifindex;
498 rcu_read_unlock();
499 ret = -ENODEV;
500 if (!dev)
501 goto out;
504 lock_sock(sk);
505 sk->sk_bound_dev_if = index;
506 sk_dst_reset(sk);
507 release_sock(sk);
509 ret = 0;
511 out:
512 #endif
514 return ret;
517 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
519 if (valbool)
520 sock_set_flag(sk, bit);
521 else
522 sock_reset_flag(sk, bit);
526 * This is meant for all protocols to use and covers goings on
527 * at the socket level. Everything here is generic.
530 int sock_setsockopt(struct socket *sock, int level, int optname,
531 char __user *optval, unsigned int optlen)
533 struct sock *sk = sock->sk;
534 int val;
535 int valbool;
536 struct linger ling;
537 int ret = 0;
540 * Options without arguments
543 if (optname == SO_BINDTODEVICE)
544 return sock_bindtodevice(sk, optval, optlen);
546 if (optlen < sizeof(int))
547 return -EINVAL;
549 if (get_user(val, (int __user *)optval))
550 return -EFAULT;
552 valbool = val ? 1 : 0;
554 lock_sock(sk);
556 switch (optname) {
557 case SO_DEBUG:
558 if (val && !capable(CAP_NET_ADMIN))
559 ret = -EACCES;
560 else
561 sock_valbool_flag(sk, SOCK_DBG, valbool);
562 break;
563 case SO_REUSEADDR:
564 sk->sk_reuse = valbool;
565 break;
566 case SO_TYPE:
567 case SO_PROTOCOL:
568 case SO_DOMAIN:
569 case SO_ERROR:
570 ret = -ENOPROTOOPT;
571 break;
572 case SO_DONTROUTE:
573 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
574 break;
575 case SO_BROADCAST:
576 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
577 break;
578 case SO_SNDBUF:
579 /* Don't error on this BSD doesn't and if you think
580 about it this is right. Otherwise apps have to
581 play 'guess the biggest size' games. RCVBUF/SNDBUF
582 are treated in BSD as hints */
584 if (val > sysctl_wmem_max)
585 val = sysctl_wmem_max;
586 set_sndbuf:
587 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
588 if ((val * 2) < SOCK_MIN_SNDBUF)
589 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
590 else
591 sk->sk_sndbuf = val * 2;
594 * Wake up sending tasks if we
595 * upped the value.
597 sk->sk_write_space(sk);
598 break;
600 case SO_SNDBUFFORCE:
601 if (!capable(CAP_NET_ADMIN)) {
602 ret = -EPERM;
603 break;
605 goto set_sndbuf;
607 case SO_RCVBUF:
608 /* Don't error on this BSD doesn't and if you think
609 about it this is right. Otherwise apps have to
610 play 'guess the biggest size' games. RCVBUF/SNDBUF
611 are treated in BSD as hints */
613 if (val > sysctl_rmem_max)
614 val = sysctl_rmem_max;
615 set_rcvbuf:
616 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
618 * We double it on the way in to account for
619 * "struct sk_buff" etc. overhead. Applications
620 * assume that the SO_RCVBUF setting they make will
621 * allow that much actual data to be received on that
622 * socket.
624 * Applications are unaware that "struct sk_buff" and
625 * other overheads allocate from the receive buffer
626 * during socket buffer allocation.
628 * And after considering the possible alternatives,
629 * returning the value we actually used in getsockopt
630 * is the most desirable behavior.
632 if ((val * 2) < SOCK_MIN_RCVBUF)
633 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
634 else
635 sk->sk_rcvbuf = val * 2;
636 break;
638 case SO_RCVBUFFORCE:
639 if (!capable(CAP_NET_ADMIN)) {
640 ret = -EPERM;
641 break;
643 goto set_rcvbuf;
645 case SO_KEEPALIVE:
646 #ifdef CONFIG_INET
647 if (sk->sk_protocol == IPPROTO_TCP &&
648 sk->sk_type == SOCK_STREAM)
649 tcp_set_keepalive(sk, valbool);
650 #endif
651 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
652 break;
654 case SO_OOBINLINE:
655 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
656 break;
658 case SO_NO_CHECK:
659 sk->sk_no_check = valbool;
660 break;
662 case SO_PRIORITY:
663 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
664 sk->sk_priority = val;
665 else
666 ret = -EPERM;
667 break;
669 case SO_LINGER:
670 if (optlen < sizeof(ling)) {
671 ret = -EINVAL; /* 1003.1g */
672 break;
674 if (copy_from_user(&ling, optval, sizeof(ling))) {
675 ret = -EFAULT;
676 break;
678 if (!ling.l_onoff)
679 sock_reset_flag(sk, SOCK_LINGER);
680 else {
681 #if (BITS_PER_LONG == 32)
682 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
683 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
684 else
685 #endif
686 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
687 sock_set_flag(sk, SOCK_LINGER);
689 break;
691 case SO_BSDCOMPAT:
692 sock_warn_obsolete_bsdism("setsockopt");
693 break;
695 case SO_PASSCRED:
696 if (valbool)
697 set_bit(SOCK_PASSCRED, &sock->flags);
698 else
699 clear_bit(SOCK_PASSCRED, &sock->flags);
700 break;
702 case SO_TIMESTAMP:
703 case SO_TIMESTAMPNS:
704 if (valbool) {
705 if (optname == SO_TIMESTAMP)
706 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
707 else
708 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
709 sock_set_flag(sk, SOCK_RCVTSTAMP);
710 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
711 } else {
712 sock_reset_flag(sk, SOCK_RCVTSTAMP);
713 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
715 break;
717 case SO_TIMESTAMPING:
718 if (val & ~SOF_TIMESTAMPING_MASK) {
719 ret = -EINVAL;
720 break;
722 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
723 val & SOF_TIMESTAMPING_TX_HARDWARE);
724 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
725 val & SOF_TIMESTAMPING_TX_SOFTWARE);
726 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
727 val & SOF_TIMESTAMPING_RX_HARDWARE);
728 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
729 sock_enable_timestamp(sk,
730 SOCK_TIMESTAMPING_RX_SOFTWARE);
731 else
732 sock_disable_timestamp(sk,
733 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
734 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
735 val & SOF_TIMESTAMPING_SOFTWARE);
736 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
737 val & SOF_TIMESTAMPING_SYS_HARDWARE);
738 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
739 val & SOF_TIMESTAMPING_RAW_HARDWARE);
740 break;
742 case SO_RCVLOWAT:
743 if (val < 0)
744 val = INT_MAX;
745 sk->sk_rcvlowat = val ? : 1;
746 break;
748 case SO_RCVTIMEO:
749 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
750 break;
752 case SO_SNDTIMEO:
753 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
754 break;
756 case SO_ATTACH_FILTER:
757 ret = -EINVAL;
758 if (optlen == sizeof(struct sock_fprog)) {
759 struct sock_fprog fprog;
761 ret = -EFAULT;
762 if (copy_from_user(&fprog, optval, sizeof(fprog)))
763 break;
765 ret = sk_attach_filter(&fprog, sk);
767 break;
769 case SO_DETACH_FILTER:
770 ret = sk_detach_filter(sk);
771 break;
773 case SO_PASSSEC:
774 if (valbool)
775 set_bit(SOCK_PASSSEC, &sock->flags);
776 else
777 clear_bit(SOCK_PASSSEC, &sock->flags);
778 break;
779 case SO_MARK:
780 if (!capable(CAP_NET_ADMIN))
781 ret = -EPERM;
782 else
783 sk->sk_mark = val;
784 break;
786 /* We implement the SO_SNDLOWAT etc to
787 not be settable (1003.1g 5.3) */
788 case SO_RXQ_OVFL:
789 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
790 break;
792 case SO_WIFI_STATUS:
793 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
794 break;
796 case SO_PEEK_OFF:
797 if (sock->ops->set_peek_off)
798 sock->ops->set_peek_off(sk, val);
799 else
800 ret = -EOPNOTSUPP;
801 break;
803 case SO_NOFCS:
804 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
805 break;
807 default:
808 ret = -ENOPROTOOPT;
809 break;
811 release_sock(sk);
812 return ret;
814 EXPORT_SYMBOL(sock_setsockopt);
817 void cred_to_ucred(struct pid *pid, const struct cred *cred,
818 struct ucred *ucred)
820 ucred->pid = pid_vnr(pid);
821 ucred->uid = ucred->gid = -1;
822 if (cred) {
823 struct user_namespace *current_ns = current_user_ns();
825 ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
826 ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
829 EXPORT_SYMBOL_GPL(cred_to_ucred);
831 int sock_getsockopt(struct socket *sock, int level, int optname,
832 char __user *optval, int __user *optlen)
834 struct sock *sk = sock->sk;
836 union {
837 int val;
838 struct linger ling;
839 struct timeval tm;
840 } v;
842 int lv = sizeof(int);
843 int len;
845 if (get_user(len, optlen))
846 return -EFAULT;
847 if (len < 0)
848 return -EINVAL;
850 memset(&v, 0, sizeof(v));
852 switch (optname) {
853 case SO_DEBUG:
854 v.val = sock_flag(sk, SOCK_DBG);
855 break;
857 case SO_DONTROUTE:
858 v.val = sock_flag(sk, SOCK_LOCALROUTE);
859 break;
861 case SO_BROADCAST:
862 v.val = !!sock_flag(sk, SOCK_BROADCAST);
863 break;
865 case SO_SNDBUF:
866 v.val = sk->sk_sndbuf;
867 break;
869 case SO_RCVBUF:
870 v.val = sk->sk_rcvbuf;
871 break;
873 case SO_REUSEADDR:
874 v.val = sk->sk_reuse;
875 break;
877 case SO_KEEPALIVE:
878 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
879 break;
881 case SO_TYPE:
882 v.val = sk->sk_type;
883 break;
885 case SO_PROTOCOL:
886 v.val = sk->sk_protocol;
887 break;
889 case SO_DOMAIN:
890 v.val = sk->sk_family;
891 break;
893 case SO_ERROR:
894 v.val = -sock_error(sk);
895 if (v.val == 0)
896 v.val = xchg(&sk->sk_err_soft, 0);
897 break;
899 case SO_OOBINLINE:
900 v.val = !!sock_flag(sk, SOCK_URGINLINE);
901 break;
903 case SO_NO_CHECK:
904 v.val = sk->sk_no_check;
905 break;
907 case SO_PRIORITY:
908 v.val = sk->sk_priority;
909 break;
911 case SO_LINGER:
912 lv = sizeof(v.ling);
913 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
914 v.ling.l_linger = sk->sk_lingertime / HZ;
915 break;
917 case SO_BSDCOMPAT:
918 sock_warn_obsolete_bsdism("getsockopt");
919 break;
921 case SO_TIMESTAMP:
922 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
923 !sock_flag(sk, SOCK_RCVTSTAMPNS);
924 break;
926 case SO_TIMESTAMPNS:
927 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
928 break;
930 case SO_TIMESTAMPING:
931 v.val = 0;
932 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
933 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
934 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
935 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
936 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
937 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
938 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
939 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
940 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
941 v.val |= SOF_TIMESTAMPING_SOFTWARE;
942 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
943 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
944 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
945 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
946 break;
948 case SO_RCVTIMEO:
949 lv = sizeof(struct timeval);
950 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
951 v.tm.tv_sec = 0;
952 v.tm.tv_usec = 0;
953 } else {
954 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
955 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
957 break;
959 case SO_SNDTIMEO:
960 lv = sizeof(struct timeval);
961 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
962 v.tm.tv_sec = 0;
963 v.tm.tv_usec = 0;
964 } else {
965 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
966 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
968 break;
970 case SO_RCVLOWAT:
971 v.val = sk->sk_rcvlowat;
972 break;
974 case SO_SNDLOWAT:
975 v.val = 1;
976 break;
978 case SO_PASSCRED:
979 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
980 break;
982 case SO_PEERCRED:
984 struct ucred peercred;
985 if (len > sizeof(peercred))
986 len = sizeof(peercred);
987 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
988 if (copy_to_user(optval, &peercred, len))
989 return -EFAULT;
990 goto lenout;
993 case SO_PEERNAME:
995 char address[128];
997 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
998 return -ENOTCONN;
999 if (lv < len)
1000 return -EINVAL;
1001 if (copy_to_user(optval, address, len))
1002 return -EFAULT;
1003 goto lenout;
1006 /* Dubious BSD thing... Probably nobody even uses it, but
1007 * the UNIX standard wants it for whatever reason... -DaveM
1009 case SO_ACCEPTCONN:
1010 v.val = sk->sk_state == TCP_LISTEN;
1011 break;
1013 case SO_PASSSEC:
1014 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
1015 break;
1017 case SO_PEERSEC:
1018 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1020 case SO_MARK:
1021 v.val = sk->sk_mark;
1022 break;
1024 case SO_RXQ_OVFL:
1025 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1026 break;
1028 case SO_WIFI_STATUS:
1029 v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1030 break;
1032 case SO_PEEK_OFF:
1033 if (!sock->ops->set_peek_off)
1034 return -EOPNOTSUPP;
1036 v.val = sk->sk_peek_off;
1037 break;
1038 case SO_NOFCS:
1039 v.val = !!sock_flag(sk, SOCK_NOFCS);
1040 break;
1041 default:
1042 return -ENOPROTOOPT;
1045 if (len > lv)
1046 len = lv;
1047 if (copy_to_user(optval, &v, len))
1048 return -EFAULT;
1049 lenout:
1050 if (put_user(len, optlen))
1051 return -EFAULT;
1052 return 0;
1056 * Initialize an sk_lock.
1058 * (We also register the sk_lock with the lock validator.)
1060 static inline void sock_lock_init(struct sock *sk)
1062 sock_lock_init_class_and_name(sk,
1063 af_family_slock_key_strings[sk->sk_family],
1064 af_family_slock_keys + sk->sk_family,
1065 af_family_key_strings[sk->sk_family],
1066 af_family_keys + sk->sk_family);
1070 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1071 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1072 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1074 static void sock_copy(struct sock *nsk, const struct sock *osk)
1076 #ifdef CONFIG_SECURITY_NETWORK
1077 void *sptr = nsk->sk_security;
1078 #endif
1079 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1081 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1082 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1084 #ifdef CONFIG_SECURITY_NETWORK
1085 nsk->sk_security = sptr;
1086 security_sk_clone(osk, nsk);
1087 #endif
1091 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1092 * un-modified. Special care is taken when initializing object to zero.
1094 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1096 if (offsetof(struct sock, sk_node.next) != 0)
1097 memset(sk, 0, offsetof(struct sock, sk_node.next));
1098 memset(&sk->sk_node.pprev, 0,
1099 size - offsetof(struct sock, sk_node.pprev));
1102 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1104 unsigned long nulls1, nulls2;
1106 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1107 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1108 if (nulls1 > nulls2)
1109 swap(nulls1, nulls2);
1111 if (nulls1 != 0)
1112 memset((char *)sk, 0, nulls1);
1113 memset((char *)sk + nulls1 + sizeof(void *), 0,
1114 nulls2 - nulls1 - sizeof(void *));
1115 memset((char *)sk + nulls2 + sizeof(void *), 0,
1116 size - nulls2 - sizeof(void *));
1118 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1120 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1121 int family)
1123 struct sock *sk;
1124 struct kmem_cache *slab;
1126 slab = prot->slab;
1127 if (slab != NULL) {
1128 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1129 if (!sk)
1130 return sk;
1131 if (priority & __GFP_ZERO) {
1132 if (prot->clear_sk)
1133 prot->clear_sk(sk, prot->obj_size);
1134 else
1135 sk_prot_clear_nulls(sk, prot->obj_size);
1137 } else
1138 sk = kmalloc(prot->obj_size, priority);
1140 if (sk != NULL) {
1141 kmemcheck_annotate_bitfield(sk, flags);
1143 if (security_sk_alloc(sk, family, priority))
1144 goto out_free;
1146 if (!try_module_get(prot->owner))
1147 goto out_free_sec;
1148 sk_tx_queue_clear(sk);
1151 return sk;
1153 out_free_sec:
1154 security_sk_free(sk);
1155 out_free:
1156 if (slab != NULL)
1157 kmem_cache_free(slab, sk);
1158 else
1159 kfree(sk);
1160 return NULL;
1163 static void sk_prot_free(struct proto *prot, struct sock *sk)
1165 struct kmem_cache *slab;
1166 struct module *owner;
1168 owner = prot->owner;
1169 slab = prot->slab;
1171 security_sk_free(sk);
1172 if (slab != NULL)
1173 kmem_cache_free(slab, sk);
1174 else
1175 kfree(sk);
1176 module_put(owner);
1179 #ifdef CONFIG_CGROUPS
1180 void sock_update_classid(struct sock *sk)
1182 u32 classid;
1184 rcu_read_lock(); /* doing current task, which cannot vanish. */
1185 classid = task_cls_classid(current);
1186 rcu_read_unlock();
1187 if (classid && classid != sk->sk_classid)
1188 sk->sk_classid = classid;
1190 EXPORT_SYMBOL(sock_update_classid);
1192 void sock_update_netprioidx(struct sock *sk)
1194 if (in_interrupt())
1195 return;
1197 sk->sk_cgrp_prioidx = task_netprioidx(current);
1199 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1200 #endif
1203 * sk_alloc - All socket objects are allocated here
1204 * @net: the applicable net namespace
1205 * @family: protocol family
1206 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1207 * @prot: struct proto associated with this new sock instance
1209 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1210 struct proto *prot)
1212 struct sock *sk;
1214 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1215 if (sk) {
1216 sk->sk_family = family;
1218 * See comment in struct sock definition to understand
1219 * why we need sk_prot_creator -acme
1221 sk->sk_prot = sk->sk_prot_creator = prot;
1222 sock_lock_init(sk);
1223 sock_net_set(sk, get_net(net));
1224 atomic_set(&sk->sk_wmem_alloc, 1);
1226 sock_update_classid(sk);
1227 sock_update_netprioidx(sk);
1230 return sk;
1232 EXPORT_SYMBOL(sk_alloc);
1234 static void __sk_free(struct sock *sk)
1236 struct sk_filter *filter;
1238 if (sk->sk_destruct)
1239 sk->sk_destruct(sk);
1241 filter = rcu_dereference_check(sk->sk_filter,
1242 atomic_read(&sk->sk_wmem_alloc) == 0);
1243 if (filter) {
1244 sk_filter_uncharge(sk, filter);
1245 RCU_INIT_POINTER(sk->sk_filter, NULL);
1248 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1250 if (atomic_read(&sk->sk_omem_alloc))
1251 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1252 __func__, atomic_read(&sk->sk_omem_alloc));
1254 if (sk->sk_peer_cred)
1255 put_cred(sk->sk_peer_cred);
1256 put_pid(sk->sk_peer_pid);
1257 put_net(sock_net(sk));
1258 sk_prot_free(sk->sk_prot_creator, sk);
1261 void sk_free(struct sock *sk)
1264 * We subtract one from sk_wmem_alloc and can know if
1265 * some packets are still in some tx queue.
1266 * If not null, sock_wfree() will call __sk_free(sk) later
1268 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1269 __sk_free(sk);
1271 EXPORT_SYMBOL(sk_free);
1274 * Last sock_put should drop reference to sk->sk_net. It has already
1275 * been dropped in sk_change_net. Taking reference to stopping namespace
1276 * is not an option.
1277 * Take reference to a socket to remove it from hash _alive_ and after that
1278 * destroy it in the context of init_net.
1280 void sk_release_kernel(struct sock *sk)
1282 if (sk == NULL || sk->sk_socket == NULL)
1283 return;
1285 sock_hold(sk);
1286 sock_release(sk->sk_socket);
1287 release_net(sock_net(sk));
1288 sock_net_set(sk, get_net(&init_net));
1289 sock_put(sk);
1291 EXPORT_SYMBOL(sk_release_kernel);
1293 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1295 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1296 sock_update_memcg(newsk);
1300 * sk_clone_lock - clone a socket, and lock its clone
1301 * @sk: the socket to clone
1302 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1304 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1306 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1308 struct sock *newsk;
1310 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1311 if (newsk != NULL) {
1312 struct sk_filter *filter;
1314 sock_copy(newsk, sk);
1316 /* SANITY */
1317 get_net(sock_net(newsk));
1318 sk_node_init(&newsk->sk_node);
1319 sock_lock_init(newsk);
1320 bh_lock_sock(newsk);
1321 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1322 newsk->sk_backlog.len = 0;
1324 atomic_set(&newsk->sk_rmem_alloc, 0);
1326 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1328 atomic_set(&newsk->sk_wmem_alloc, 1);
1329 atomic_set(&newsk->sk_omem_alloc, 0);
1330 skb_queue_head_init(&newsk->sk_receive_queue);
1331 skb_queue_head_init(&newsk->sk_write_queue);
1332 #ifdef CONFIG_NET_DMA
1333 skb_queue_head_init(&newsk->sk_async_wait_queue);
1334 #endif
1336 spin_lock_init(&newsk->sk_dst_lock);
1337 rwlock_init(&newsk->sk_callback_lock);
1338 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1339 af_callback_keys + newsk->sk_family,
1340 af_family_clock_key_strings[newsk->sk_family]);
1342 newsk->sk_dst_cache = NULL;
1343 newsk->sk_wmem_queued = 0;
1344 newsk->sk_forward_alloc = 0;
1345 newsk->sk_send_head = NULL;
1346 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1348 sock_reset_flag(newsk, SOCK_DONE);
1349 skb_queue_head_init(&newsk->sk_error_queue);
1351 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1352 if (filter != NULL)
1353 sk_filter_charge(newsk, filter);
1355 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1356 /* It is still raw copy of parent, so invalidate
1357 * destructor and make plain sk_free() */
1358 newsk->sk_destruct = NULL;
1359 bh_unlock_sock(newsk);
1360 sk_free(newsk);
1361 newsk = NULL;
1362 goto out;
1365 newsk->sk_err = 0;
1366 newsk->sk_priority = 0;
1368 * Before updating sk_refcnt, we must commit prior changes to memory
1369 * (Documentation/RCU/rculist_nulls.txt for details)
1371 smp_wmb();
1372 atomic_set(&newsk->sk_refcnt, 2);
1375 * Increment the counter in the same struct proto as the master
1376 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1377 * is the same as sk->sk_prot->socks, as this field was copied
1378 * with memcpy).
1380 * This _changes_ the previous behaviour, where
1381 * tcp_create_openreq_child always was incrementing the
1382 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1383 * to be taken into account in all callers. -acme
1385 sk_refcnt_debug_inc(newsk);
1386 sk_set_socket(newsk, NULL);
1387 newsk->sk_wq = NULL;
1389 sk_update_clone(sk, newsk);
1391 if (newsk->sk_prot->sockets_allocated)
1392 sk_sockets_allocated_inc(newsk);
1394 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1395 net_enable_timestamp();
1397 out:
1398 return newsk;
1400 EXPORT_SYMBOL_GPL(sk_clone_lock);
1402 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1404 __sk_dst_set(sk, dst);
1405 sk->sk_route_caps = dst->dev->features;
1406 if (sk->sk_route_caps & NETIF_F_GSO)
1407 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1408 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1409 if (sk_can_gso(sk)) {
1410 if (dst->header_len) {
1411 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1412 } else {
1413 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1414 sk->sk_gso_max_size = dst->dev->gso_max_size;
1415 sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1419 EXPORT_SYMBOL_GPL(sk_setup_caps);
1421 void __init sk_init(void)
1423 if (totalram_pages <= 4096) {
1424 sysctl_wmem_max = 32767;
1425 sysctl_rmem_max = 32767;
1426 sysctl_wmem_default = 32767;
1427 sysctl_rmem_default = 32767;
1428 } else if (totalram_pages >= 131072) {
1429 sysctl_wmem_max = 131071;
1430 sysctl_rmem_max = 131071;
1435 * Simple resource managers for sockets.
1440 * Write buffer destructor automatically called from kfree_skb.
1442 void sock_wfree(struct sk_buff *skb)
1444 struct sock *sk = skb->sk;
1445 unsigned int len = skb->truesize;
1447 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1449 * Keep a reference on sk_wmem_alloc, this will be released
1450 * after sk_write_space() call
1452 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1453 sk->sk_write_space(sk);
1454 len = 1;
1457 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1458 * could not do because of in-flight packets
1460 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1461 __sk_free(sk);
1463 EXPORT_SYMBOL(sock_wfree);
1466 * Read buffer destructor automatically called from kfree_skb.
1468 void sock_rfree(struct sk_buff *skb)
1470 struct sock *sk = skb->sk;
1471 unsigned int len = skb->truesize;
1473 atomic_sub(len, &sk->sk_rmem_alloc);
1474 sk_mem_uncharge(sk, len);
1476 EXPORT_SYMBOL(sock_rfree);
1479 int sock_i_uid(struct sock *sk)
1481 int uid;
1483 read_lock_bh(&sk->sk_callback_lock);
1484 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1485 read_unlock_bh(&sk->sk_callback_lock);
1486 return uid;
1488 EXPORT_SYMBOL(sock_i_uid);
1490 unsigned long sock_i_ino(struct sock *sk)
1492 unsigned long ino;
1494 read_lock_bh(&sk->sk_callback_lock);
1495 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1496 read_unlock_bh(&sk->sk_callback_lock);
1497 return ino;
1499 EXPORT_SYMBOL(sock_i_ino);
1502 * Allocate a skb from the socket's send buffer.
1504 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1505 gfp_t priority)
1507 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1508 struct sk_buff *skb = alloc_skb(size, priority);
1509 if (skb) {
1510 skb_set_owner_w(skb, sk);
1511 return skb;
1514 return NULL;
1516 EXPORT_SYMBOL(sock_wmalloc);
1519 * Allocate a skb from the socket's receive buffer.
1521 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1522 gfp_t priority)
1524 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1525 struct sk_buff *skb = alloc_skb(size, priority);
1526 if (skb) {
1527 skb_set_owner_r(skb, sk);
1528 return skb;
1531 return NULL;
1535 * Allocate a memory block from the socket's option memory buffer.
1537 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1539 if ((unsigned)size <= sysctl_optmem_max &&
1540 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1541 void *mem;
1542 /* First do the add, to avoid the race if kmalloc
1543 * might sleep.
1545 atomic_add(size, &sk->sk_omem_alloc);
1546 mem = kmalloc(size, priority);
1547 if (mem)
1548 return mem;
1549 atomic_sub(size, &sk->sk_omem_alloc);
1551 return NULL;
1553 EXPORT_SYMBOL(sock_kmalloc);
1556 * Free an option memory block.
1558 void sock_kfree_s(struct sock *sk, void *mem, int size)
1560 kfree(mem);
1561 atomic_sub(size, &sk->sk_omem_alloc);
1563 EXPORT_SYMBOL(sock_kfree_s);
1565 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1566 I think, these locks should be removed for datagram sockets.
1568 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1570 DEFINE_WAIT(wait);
1572 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1573 for (;;) {
1574 if (!timeo)
1575 break;
1576 if (signal_pending(current))
1577 break;
1578 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1579 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1580 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1581 break;
1582 if (sk->sk_shutdown & SEND_SHUTDOWN)
1583 break;
1584 if (sk->sk_err)
1585 break;
1586 timeo = schedule_timeout(timeo);
1588 finish_wait(sk_sleep(sk), &wait);
1589 return timeo;
1594 * Generic send/receive buffer handlers
1597 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1598 unsigned long data_len, int noblock,
1599 int *errcode)
1601 struct sk_buff *skb;
1602 gfp_t gfp_mask;
1603 long timeo;
1604 int err;
1605 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1607 err = -EMSGSIZE;
1608 if (npages > MAX_SKB_FRAGS)
1609 goto failure;
1611 gfp_mask = sk->sk_allocation;
1612 if (gfp_mask & __GFP_WAIT)
1613 gfp_mask |= __GFP_REPEAT;
1615 timeo = sock_sndtimeo(sk, noblock);
1616 while (1) {
1617 err = sock_error(sk);
1618 if (err != 0)
1619 goto failure;
1621 err = -EPIPE;
1622 if (sk->sk_shutdown & SEND_SHUTDOWN)
1623 goto failure;
1625 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1626 skb = alloc_skb(header_len, gfp_mask);
1627 if (skb) {
1628 int i;
1630 /* No pages, we're done... */
1631 if (!data_len)
1632 break;
1634 skb->truesize += data_len;
1635 skb_shinfo(skb)->nr_frags = npages;
1636 for (i = 0; i < npages; i++) {
1637 struct page *page;
1639 page = alloc_pages(sk->sk_allocation, 0);
1640 if (!page) {
1641 err = -ENOBUFS;
1642 skb_shinfo(skb)->nr_frags = i;
1643 kfree_skb(skb);
1644 goto failure;
1647 __skb_fill_page_desc(skb, i,
1648 page, 0,
1649 (data_len >= PAGE_SIZE ?
1650 PAGE_SIZE :
1651 data_len));
1652 data_len -= PAGE_SIZE;
1655 /* Full success... */
1656 break;
1658 err = -ENOBUFS;
1659 goto failure;
1661 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1662 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1663 err = -EAGAIN;
1664 if (!timeo)
1665 goto failure;
1666 if (signal_pending(current))
1667 goto interrupted;
1668 timeo = sock_wait_for_wmem(sk, timeo);
1671 skb_set_owner_w(skb, sk);
1672 return skb;
1674 interrupted:
1675 err = sock_intr_errno(timeo);
1676 failure:
1677 *errcode = err;
1678 return NULL;
1680 EXPORT_SYMBOL(sock_alloc_send_pskb);
1682 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1683 int noblock, int *errcode)
1685 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1687 EXPORT_SYMBOL(sock_alloc_send_skb);
1689 static void __lock_sock(struct sock *sk)
1690 __releases(&sk->sk_lock.slock)
1691 __acquires(&sk->sk_lock.slock)
1693 DEFINE_WAIT(wait);
1695 for (;;) {
1696 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1697 TASK_UNINTERRUPTIBLE);
1698 spin_unlock_bh(&sk->sk_lock.slock);
1699 schedule();
1700 spin_lock_bh(&sk->sk_lock.slock);
1701 if (!sock_owned_by_user(sk))
1702 break;
1704 finish_wait(&sk->sk_lock.wq, &wait);
1707 static void __release_sock(struct sock *sk)
1708 __releases(&sk->sk_lock.slock)
1709 __acquires(&sk->sk_lock.slock)
1711 struct sk_buff *skb = sk->sk_backlog.head;
1713 do {
1714 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1715 bh_unlock_sock(sk);
1717 do {
1718 struct sk_buff *next = skb->next;
1720 WARN_ON_ONCE(skb_dst_is_noref(skb));
1721 skb->next = NULL;
1722 sk_backlog_rcv(sk, skb);
1725 * We are in process context here with softirqs
1726 * disabled, use cond_resched_softirq() to preempt.
1727 * This is safe to do because we've taken the backlog
1728 * queue private:
1730 cond_resched_softirq();
1732 skb = next;
1733 } while (skb != NULL);
1735 bh_lock_sock(sk);
1736 } while ((skb = sk->sk_backlog.head) != NULL);
1739 * Doing the zeroing here guarantee we can not loop forever
1740 * while a wild producer attempts to flood us.
1742 sk->sk_backlog.len = 0;
1746 * sk_wait_data - wait for data to arrive at sk_receive_queue
1747 * @sk: sock to wait on
1748 * @timeo: for how long
1750 * Now socket state including sk->sk_err is changed only under lock,
1751 * hence we may omit checks after joining wait queue.
1752 * We check receive queue before schedule() only as optimization;
1753 * it is very likely that release_sock() added new data.
1755 int sk_wait_data(struct sock *sk, long *timeo)
1757 int rc;
1758 DEFINE_WAIT(wait);
1760 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1761 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1762 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1763 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1764 finish_wait(sk_sleep(sk), &wait);
1765 return rc;
1767 EXPORT_SYMBOL(sk_wait_data);
1770 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1771 * @sk: socket
1772 * @size: memory size to allocate
1773 * @kind: allocation type
1775 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1776 * rmem allocation. This function assumes that protocols which have
1777 * memory_pressure use sk_wmem_queued as write buffer accounting.
1779 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1781 struct proto *prot = sk->sk_prot;
1782 int amt = sk_mem_pages(size);
1783 long allocated;
1784 int parent_status = UNDER_LIMIT;
1786 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1788 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1790 /* Under limit. */
1791 if (parent_status == UNDER_LIMIT &&
1792 allocated <= sk_prot_mem_limits(sk, 0)) {
1793 sk_leave_memory_pressure(sk);
1794 return 1;
1797 /* Under pressure. (we or our parents) */
1798 if ((parent_status > SOFT_LIMIT) ||
1799 allocated > sk_prot_mem_limits(sk, 1))
1800 sk_enter_memory_pressure(sk);
1802 /* Over hard limit (we or our parents) */
1803 if ((parent_status == OVER_LIMIT) ||
1804 (allocated > sk_prot_mem_limits(sk, 2)))
1805 goto suppress_allocation;
1807 /* guarantee minimum buffer size under pressure */
1808 if (kind == SK_MEM_RECV) {
1809 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1810 return 1;
1812 } else { /* SK_MEM_SEND */
1813 if (sk->sk_type == SOCK_STREAM) {
1814 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1815 return 1;
1816 } else if (atomic_read(&sk->sk_wmem_alloc) <
1817 prot->sysctl_wmem[0])
1818 return 1;
1821 if (sk_has_memory_pressure(sk)) {
1822 int alloc;
1824 if (!sk_under_memory_pressure(sk))
1825 return 1;
1826 alloc = sk_sockets_allocated_read_positive(sk);
1827 if (sk_prot_mem_limits(sk, 2) > alloc *
1828 sk_mem_pages(sk->sk_wmem_queued +
1829 atomic_read(&sk->sk_rmem_alloc) +
1830 sk->sk_forward_alloc))
1831 return 1;
1834 suppress_allocation:
1836 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1837 sk_stream_moderate_sndbuf(sk);
1839 /* Fail only if socket is _under_ its sndbuf.
1840 * In this case we cannot block, so that we have to fail.
1842 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1843 return 1;
1846 trace_sock_exceed_buf_limit(sk, prot, allocated);
1848 /* Alas. Undo changes. */
1849 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1851 sk_memory_allocated_sub(sk, amt);
1853 return 0;
1855 EXPORT_SYMBOL(__sk_mem_schedule);
1858  * __sk_mem_reclaim - reclaim memory_allocated
1859 * @sk: socket
1861 void __sk_mem_reclaim(struct sock *sk)
1863 sk_memory_allocated_sub(sk,
1864 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1865 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1867 if (sk_under_memory_pressure(sk) &&
1868 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1869 sk_leave_memory_pressure(sk);
1871 EXPORT_SYMBOL(__sk_mem_reclaim);
1875 * Set of default routines for initialising struct proto_ops when
1876 * the protocol does not support a particular function. In certain
1877 * cases where it makes no sense for a protocol to have a "do nothing"
1878 * function, some default processing is provided.
1881 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1883 return -EOPNOTSUPP;
1885 EXPORT_SYMBOL(sock_no_bind);
1887 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1888 int len, int flags)
1890 return -EOPNOTSUPP;
1892 EXPORT_SYMBOL(sock_no_connect);
1894 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1896 return -EOPNOTSUPP;
1898 EXPORT_SYMBOL(sock_no_socketpair);
1900 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1902 return -EOPNOTSUPP;
1904 EXPORT_SYMBOL(sock_no_accept);
1906 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1907 int *len, int peer)
1909 return -EOPNOTSUPP;
1911 EXPORT_SYMBOL(sock_no_getname);
1913 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1915 return 0;
1917 EXPORT_SYMBOL(sock_no_poll);
1919 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1921 return -EOPNOTSUPP;
1923 EXPORT_SYMBOL(sock_no_ioctl);
1925 int sock_no_listen(struct socket *sock, int backlog)
1927 return -EOPNOTSUPP;
1929 EXPORT_SYMBOL(sock_no_listen);
1931 int sock_no_shutdown(struct socket *sock, int how)
1933 return -EOPNOTSUPP;
1935 EXPORT_SYMBOL(sock_no_shutdown);
1937 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1938 char __user *optval, unsigned int optlen)
1940 return -EOPNOTSUPP;
1942 EXPORT_SYMBOL(sock_no_setsockopt);
1944 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1945 char __user *optval, int __user *optlen)
1947 return -EOPNOTSUPP;
1949 EXPORT_SYMBOL(sock_no_getsockopt);
1951 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1952 size_t len)
1954 return -EOPNOTSUPP;
1956 EXPORT_SYMBOL(sock_no_sendmsg);
1958 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1959 size_t len, int flags)
1961 return -EOPNOTSUPP;
1963 EXPORT_SYMBOL(sock_no_recvmsg);
1965 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1967 /* Mirror missing mmap method error code */
1968 return -ENODEV;
1970 EXPORT_SYMBOL(sock_no_mmap);
1972 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1974 ssize_t res;
1975 struct msghdr msg = {.msg_flags = flags};
1976 struct kvec iov;
1977 char *kaddr = kmap(page);
1978 iov.iov_base = kaddr + offset;
1979 iov.iov_len = size;
1980 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1981 kunmap(page);
1982 return res;
1984 EXPORT_SYMBOL(sock_no_sendpage);
1987 * Default Socket Callbacks
1990 static void sock_def_wakeup(struct sock *sk)
1992 struct socket_wq *wq;
1994 rcu_read_lock();
1995 wq = rcu_dereference(sk->sk_wq);
1996 if (wq_has_sleeper(wq))
1997 wake_up_interruptible_all(&wq->wait);
1998 rcu_read_unlock();
2001 static void sock_def_error_report(struct sock *sk)
2003 struct socket_wq *wq;
2005 rcu_read_lock();
2006 wq = rcu_dereference(sk->sk_wq);
2007 if (wq_has_sleeper(wq))
2008 wake_up_interruptible_poll(&wq->wait, POLLERR);
2009 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2010 rcu_read_unlock();
2013 static void sock_def_readable(struct sock *sk, int len)
2015 struct socket_wq *wq;
2017 rcu_read_lock();
2018 wq = rcu_dereference(sk->sk_wq);
2019 if (wq_has_sleeper(wq))
2020 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2021 POLLRDNORM | POLLRDBAND);
2022 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2023 rcu_read_unlock();
2026 static void sock_def_write_space(struct sock *sk)
2028 struct socket_wq *wq;
2030 rcu_read_lock();
2032 /* Do not wake up a writer until he can make "significant"
2033 * progress. --DaveM
2035 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2036 wq = rcu_dereference(sk->sk_wq);
2037 if (wq_has_sleeper(wq))
2038 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2039 POLLWRNORM | POLLWRBAND);
2041 /* Should agree with poll, otherwise some programs break */
2042 if (sock_writeable(sk))
2043 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2046 rcu_read_unlock();
2049 static void sock_def_destruct(struct sock *sk)
2051 kfree(sk->sk_protinfo);
2054 void sk_send_sigurg(struct sock *sk)
2056 if (sk->sk_socket && sk->sk_socket->file)
2057 if (send_sigurg(&sk->sk_socket->file->f_owner))
2058 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2060 EXPORT_SYMBOL(sk_send_sigurg);
2062 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2063 unsigned long expires)
2065 if (!mod_timer(timer, expires))
2066 sock_hold(sk);
2068 EXPORT_SYMBOL(sk_reset_timer);
2070 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2072 if (timer_pending(timer) && del_timer(timer))
2073 __sock_put(sk);
2075 EXPORT_SYMBOL(sk_stop_timer);
2077 void sock_init_data(struct socket *sock, struct sock *sk)
2079 skb_queue_head_init(&sk->sk_receive_queue);
2080 skb_queue_head_init(&sk->sk_write_queue);
2081 skb_queue_head_init(&sk->sk_error_queue);
2082 #ifdef CONFIG_NET_DMA
2083 skb_queue_head_init(&sk->sk_async_wait_queue);
2084 #endif
2086 sk->sk_send_head = NULL;
2088 init_timer(&sk->sk_timer);
2090 sk->sk_allocation = GFP_KERNEL;
2091 sk->sk_rcvbuf = sysctl_rmem_default;
2092 sk->sk_sndbuf = sysctl_wmem_default;
2093 sk->sk_state = TCP_CLOSE;
2094 sk_set_socket(sk, sock);
2096 sock_set_flag(sk, SOCK_ZAPPED);
2098 if (sock) {
2099 sk->sk_type = sock->type;
2100 sk->sk_wq = sock->wq;
2101 sock->sk = sk;
2102 } else
2103 sk->sk_wq = NULL;
2105 spin_lock_init(&sk->sk_dst_lock);
2106 rwlock_init(&sk->sk_callback_lock);
2107 lockdep_set_class_and_name(&sk->sk_callback_lock,
2108 af_callback_keys + sk->sk_family,
2109 af_family_clock_key_strings[sk->sk_family]);
2111 sk->sk_state_change = sock_def_wakeup;
2112 sk->sk_data_ready = sock_def_readable;
2113 sk->sk_write_space = sock_def_write_space;
2114 sk->sk_error_report = sock_def_error_report;
2115 sk->sk_destruct = sock_def_destruct;
2117 sk->sk_sndmsg_page = NULL;
2118 sk->sk_sndmsg_off = 0;
2119 sk->sk_peek_off = -1;
2121 sk->sk_peer_pid = NULL;
2122 sk->sk_peer_cred = NULL;
2123 sk->sk_write_pending = 0;
2124 sk->sk_rcvlowat = 1;
2125 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2126 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2128 sk->sk_stamp = ktime_set(-1L, 0);
2131 * Before updating sk_refcnt, we must commit prior changes to memory
2132 * (Documentation/RCU/rculist_nulls.txt for details)
2134 smp_wmb();
2135 atomic_set(&sk->sk_refcnt, 1);
2136 atomic_set(&sk->sk_drops, 0);
2138 EXPORT_SYMBOL(sock_init_data);
2140 void lock_sock_nested(struct sock *sk, int subclass)
2142 might_sleep();
2143 spin_lock_bh(&sk->sk_lock.slock);
2144 if (sk->sk_lock.owned)
2145 __lock_sock(sk);
2146 sk->sk_lock.owned = 1;
2147 spin_unlock(&sk->sk_lock.slock);
2149 * The sk_lock has mutex_lock() semantics here:
2151 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2152 local_bh_enable();
2154 EXPORT_SYMBOL(lock_sock_nested);
2156 void release_sock(struct sock *sk)
2158 	/*
2159 	 * The sk_lock has mutex_unlock() semantics:
2160 	 */
2161 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2163 spin_lock_bh(&sk->sk_lock.slock);
2164 if (sk->sk_backlog.tail)
2165 __release_sock(sk);
2166 sk->sk_lock.owned = 0;
2167 if (waitqueue_active(&sk->sk_lock.wq))
2168 wake_up(&sk->sk_lock.wq);
2169 spin_unlock_bh(&sk->sk_lock.slock);
2171 EXPORT_SYMBOL(release_sock);
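/*
 * Illustrative sketch (not part of sock.c): the usual process-context
 * pattern around lock_sock()/release_sock(). While the lock is owned,
 * softirq input is diverted to the backlog and is drained by
 * __release_sock() when the owner lets go. The function name and the
 * particular state check are hypothetical.
 */
static int example_locked_update(struct sock *sk, int val)
{
	int err = 0;

	lock_sock(sk);
	if (sk->sk_state == TCP_CLOSE)	/* generic state check, example only */
		err = -ENOTCONN;
	else
		sk->sk_rcvlowat = val ? : 1;
	release_sock(sk);
	return err;
}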
2173 /**
2174  * lock_sock_fast - fast version of lock_sock
2175  * @sk: socket
2176  *
2177  * This version should be used for very small sections, where the process won't block.
2178  * Returns false if the fast path is taken:
2179  *   sk_lock.slock locked, owned = 0, BH disabled
2180  * Returns true if the slow path is taken:
2181  *   sk_lock.slock unlocked, owned = 1, BH enabled
2182  */
2183 bool lock_sock_fast(struct sock *sk)
2185 might_sleep();
2186 spin_lock_bh(&sk->sk_lock.slock);
2188 if (!sk->sk_lock.owned)
2189 		/*
2190 		 * Note : We must disable BH
2191 		 */
2192 return false;
2194 __lock_sock(sk);
2195 sk->sk_lock.owned = 1;
2196 spin_unlock(&sk->sk_lock.slock);
2197 	/*
2198 	 * The sk_lock has mutex_lock() semantics here:
2199 	 */
2200 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2201 local_bh_enable();
2202 return true;
2204 EXPORT_SYMBOL(lock_sock_fast);
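/*
 * Illustrative sketch (not part of sock.c): lock_sock_fast() pairs with
 * unlock_sock_fast(), which must be told whether the slow path was taken
 * so it either releases the full socket lock or just the spinlock
 * (compare skb_free_datagram_locked()). The function name is hypothetical.
 */
static void example_short_locked_section(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* Very small critical section: just reclaim accounted memory. */
	sk_mem_reclaim_partial(sk);
	unlock_sock_fast(sk, slow);
}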
2206 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2208 struct timeval tv;
2209 if (!sock_flag(sk, SOCK_TIMESTAMP))
2210 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2211 tv = ktime_to_timeval(sk->sk_stamp);
2212 if (tv.tv_sec == -1)
2213 return -ENOENT;
2214 if (tv.tv_sec == 0) {
2215 sk->sk_stamp = ktime_get_real();
2216 tv = ktime_to_timeval(sk->sk_stamp);
2218 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2220 EXPORT_SYMBOL(sock_get_timestamp);
2222 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2224 struct timespec ts;
2225 if (!sock_flag(sk, SOCK_TIMESTAMP))
2226 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2227 ts = ktime_to_timespec(sk->sk_stamp);
2228 if (ts.tv_sec == -1)
2229 return -ENOENT;
2230 if (ts.tv_sec == 0) {
2231 sk->sk_stamp = ktime_get_real();
2232 ts = ktime_to_timespec(sk->sk_stamp);
2234 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2236 EXPORT_SYMBOL(sock_get_timestampns);
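/*
 * Illustrative sketch (not part of sock.c): these helpers back the
 * SIOCGSTAMP/SIOCGSTAMPNS ioctls; a family's ioctl handler dispatches to
 * them roughly like inet_ioctl() does. The function name is hypothetical.
 */
static int example_ioctl(struct socket *sock, unsigned int cmd,
			 unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}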
2238 void sock_enable_timestamp(struct sock *sk, int flag)
2240 if (!sock_flag(sk, flag)) {
2241 unsigned long previous_flags = sk->sk_flags;
2243 sock_set_flag(sk, flag);
2244 		/*
2245 		 * we just set one of the two flags which require net
2246 		 * time stamping, but time stamping might have been on
2247 		 * already because of the other one
2248 		 */
2249 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2250 net_enable_timestamp();
2254 /*
2255  * Get a socket option on a socket.
2256  *
2257  * FIX: POSIX 1003.1g is very ambiguous here. It states that
2258  * asynchronous errors should be reported by getsockopt. We assume
2259  * this means if you specify SO_ERROR (otherwise what's the point of it).
2260  */
2261 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2262 char __user *optval, int __user *optlen)
2264 struct sock *sk = sock->sk;
2266 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2268 EXPORT_SYMBOL(sock_common_getsockopt);
2270 #ifdef CONFIG_COMPAT
2271 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2272 char __user *optval, int __user *optlen)
2274 struct sock *sk = sock->sk;
2276 if (sk->sk_prot->compat_getsockopt != NULL)
2277 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2278 optval, optlen);
2279 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2281 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2282 #endif
2284 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2285 struct msghdr *msg, size_t size, int flags)
2287 struct sock *sk = sock->sk;
2288 int addr_len = 0;
2289 int err;
2291 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2292 flags & ~MSG_DONTWAIT, &addr_len);
2293 if (err >= 0)
2294 msg->msg_namelen = addr_len;
2295 return err;
2297 EXPORT_SYMBOL(sock_common_recvmsg);
2299 /*
2300  * Set socket options on an inet socket.
2301  */
2302 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2303 char __user *optval, unsigned int optlen)
2305 struct sock *sk = sock->sk;
2307 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2309 EXPORT_SYMBOL(sock_common_setsockopt);
2311 #ifdef CONFIG_COMPAT
2312 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2313 char __user *optval, unsigned int optlen)
2315 struct sock *sk = sock->sk;
2317 if (sk->sk_prot->compat_setsockopt != NULL)
2318 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2319 optval, optlen);
2320 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2322 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2323 #endif
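/*
 * Illustrative sketch (not part of sock.c): protocol families that simply
 * forward socket options and recvmsg to their struct proto point the
 * corresponding proto_ops members at these generic helpers (compare
 * inet_dgram_ops). The family value and the omitted members are
 * hypothetical/family-specific.
 */
static const struct proto_ops example_dgram_ops = {
	.family		= PF_UNSPEC,		/* hypothetical */
	.owner		= THIS_MODULE,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.recvmsg	= sock_common_recvmsg,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
	/* .bind, .connect, .sendmsg, ... remain family specific */
};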
2325 void sk_common_release(struct sock *sk)
2327 if (sk->sk_prot->destroy)
2328 sk->sk_prot->destroy(sk);
2330 	/*
2331 	 * Observation: when sk_common_release() is called, processes have
2332 	 * no access to the socket, but the network stack still does.
2333 	 * Step one, detach it from networking:
2334 	 *
2335 	 * A. Remove from hash tables.
2336 	 */
2338 sk->sk_prot->unhash(sk);
2340 	/*
2341 	 * At this point the socket cannot receive new packets, but it is
2342 	 * possible that some packets are still in flight because some CPU
2343 	 * ran the receiver and did the hash table lookup before we unhashed
2344 	 * the socket. They will reach the receive queue and be purged by the
2345 	 * socket destructor.
2346 	 *
2347 	 * Also we may still have packets pending on the receive queue and,
2348 	 * probably, our own packets waiting in device queues. sock_destroy
2349 	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */
2352 sock_orphan(sk);
2354 xfrm_sk_free_policy(sk);
2356 sk_refcnt_debug_release(sk);
2357 sock_put(sk);
2359 EXPORT_SYMBOL(sk_common_release);
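/*
 * Illustrative sketch (not part of sock.c): a connectionless protocol's
 * ->close typically does only its private teardown and then lets
 * sk_common_release() perform the generic destroy/unhash/orphan/put
 * sequence above (compare udp_lib_close() and raw_close()). The function
 * name is hypothetical.
 */
static void example_close(struct sock *sk, long timeout)
{
	/* protocol-private teardown would go here */
	sk_common_release(sk);
}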
2361 #ifdef CONFIG_PROC_FS
2362 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2363 struct prot_inuse {
2364 int val[PROTO_INUSE_NR];
2367 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2369 #ifdef CONFIG_NET_NS
2370 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2372 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2374 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2376 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2378 int cpu, idx = prot->inuse_idx;
2379 int res = 0;
2381 for_each_possible_cpu(cpu)
2382 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2384 return res >= 0 ? res : 0;
2386 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2388 static int __net_init sock_inuse_init_net(struct net *net)
2390 net->core.inuse = alloc_percpu(struct prot_inuse);
2391 return net->core.inuse ? 0 : -ENOMEM;
2394 static void __net_exit sock_inuse_exit_net(struct net *net)
2396 free_percpu(net->core.inuse);
2399 static struct pernet_operations net_inuse_ops = {
2400 .init = sock_inuse_init_net,
2401 .exit = sock_inuse_exit_net,
2404 static __init int net_inuse_init(void)
2406 if (register_pernet_subsys(&net_inuse_ops))
2407 panic("Cannot initialize net inuse counters");
2409 return 0;
2412 core_initcall(net_inuse_init);
2413 #else
2414 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2416 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2418 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2420 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2422 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2424 int cpu, idx = prot->inuse_idx;
2425 int res = 0;
2427 for_each_possible_cpu(cpu)
2428 res += per_cpu(prot_inuse, cpu).val[idx];
2430 return res >= 0 ? res : 0;
2432 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2433 #endif
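/*
 * Illustrative sketch (not part of sock.c): protocols feed these per-cpu
 * counters from their hash/unhash callbacks, which is what makes the
 * "sockets" column of /proc/net/protocols meaningful. The example_* names
 * are hypothetical.
 */
static void example_hash(struct sock *sk)
{
	/* ... link sk into the protocol's lookup structure here ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... unlink it here ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}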
2435 static void assign_proto_idx(struct proto *prot)
2437 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2439 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2440 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2441 return;
2444 set_bit(prot->inuse_idx, proto_inuse_idx);
2447 static void release_proto_idx(struct proto *prot)
2449 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2450 clear_bit(prot->inuse_idx, proto_inuse_idx);
2452 #else
2453 static inline void assign_proto_idx(struct proto *prot)
2457 static inline void release_proto_idx(struct proto *prot)
2460 #endif
2462 int proto_register(struct proto *prot, int alloc_slab)
2464 if (alloc_slab) {
2465 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2466 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2467 NULL);
2469 if (prot->slab == NULL) {
2470 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2471 prot->name);
2472 goto out;
2475 if (prot->rsk_prot != NULL) {
2476 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2477 if (prot->rsk_prot->slab_name == NULL)
2478 goto out_free_sock_slab;
2480 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2481 prot->rsk_prot->obj_size, 0,
2482 SLAB_HWCACHE_ALIGN, NULL);
2484 if (prot->rsk_prot->slab == NULL) {
2485 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2486 prot->name);
2487 goto out_free_request_sock_slab_name;
2491 if (prot->twsk_prot != NULL) {
2492 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2494 if (prot->twsk_prot->twsk_slab_name == NULL)
2495 goto out_free_request_sock_slab;
2497 prot->twsk_prot->twsk_slab =
2498 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2499 prot->twsk_prot->twsk_obj_size,
2501 SLAB_HWCACHE_ALIGN |
2502 prot->slab_flags,
2503 NULL);
2504 if (prot->twsk_prot->twsk_slab == NULL)
2505 goto out_free_timewait_sock_slab_name;
2509 mutex_lock(&proto_list_mutex);
2510 list_add(&prot->node, &proto_list);
2511 assign_proto_idx(prot);
2512 mutex_unlock(&proto_list_mutex);
2513 return 0;
2515 out_free_timewait_sock_slab_name:
2516 kfree(prot->twsk_prot->twsk_slab_name);
2517 out_free_request_sock_slab:
2518 if (prot->rsk_prot && prot->rsk_prot->slab) {
2519 kmem_cache_destroy(prot->rsk_prot->slab);
2520 prot->rsk_prot->slab = NULL;
2522 out_free_request_sock_slab_name:
2523 if (prot->rsk_prot)
2524 kfree(prot->rsk_prot->slab_name);
2525 out_free_sock_slab:
2526 kmem_cache_destroy(prot->slab);
2527 prot->slab = NULL;
2528 out:
2529 return -ENOBUFS;
2531 EXPORT_SYMBOL(proto_register);
2533 void proto_unregister(struct proto *prot)
2535 mutex_lock(&proto_list_mutex);
2536 release_proto_idx(prot);
2537 list_del(&prot->node);
2538 mutex_unlock(&proto_list_mutex);
2540 if (prot->slab != NULL) {
2541 kmem_cache_destroy(prot->slab);
2542 prot->slab = NULL;
2545 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2546 kmem_cache_destroy(prot->rsk_prot->slab);
2547 kfree(prot->rsk_prot->slab_name);
2548 prot->rsk_prot->slab = NULL;
2551 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2552 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2553 kfree(prot->twsk_prot->twsk_slab_name);
2554 prot->twsk_prot->twsk_slab = NULL;
2557 EXPORT_SYMBOL(proto_unregister);
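/*
 * Illustrative sketch (not part of sock.c): a protocol module registers
 * its struct proto at init time, which creates the sock slab cache above
 * and adds an entry to /proc/net/protocols, and unregisters it on exit.
 * The example_* names are hypothetical.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);	/* 1: allocate a slab */
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}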
2559 #ifdef CONFIG_PROC_FS
2560 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2561 __acquires(proto_list_mutex)
2563 mutex_lock(&proto_list_mutex);
2564 return seq_list_start_head(&proto_list, *pos);
2567 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2569 return seq_list_next(v, &proto_list, pos);
2572 static void proto_seq_stop(struct seq_file *seq, void *v)
2573 __releases(proto_list_mutex)
2575 mutex_unlock(&proto_list_mutex);
2578 static char proto_method_implemented(const void *method)
2580 return method == NULL ? 'n' : 'y';
2582 static long sock_prot_memory_allocated(struct proto *proto)
2584 return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2587 static char *sock_prot_memory_pressure(struct proto *proto)
2589 return proto->memory_pressure != NULL ?
2590 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2593 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2596 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2597 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2598 proto->name,
2599 proto->obj_size,
2600 sock_prot_inuse_get(seq_file_net(seq), proto),
2601 sock_prot_memory_allocated(proto),
2602 sock_prot_memory_pressure(proto),
2603 proto->max_header,
2604 proto->slab == NULL ? "no" : "yes",
2605 module_name(proto->owner),
2606 proto_method_implemented(proto->close),
2607 proto_method_implemented(proto->connect),
2608 proto_method_implemented(proto->disconnect),
2609 proto_method_implemented(proto->accept),
2610 proto_method_implemented(proto->ioctl),
2611 proto_method_implemented(proto->init),
2612 proto_method_implemented(proto->destroy),
2613 proto_method_implemented(proto->shutdown),
2614 proto_method_implemented(proto->setsockopt),
2615 proto_method_implemented(proto->getsockopt),
2616 proto_method_implemented(proto->sendmsg),
2617 proto_method_implemented(proto->recvmsg),
2618 proto_method_implemented(proto->sendpage),
2619 proto_method_implemented(proto->bind),
2620 proto_method_implemented(proto->backlog_rcv),
2621 proto_method_implemented(proto->hash),
2622 proto_method_implemented(proto->unhash),
2623 proto_method_implemented(proto->get_port),
2624 proto_method_implemented(proto->enter_memory_pressure));
2627 static int proto_seq_show(struct seq_file *seq, void *v)
2629 if (v == &proto_list)
2630 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2631 "protocol",
2632 "size",
2633 "sockets",
2634 "memory",
2635 "press",
2636 "maxhdr",
2637 "slab",
2638 "module",
2639 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2640 else
2641 proto_seq_printf(seq, list_entry(v, struct proto, node));
2642 return 0;
2645 static const struct seq_operations proto_seq_ops = {
2646 .start = proto_seq_start,
2647 .next = proto_seq_next,
2648 .stop = proto_seq_stop,
2649 .show = proto_seq_show,
2652 static int proto_seq_open(struct inode *inode, struct file *file)
2654 return seq_open_net(inode, file, &proto_seq_ops,
2655 sizeof(struct seq_net_private));
2658 static const struct file_operations proto_seq_fops = {
2659 .owner = THIS_MODULE,
2660 .open = proto_seq_open,
2661 .read = seq_read,
2662 .llseek = seq_lseek,
2663 .release = seq_release_net,
2666 static __net_init int proto_init_net(struct net *net)
2668 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2669 return -ENOMEM;
2671 return 0;
2674 static __net_exit void proto_exit_net(struct net *net)
2676 proc_net_remove(net, "protocols");
2680 static __net_initdata struct pernet_operations proto_net_ops = {
2681 .init = proto_init_net,
2682 .exit = proto_exit_net,
2685 static int __init proto_init(void)
2687 return register_pernet_subsys(&proto_net_ops);
2690 subsys_initcall(proto_init);
2692 #endif /* PROC_FS */
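/*
 * Illustrative sketch (not part of sock.c): the seq_file registered above
 * appears as /proc/net/protocols; from userspace it can simply be read as
 * text, one header line followed by one line per registered proto.
 */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/protocols", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}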