net/core/sock.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
127 #include <linux/filter.h>
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
145 static const char *const af_family_key_strings[AF_MAX+1] = {
146 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
147 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
148 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
149 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
153 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
156 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
157 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
158 "sk_lock-AF_IEEE802154",
159 "sk_lock-AF_MAX"
161 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
163 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
164 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
165 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
166 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
167 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
168 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
169 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
170 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
171 "slock-27" , "slock-28" , "slock-AF_CAN" ,
172 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
173 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
174 "slock-AF_IEEE802154",
175 "slock-AF_MAX"
177 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
179 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
180 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
181 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
182 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
183 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
184 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
185 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
186 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
187 "clock-27" , "clock-28" , "clock-AF_CAN" ,
188 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
189 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
190 "clock-AF_IEEE802154",
191 "clock-AF_MAX"
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
198 static struct lock_class_key af_callback_keys[AF_MAX];
200 /* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms. This makes socket queueing behavior and performance
203 * not depend upon such differences.
205 #define _SK_MEM_PACKETS 256
206 #define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
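/*
 * Illustrative sketch, not part of the original file: assuming, purely for
 * the arithmetic, a build where sizeof(struct sk_buff) is 256 bytes, the
 * macros above work out to
 *
 *	_SK_MEM_OVERHEAD = 256 + 256 = 512 bytes per queued packet
 *	SK_WMEM_MAX = SK_RMEM_MAX = 512 * 256 = 131072 bytes
 *
 * i.e. the default buffer limits hold roughly _SK_MEM_PACKETS (256) packets
 * including their sk_buff metadata, regardless of how large struct sk_buff
 * actually is on the platform.
 */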
210 /* Run time adjustable parameters. */
211 #ifdef CONFIG_WIMAX
212 __u32 sysctl_wmem_max __read_mostly = 512*1024;
213 __u32 sysctl_rmem_max __read_mostly = 512*1024;
214 #else
215 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
216 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
217 #endif
219 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
220 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
222 /* Maximal space eaten by iovec or ancillary data plus some space */
223 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
224 EXPORT_SYMBOL(sysctl_optmem_max);
226 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
228 struct timeval tv;
230 if (optlen < sizeof(tv))
231 return -EINVAL;
232 if (copy_from_user(&tv, optval, sizeof(tv)))
233 return -EFAULT;
234 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
235 return -EDOM;
237 if (tv.tv_sec < 0) {
238 static int warned __read_mostly;
240 *timeo_p = 0;
241 if (warned < 10 && net_ratelimit()) {
242 warned++;
243 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
244 "tries to set negative timeout\n",
245 current->comm, task_pid_nr(current));
247 return 0;
249 *timeo_p = MAX_SCHEDULE_TIMEOUT;
250 if (tv.tv_sec == 0 && tv.tv_usec == 0)
251 return 0;
252 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
253 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
254 return 0;
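/*
 * Illustrative sketch, not part of the original file: with the conversion
 * above, fractional seconds are rounded up to the next tick. Assuming, for
 * the example only, HZ == 100 (so 1000000/HZ == 10000 usec per tick), a
 * request from user space such as
 *
 *	struct timeval tv = { .tv_sec = 1, .tv_usec = 2500 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * yields *timeo_p = 1*100 + (2500 + 9999)/10000 = 101 jiffies.
 */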
257 static void sock_warn_obsolete_bsdism(const char *name)
259 static int warned;
260 static char warncomm[TASK_COMM_LEN];
261 if (strcmp(warncomm, current->comm) && warned < 5) {
262 strcpy(warncomm, current->comm);
263 printk(KERN_WARNING "process `%s' is using obsolete "
264 "%s SO_BSDCOMPAT\n", warncomm, name);
265 warned++;
269 static void sock_disable_timestamp(struct sock *sk, int flag)
271 if (sock_flag(sk, flag)) {
272 sock_reset_flag(sk, flag);
273 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
274 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
275 net_disable_timestamp();
281 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
283 int err = 0;
284 int skb_len;
286 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
287 number of warnings when compiling with -W --ANK
289 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
290 (unsigned)sk->sk_rcvbuf) {
291 err = -ENOMEM;
292 goto out;
295 err = sk_filter(sk, skb);
296 if (err)
297 goto out;
299 if (!sk_rmem_schedule(sk, skb->truesize)) {
300 err = -ENOBUFS;
301 goto out;
304 skb->dev = NULL;
305 skb_set_owner_r(skb, sk);
307 /* Cache the SKB length before we tack it onto the receive
308 * queue. Once it is added it no longer belongs to us and
309 * may be freed by other threads of control pulling packets
310 * from the queue.
312 skb_len = skb->len;
314 skb_queue_tail(&sk->sk_receive_queue, skb);
316 if (!sock_flag(sk, SOCK_DEAD))
317 sk->sk_data_ready(sk, skb_len);
318 out:
319 return err;
321 EXPORT_SYMBOL(sock_queue_rcv_skb);
323 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
325 int rc = NET_RX_SUCCESS;
327 if (sk_filter(sk, skb))
328 goto discard_and_relse;
330 skb->dev = NULL;
332 if (nested)
333 bh_lock_sock_nested(sk);
334 else
335 bh_lock_sock(sk);
336 if (!sock_owned_by_user(sk)) {
338 * trylock + unlock semantics:
340 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
342 rc = sk_backlog_rcv(sk, skb);
344 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
345 } else
346 sk_add_backlog(sk, skb);
347 bh_unlock_sock(sk);
348 out:
349 sock_put(sk);
350 return rc;
351 discard_and_relse:
352 kfree_skb(skb);
353 goto out;
355 EXPORT_SYMBOL(sk_receive_skb);
357 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
359 struct dst_entry *dst = sk->sk_dst_cache;
361 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
362 sk->sk_dst_cache = NULL;
363 dst_release(dst);
364 return NULL;
367 return dst;
369 EXPORT_SYMBOL(__sk_dst_check);
371 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
373 struct dst_entry *dst = sk_dst_get(sk);
375 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
376 sk_dst_reset(sk);
377 dst_release(dst);
378 return NULL;
381 return dst;
383 EXPORT_SYMBOL(sk_dst_check);
385 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
387 int ret = -ENOPROTOOPT;
388 #ifdef CONFIG_NETDEVICES
389 struct net *net = sock_net(sk);
390 char devname[IFNAMSIZ];
391 int index;
393 /* Sorry... */
394 ret = -EPERM;
395 if (!capable(CAP_NET_RAW))
396 goto out;
398 ret = -EINVAL;
399 if (optlen < 0)
400 goto out;
402 /* Bind this socket to a particular device like "eth0",
403 * as specified in the passed interface name. If the
404 * name is "" or the option length is zero the socket
405 * is not bound.
407 if (optlen > IFNAMSIZ - 1)
408 optlen = IFNAMSIZ - 1;
409 memset(devname, 0, sizeof(devname));
411 ret = -EFAULT;
412 if (copy_from_user(devname, optval, optlen))
413 goto out;
415 if (devname[0] == '\0') {
416 index = 0;
417 } else {
418 struct net_device *dev = dev_get_by_name(net, devname);
420 ret = -ENODEV;
421 if (!dev)
422 goto out;
424 index = dev->ifindex;
425 dev_put(dev);
428 lock_sock(sk);
429 sk->sk_bound_dev_if = index;
430 sk_dst_reset(sk);
431 release_sock(sk);
433 ret = 0;
435 out:
436 #endif
438 return ret;
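/*
 * Illustrative user-space sketch, not part of the original file: the option
 * handled above takes a plain interface-name string; an empty string (or a
 * zero option length) clears the binding, and CAP_NET_RAW is required either
 * way, as enforced above.
 *
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");
 */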
441 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
443 if (valbool)
444 sock_set_flag(sk, bit);
445 else
446 sock_reset_flag(sk, bit);
450 * This is meant for all protocols to use and covers goings on
451 * at the socket level. Everything here is generic.
454 int sock_setsockopt(struct socket *sock, int level, int optname,
455 char __user *optval, unsigned int optlen)
457 struct sock *sk = sock->sk;
458 int val;
459 int valbool;
460 struct linger ling;
461 int ret = 0;
464 * Options without arguments
467 if (optname == SO_BINDTODEVICE)
468 return sock_bindtodevice(sk, optval, optlen);
470 if (optlen < sizeof(int))
471 return -EINVAL;
473 if (get_user(val, (int __user *)optval))
474 return -EFAULT;
476 valbool = val ? 1 : 0;
478 lock_sock(sk);
480 switch (optname) {
481 case SO_DEBUG:
482 if (val && !capable(CAP_NET_ADMIN))
483 ret = -EACCES;
484 else
485 sock_valbool_flag(sk, SOCK_DBG, valbool);
486 break;
487 case SO_REUSEADDR:
488 sk->sk_reuse = valbool;
489 break;
490 case SO_TYPE:
491 case SO_PROTOCOL:
492 case SO_DOMAIN:
493 case SO_ERROR:
494 ret = -ENOPROTOOPT;
495 break;
496 case SO_DONTROUTE:
497 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
498 break;
499 case SO_BROADCAST:
500 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
501 break;
502 case SO_SNDBUF:
503 /* Don't error on this; BSD doesn't, and if you think
504 about it, this is right. Otherwise apps have to
505 play 'guess the biggest size' games. RCVBUF/SNDBUF
506 are treated in BSD as hints */
508 if (val > sysctl_wmem_max)
509 val = sysctl_wmem_max;
510 set_sndbuf:
511 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
512 if ((val * 2) < SOCK_MIN_SNDBUF)
513 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
514 else
515 sk->sk_sndbuf = val * 2;
518 * Wake up sending tasks if we
519 * upped the value.
521 sk->sk_write_space(sk);
522 break;
524 case SO_SNDBUFFORCE:
525 if (!capable(CAP_NET_ADMIN)) {
526 ret = -EPERM;
527 break;
529 goto set_sndbuf;
531 case SO_RCVBUF:
532 /* Don't error on this; BSD doesn't, and if you think
533 about it, this is right. Otherwise apps have to
534 play 'guess the biggest size' games. RCVBUF/SNDBUF
535 are treated in BSD as hints */
537 if (val > sysctl_rmem_max)
538 val = sysctl_rmem_max;
539 set_rcvbuf:
540 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
542 * We double it on the way in to account for
543 * "struct sk_buff" etc. overhead. Applications
544 * assume that the SO_RCVBUF setting they make will
545 * allow that much actual data to be received on that
546 * socket.
548 * Applications are unaware that "struct sk_buff" and
549 * other overheads allocate from the receive buffer
550 * during socket buffer allocation.
552 * And after considering the possible alternatives,
553 * returning the value we actually used in getsockopt
554 * is the most desirable behavior.
556 if ((val * 2) < SOCK_MIN_RCVBUF)
557 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
558 else
559 sk->sk_rcvbuf = val * 2;
560 break;
562 case SO_RCVBUFFORCE:
563 if (!capable(CAP_NET_ADMIN)) {
564 ret = -EPERM;
565 break;
567 goto set_rcvbuf;
569 case SO_KEEPALIVE:
570 #ifdef CONFIG_INET
571 if (sk->sk_protocol == IPPROTO_TCP)
572 tcp_set_keepalive(sk, valbool);
573 #endif
574 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
575 break;
577 case SO_OOBINLINE:
578 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
579 break;
581 case SO_NO_CHECK:
582 sk->sk_no_check = valbool;
583 break;
585 case SO_PRIORITY:
586 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
587 sk->sk_priority = val;
588 else
589 ret = -EPERM;
590 break;
592 case SO_LINGER:
593 if (optlen < sizeof(ling)) {
594 ret = -EINVAL; /* 1003.1g */
595 break;
597 if (copy_from_user(&ling, optval, sizeof(ling))) {
598 ret = -EFAULT;
599 break;
601 if (!ling.l_onoff)
602 sock_reset_flag(sk, SOCK_LINGER);
603 else {
604 #if (BITS_PER_LONG == 32)
605 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
606 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
607 else
608 #endif
609 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
610 sock_set_flag(sk, SOCK_LINGER);
612 break;
614 case SO_BSDCOMPAT:
615 sock_warn_obsolete_bsdism("setsockopt");
616 break;
618 case SO_PASSCRED:
619 if (valbool)
620 set_bit(SOCK_PASSCRED, &sock->flags);
621 else
622 clear_bit(SOCK_PASSCRED, &sock->flags);
623 break;
625 case SO_TIMESTAMP:
626 case SO_TIMESTAMPNS:
627 if (valbool) {
628 if (optname == SO_TIMESTAMP)
629 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
630 else
631 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
632 sock_set_flag(sk, SOCK_RCVTSTAMP);
633 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
634 } else {
635 sock_reset_flag(sk, SOCK_RCVTSTAMP);
636 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
638 break;
640 case SO_TIMESTAMPING:
641 if (val & ~SOF_TIMESTAMPING_MASK) {
642 ret = -EINVAL;
643 break;
645 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
646 val & SOF_TIMESTAMPING_TX_HARDWARE);
647 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
648 val & SOF_TIMESTAMPING_TX_SOFTWARE);
649 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
650 val & SOF_TIMESTAMPING_RX_HARDWARE);
651 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
652 sock_enable_timestamp(sk,
653 SOCK_TIMESTAMPING_RX_SOFTWARE);
654 else
655 sock_disable_timestamp(sk,
656 SOCK_TIMESTAMPING_RX_SOFTWARE);
657 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
658 val & SOF_TIMESTAMPING_SOFTWARE);
659 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
660 val & SOF_TIMESTAMPING_SYS_HARDWARE);
661 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
662 val & SOF_TIMESTAMPING_RAW_HARDWARE);
663 break;
665 case SO_RCVLOWAT:
666 if (val < 0)
667 val = INT_MAX;
668 sk->sk_rcvlowat = val ? : 1;
669 break;
671 case SO_RCVTIMEO:
672 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
673 break;
675 case SO_SNDTIMEO:
676 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
677 break;
679 case SO_ATTACH_FILTER:
680 ret = -EINVAL;
681 if (optlen == sizeof(struct sock_fprog)) {
682 struct sock_fprog fprog;
684 ret = -EFAULT;
685 if (copy_from_user(&fprog, optval, sizeof(fprog)))
686 break;
688 ret = sk_attach_filter(&fprog, sk);
690 break;
692 case SO_DETACH_FILTER:
693 ret = sk_detach_filter(sk);
694 break;
696 case SO_PASSSEC:
697 if (valbool)
698 set_bit(SOCK_PASSSEC, &sock->flags);
699 else
700 clear_bit(SOCK_PASSSEC, &sock->flags);
701 break;
702 case SO_MARK:
703 if (!capable(CAP_NET_ADMIN))
704 ret = -EPERM;
705 else
706 sk->sk_mark = val;
707 break;
709 /* We implement SO_SNDLOWAT etc. as not
710 settable (1003.1g 5.3) */
711 default:
712 ret = -ENOPROTOOPT;
713 break;
715 release_sock(sk);
716 return ret;
718 EXPORT_SYMBOL(sock_setsockopt);
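/*
 * Illustrative user-space sketch, not part of the original file: because
 * SO_SNDBUF/SO_RCVBUF values are doubled on the way in (to cover sk_buff
 * overhead) and clamped to sysctl_wmem_max/sysctl_rmem_max, reading the
 * option back returns the value the kernel will actually use:
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got is typically 2 * req, or twice the clamped maximum
 */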
721 int sock_getsockopt(struct socket *sock, int level, int optname,
722 char __user *optval, int __user *optlen)
724 struct sock *sk = sock->sk;
726 union {
727 int val;
728 struct linger ling;
729 struct timeval tm;
730 } v;
732 unsigned int lv = sizeof(int);
733 int len;
735 if (get_user(len, optlen))
736 return -EFAULT;
737 if (len < 0)
738 return -EINVAL;
740 memset(&v, 0, sizeof(v));
742 switch (optname) {
743 case SO_DEBUG:
744 v.val = sock_flag(sk, SOCK_DBG);
745 break;
747 case SO_DONTROUTE:
748 v.val = sock_flag(sk, SOCK_LOCALROUTE);
749 break;
751 case SO_BROADCAST:
752 v.val = !!sock_flag(sk, SOCK_BROADCAST);
753 break;
755 case SO_SNDBUF:
756 v.val = sk->sk_sndbuf;
757 break;
759 case SO_RCVBUF:
760 v.val = sk->sk_rcvbuf;
761 break;
763 case SO_REUSEADDR:
764 v.val = sk->sk_reuse;
765 break;
767 case SO_KEEPALIVE:
768 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
769 break;
771 case SO_TYPE:
772 v.val = sk->sk_type;
773 break;
775 case SO_PROTOCOL:
776 v.val = sk->sk_protocol;
777 break;
779 case SO_DOMAIN:
780 v.val = sk->sk_family;
781 break;
783 case SO_ERROR:
784 v.val = -sock_error(sk);
785 if (v.val == 0)
786 v.val = xchg(&sk->sk_err_soft, 0);
787 break;
789 case SO_OOBINLINE:
790 v.val = !!sock_flag(sk, SOCK_URGINLINE);
791 break;
793 case SO_NO_CHECK:
794 v.val = sk->sk_no_check;
795 break;
797 case SO_PRIORITY:
798 v.val = sk->sk_priority;
799 break;
801 case SO_LINGER:
802 lv = sizeof(v.ling);
803 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
804 v.ling.l_linger = sk->sk_lingertime / HZ;
805 break;
807 case SO_BSDCOMPAT:
808 sock_warn_obsolete_bsdism("getsockopt");
809 break;
811 case SO_TIMESTAMP:
812 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
813 !sock_flag(sk, SOCK_RCVTSTAMPNS);
814 break;
816 case SO_TIMESTAMPNS:
817 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
818 break;
820 case SO_TIMESTAMPING:
821 v.val = 0;
822 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
823 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
824 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
825 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
826 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
827 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
828 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
829 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
830 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
831 v.val |= SOF_TIMESTAMPING_SOFTWARE;
832 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
833 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
834 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
835 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
836 break;
838 case SO_RCVTIMEO:
839 lv = sizeof(struct timeval);
840 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
841 v.tm.tv_sec = 0;
842 v.tm.tv_usec = 0;
843 } else {
844 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
845 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
847 break;
849 case SO_SNDTIMEO:
850 lv = sizeof(struct timeval);
851 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
852 v.tm.tv_sec = 0;
853 v.tm.tv_usec = 0;
854 } else {
855 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
856 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
858 break;
860 case SO_RCVLOWAT:
861 v.val = sk->sk_rcvlowat;
862 break;
864 case SO_SNDLOWAT:
865 v.val = 1;
866 break;
868 case SO_PASSCRED:
869 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
870 break;
872 case SO_PEERCRED:
873 if (len > sizeof(sk->sk_peercred))
874 len = sizeof(sk->sk_peercred);
875 if (copy_to_user(optval, &sk->sk_peercred, len))
876 return -EFAULT;
877 goto lenout;
879 case SO_PEERNAME:
881 char address[128];
883 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
884 return -ENOTCONN;
885 if (lv < len)
886 return -EINVAL;
887 if (copy_to_user(optval, address, len))
888 return -EFAULT;
889 goto lenout;
892 /* Dubious BSD thing... Probably nobody even uses it, but
893 * the UNIX standard wants it for whatever reason... -DaveM
895 case SO_ACCEPTCONN:
896 v.val = sk->sk_state == TCP_LISTEN;
897 break;
899 case SO_PASSSEC:
900 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
901 break;
903 case SO_PEERSEC:
904 return security_socket_getpeersec_stream(sock, optval, optlen, len);
906 case SO_MARK:
907 v.val = sk->sk_mark;
908 break;
910 default:
911 return -ENOPROTOOPT;
914 if (len > lv)
915 len = lv;
916 if (copy_to_user(optval, &v, len))
917 return -EFAULT;
918 lenout:
919 if (put_user(len, optlen))
920 return -EFAULT;
921 return 0;
925 * Initialize an sk_lock.
927 * (We also register the sk_lock with the lock validator.)
929 static inline void sock_lock_init(struct sock *sk)
931 sock_lock_init_class_and_name(sk,
932 af_family_slock_key_strings[sk->sk_family],
933 af_family_slock_keys + sk->sk_family,
934 af_family_key_strings[sk->sk_family],
935 af_family_keys + sk->sk_family);
939 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
940 * even temporarily, because of RCU lookups. sk_node should also be left as is.
942 static void sock_copy(struct sock *nsk, const struct sock *osk)
944 #ifdef CONFIG_SECURITY_NETWORK
945 void *sptr = nsk->sk_security;
946 #endif
947 BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
948 sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
949 memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
950 osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
951 #ifdef CONFIG_SECURITY_NETWORK
952 nsk->sk_security = sptr;
953 security_sk_clone(osk, nsk);
954 #endif
957 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
958 int family)
960 struct sock *sk;
961 struct kmem_cache *slab;
963 slab = prot->slab;
964 if (slab != NULL) {
965 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
966 if (!sk)
967 return sk;
968 if (priority & __GFP_ZERO) {
970 * caches using SLAB_DESTROY_BY_RCU should leave
971 * sk_node.next unmodified. Special care is taken
972 * when initializing the object to zero.
974 if (offsetof(struct sock, sk_node.next) != 0)
975 memset(sk, 0, offsetof(struct sock, sk_node.next));
976 memset(&sk->sk_node.pprev, 0,
977 prot->obj_size - offsetof(struct sock,
978 sk_node.pprev));
981 else
982 sk = kmalloc(prot->obj_size, priority);
984 if (sk != NULL) {
985 kmemcheck_annotate_bitfield(sk, flags);
987 if (security_sk_alloc(sk, family, priority))
988 goto out_free;
990 if (!try_module_get(prot->owner))
991 goto out_free_sec;
994 return sk;
996 out_free_sec:
997 security_sk_free(sk);
998 out_free:
999 if (slab != NULL)
1000 kmem_cache_free(slab, sk);
1001 else
1002 kfree(sk);
1003 return NULL;
1006 static void sk_prot_free(struct proto *prot, struct sock *sk)
1008 struct kmem_cache *slab;
1009 struct module *owner;
1011 owner = prot->owner;
1012 slab = prot->slab;
1014 security_sk_free(sk);
1015 if (slab != NULL)
1016 kmem_cache_free(slab, sk);
1017 else
1018 kfree(sk);
1019 module_put(owner);
1023 * sk_alloc - All socket objects are allocated here
1024 * @net: the applicable net namespace
1025 * @family: protocol family
1026 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1027 * @prot: struct proto associated with this new sock instance
1029 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1030 struct proto *prot)
1032 struct sock *sk;
1034 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1035 if (sk) {
1036 sk->sk_family = family;
1038 * See comment in struct sock definition to understand
1039 * why we need sk_prot_creator -acme
1041 sk->sk_prot = sk->sk_prot_creator = prot;
1042 sock_lock_init(sk);
1043 sock_net_set(sk, get_net(net));
1044 atomic_set(&sk->sk_wmem_alloc, 1);
1047 return sk;
1049 EXPORT_SYMBOL(sk_alloc);
1051 static void __sk_free(struct sock *sk)
1053 struct sk_filter *filter;
1055 if (sk->sk_destruct)
1056 sk->sk_destruct(sk);
1058 filter = rcu_dereference(sk->sk_filter);
1059 if (filter) {
1060 sk_filter_uncharge(sk, filter);
1061 rcu_assign_pointer(sk->sk_filter, NULL);
1064 sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1065 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1067 if (atomic_read(&sk->sk_omem_alloc))
1068 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1069 __func__, atomic_read(&sk->sk_omem_alloc));
1071 put_net(sock_net(sk));
1072 sk_prot_free(sk->sk_prot_creator, sk);
1075 void sk_free(struct sock *sk)
1078 * We subtract one from sk_wmem_alloc and can tell whether
1079 * some packets are still in some tx queue.
1080 * If not zero, sock_wfree() will call __sk_free(sk) later
1082 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1083 __sk_free(sk);
1085 EXPORT_SYMBOL(sk_free);
1088 * The last sock_put should drop a reference to sk->sk_net. It has already
1089 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1090 * is not an option.
1091 * Take a reference to a socket to remove it from the hash _alive_ and after that
1092 * destroy it in the context of init_net.
1094 void sk_release_kernel(struct sock *sk)
1096 if (sk == NULL || sk->sk_socket == NULL)
1097 return;
1099 sock_hold(sk);
1100 sock_release(sk->sk_socket);
1101 release_net(sock_net(sk));
1102 sock_net_set(sk, get_net(&init_net));
1103 sock_put(sk);
1105 EXPORT_SYMBOL(sk_release_kernel);
1107 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1109 struct sock *newsk;
1111 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1112 if (newsk != NULL) {
1113 struct sk_filter *filter;
1115 sock_copy(newsk, sk);
1117 /* SANITY */
1118 get_net(sock_net(newsk));
1119 sk_node_init(&newsk->sk_node);
1120 sock_lock_init(newsk);
1121 bh_lock_sock(newsk);
1122 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1124 atomic_set(&newsk->sk_rmem_alloc, 0);
1126 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1128 atomic_set(&newsk->sk_wmem_alloc, 1);
1129 atomic_set(&newsk->sk_omem_alloc, 0);
1130 skb_queue_head_init(&newsk->sk_receive_queue);
1131 skb_queue_head_init(&newsk->sk_write_queue);
1132 #ifdef CONFIG_NET_DMA
1133 skb_queue_head_init(&newsk->sk_async_wait_queue);
1134 #endif
1136 rwlock_init(&newsk->sk_dst_lock);
1137 rwlock_init(&newsk->sk_callback_lock);
1138 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1139 af_callback_keys + newsk->sk_family,
1140 af_family_clock_key_strings[newsk->sk_family]);
1142 newsk->sk_dst_cache = NULL;
1143 newsk->sk_wmem_queued = 0;
1144 newsk->sk_forward_alloc = 0;
1145 newsk->sk_send_head = NULL;
1146 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1148 sock_reset_flag(newsk, SOCK_DONE);
1149 skb_queue_head_init(&newsk->sk_error_queue);
1151 filter = newsk->sk_filter;
1152 if (filter != NULL)
1153 sk_filter_charge(newsk, filter);
1155 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1156 /* It is still raw copy of parent, so invalidate
1157 * destructor and make plain sk_free() */
1158 newsk->sk_destruct = NULL;
1159 sk_free(newsk);
1160 newsk = NULL;
1161 goto out;
1164 newsk->sk_err = 0;
1165 newsk->sk_priority = 0;
1167 * Before updating sk_refcnt, we must commit prior changes to memory
1168 * (Documentation/RCU/rculist_nulls.txt for details)
1170 smp_wmb();
1171 atomic_set(&newsk->sk_refcnt, 2);
1174 * Increment the counter in the same struct proto as the master
1175 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1176 * is the same as sk->sk_prot->socks, as this field was copied
1177 * with memcpy).
1179 * This _changes_ the previous behaviour, where
1180 * tcp_create_openreq_child always was incrementing the
1181 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1182 * to be taken into account in all callers. -acme
1184 sk_refcnt_debug_inc(newsk);
1185 sk_set_socket(newsk, NULL);
1186 newsk->sk_sleep = NULL;
1188 if (newsk->sk_prot->sockets_allocated)
1189 percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1191 if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1192 sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1193 net_enable_timestamp();
1195 out:
1196 return newsk;
1198 EXPORT_SYMBOL_GPL(sk_clone);
1200 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1202 __sk_dst_set(sk, dst);
1203 sk->sk_route_caps = dst->dev->features;
1204 if (sk->sk_route_caps & NETIF_F_GSO)
1205 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1206 if (sk_can_gso(sk)) {
1207 if (dst->header_len) {
1208 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1209 } else {
1210 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1211 sk->sk_gso_max_size = dst->dev->gso_max_size;
1215 EXPORT_SYMBOL_GPL(sk_setup_caps);
1217 void __init sk_init(void)
1219 if (totalram_pages <= 4096) {
1220 sysctl_wmem_max = 32767;
1221 sysctl_rmem_max = 32767;
1222 sysctl_wmem_default = 32767;
1223 sysctl_rmem_default = 32767;
1224 } else if (totalram_pages >= 131072) {
1225 sysctl_wmem_max = 131071;
1226 sysctl_rmem_max = 131071;
1231 * Simple resource managers for sockets.
1236 * Write buffer destructor automatically called from kfree_skb.
1238 void sock_wfree(struct sk_buff *skb)
1240 struct sock *sk = skb->sk;
1241 unsigned int len = skb->truesize;
1243 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1245 * Keep a reference on sk_wmem_alloc, this will be released
1246 * after sk_write_space() call
1248 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1249 sk->sk_write_space(sk);
1250 len = 1;
1253 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1254 * could not do because of in-flight packets
1256 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1257 __sk_free(sk);
1259 EXPORT_SYMBOL(sock_wfree);
1262 * Read buffer destructor automatically called from kfree_skb.
1264 void sock_rfree(struct sk_buff *skb)
1266 struct sock *sk = skb->sk;
1268 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1269 sk_mem_uncharge(skb->sk, skb->truesize);
1271 EXPORT_SYMBOL(sock_rfree);
1274 int sock_i_uid(struct sock *sk)
1276 int uid;
1278 read_lock(&sk->sk_callback_lock);
1279 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1280 read_unlock(&sk->sk_callback_lock);
1281 return uid;
1283 EXPORT_SYMBOL(sock_i_uid);
1285 unsigned long sock_i_ino(struct sock *sk)
1287 unsigned long ino;
1289 read_lock(&sk->sk_callback_lock);
1290 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1291 read_unlock(&sk->sk_callback_lock);
1292 return ino;
1294 EXPORT_SYMBOL(sock_i_ino);
1297 * Allocate a skb from the socket's send buffer.
1299 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1300 gfp_t priority)
1302 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1303 struct sk_buff *skb = alloc_skb(size, priority);
1304 if (skb) {
1305 skb_set_owner_w(skb, sk);
1306 return skb;
1309 return NULL;
1311 EXPORT_SYMBOL(sock_wmalloc);
1314 * Allocate a skb from the socket's receive buffer.
1316 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1317 gfp_t priority)
1319 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1320 struct sk_buff *skb = alloc_skb(size, priority);
1321 if (skb) {
1322 skb_set_owner_r(skb, sk);
1323 return skb;
1326 return NULL;
1330 * Allocate a memory block from the socket's option memory buffer.
1332 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1334 if ((unsigned)size <= sysctl_optmem_max &&
1335 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1336 void *mem;
1337 /* First do the add, to avoid the race if kmalloc
1338 * might sleep.
1340 atomic_add(size, &sk->sk_omem_alloc);
1341 mem = kmalloc(size, priority);
1342 if (mem)
1343 return mem;
1344 atomic_sub(size, &sk->sk_omem_alloc);
1346 return NULL;
1348 EXPORT_SYMBOL(sock_kmalloc);
1351 * Free an option memory block.
1353 void sock_kfree_s(struct sock *sk, void *mem, int size)
1355 kfree(mem);
1356 atomic_sub(size, &sk->sk_omem_alloc);
1358 EXPORT_SYMBOL(sock_kfree_s);
1360 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1361 I think, these locks should be removed for datagram sockets.
1363 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1365 DEFINE_WAIT(wait);
1367 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1368 for (;;) {
1369 if (!timeo)
1370 break;
1371 if (signal_pending(current))
1372 break;
1373 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1374 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1375 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1376 break;
1377 if (sk->sk_shutdown & SEND_SHUTDOWN)
1378 break;
1379 if (sk->sk_err)
1380 break;
1381 timeo = schedule_timeout(timeo);
1383 finish_wait(sk->sk_sleep, &wait);
1384 return timeo;
1389 * Generic send/receive buffer handlers
1392 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1393 unsigned long data_len, int noblock,
1394 int *errcode)
1396 struct sk_buff *skb;
1397 gfp_t gfp_mask;
1398 long timeo;
1399 int err;
1401 gfp_mask = sk->sk_allocation;
1402 if (gfp_mask & __GFP_WAIT)
1403 gfp_mask |= __GFP_REPEAT;
1405 timeo = sock_sndtimeo(sk, noblock);
1406 while (1) {
1407 err = sock_error(sk);
1408 if (err != 0)
1409 goto failure;
1411 err = -EPIPE;
1412 if (sk->sk_shutdown & SEND_SHUTDOWN)
1413 goto failure;
1415 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1416 skb = alloc_skb(header_len, gfp_mask);
1417 if (skb) {
1418 int npages;
1419 int i;
1421 /* No pages, we're done... */
1422 if (!data_len)
1423 break;
1425 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1426 skb->truesize += data_len;
1427 skb_shinfo(skb)->nr_frags = npages;
1428 for (i = 0; i < npages; i++) {
1429 struct page *page;
1430 skb_frag_t *frag;
1432 page = alloc_pages(sk->sk_allocation, 0);
1433 if (!page) {
1434 err = -ENOBUFS;
1435 skb_shinfo(skb)->nr_frags = i;
1436 kfree_skb(skb);
1437 goto failure;
1440 frag = &skb_shinfo(skb)->frags[i];
1441 frag->page = page;
1442 frag->page_offset = 0;
1443 frag->size = (data_len >= PAGE_SIZE ?
1444 PAGE_SIZE :
1445 data_len);
1446 data_len -= PAGE_SIZE;
1449 /* Full success... */
1450 break;
1452 err = -ENOBUFS;
1453 goto failure;
1455 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1456 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1457 err = -EAGAIN;
1458 if (!timeo)
1459 goto failure;
1460 if (signal_pending(current))
1461 goto interrupted;
1462 timeo = sock_wait_for_wmem(sk, timeo);
1465 skb_set_owner_w(skb, sk);
1466 return skb;
1468 interrupted:
1469 err = sock_intr_errno(timeo);
1470 failure:
1471 *errcode = err;
1472 return NULL;
1474 EXPORT_SYMBOL(sock_alloc_send_pskb);
1476 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1477 int noblock, int *errcode)
1479 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1481 EXPORT_SYMBOL(sock_alloc_send_skb);
1483 static void __lock_sock(struct sock *sk)
1485 DEFINE_WAIT(wait);
1487 for (;;) {
1488 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1489 TASK_UNINTERRUPTIBLE);
1490 spin_unlock_bh(&sk->sk_lock.slock);
1491 schedule();
1492 spin_lock_bh(&sk->sk_lock.slock);
1493 if (!sock_owned_by_user(sk))
1494 break;
1496 finish_wait(&sk->sk_lock.wq, &wait);
1499 static void __release_sock(struct sock *sk)
1501 struct sk_buff *skb = sk->sk_backlog.head;
1503 do {
1504 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1505 bh_unlock_sock(sk);
1507 do {
1508 struct sk_buff *next = skb->next;
1510 skb->next = NULL;
1511 sk_backlog_rcv(sk, skb);
1514 * We are in process context here with softirqs
1515 * disabled, use cond_resched_softirq() to preempt.
1516 * This is safe to do because we've taken the backlog
1517 * queue private:
1519 cond_resched_softirq();
1521 skb = next;
1522 } while (skb != NULL);
1524 bh_lock_sock(sk);
1525 } while ((skb = sk->sk_backlog.head) != NULL);
1529 * sk_wait_data - wait for data to arrive at sk_receive_queue
1530 * @sk: sock to wait on
1531 * @timeo: for how long
1533 * Now socket state including sk->sk_err is changed only under lock,
1534 * hence we may omit checks after joining wait queue.
1535 * We check receive queue before schedule() only as optimization;
1536 * it is very likely that release_sock() added new data.
1538 int sk_wait_data(struct sock *sk, long *timeo)
1540 int rc;
1541 DEFINE_WAIT(wait);
1543 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1544 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1545 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1546 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1547 finish_wait(sk->sk_sleep, &wait);
1548 return rc;
1550 EXPORT_SYMBOL(sk_wait_data);
1553 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1554 * @sk: socket
1555 * @size: memory size to allocate
1556 * @kind: allocation type
1558 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1559 * rmem allocation. This function assumes that protocols which have
1560 * memory_pressure use sk_wmem_queued as write buffer accounting.
1562 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1564 struct proto *prot = sk->sk_prot;
1565 int amt = sk_mem_pages(size);
1566 int allocated;
1568 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1569 allocated = atomic_add_return(amt, prot->memory_allocated);
1571 /* Under limit. */
1572 if (allocated <= prot->sysctl_mem[0]) {
1573 if (prot->memory_pressure && *prot->memory_pressure)
1574 *prot->memory_pressure = 0;
1575 return 1;
1578 /* Under pressure. */
1579 if (allocated > prot->sysctl_mem[1])
1580 if (prot->enter_memory_pressure)
1581 prot->enter_memory_pressure(sk);
1583 /* Over hard limit. */
1584 if (allocated > prot->sysctl_mem[2])
1585 goto suppress_allocation;
1587 /* guarantee minimum buffer size under pressure */
1588 if (kind == SK_MEM_RECV) {
1589 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1590 return 1;
1591 } else { /* SK_MEM_SEND */
1592 if (sk->sk_type == SOCK_STREAM) {
1593 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1594 return 1;
1595 } else if (atomic_read(&sk->sk_wmem_alloc) <
1596 prot->sysctl_wmem[0])
1597 return 1;
1600 if (prot->memory_pressure) {
1601 int alloc;
1603 if (!*prot->memory_pressure)
1604 return 1;
1605 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1606 if (prot->sysctl_mem[2] > alloc *
1607 sk_mem_pages(sk->sk_wmem_queued +
1608 atomic_read(&sk->sk_rmem_alloc) +
1609 sk->sk_forward_alloc))
1610 return 1;
1613 suppress_allocation:
1615 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1616 sk_stream_moderate_sndbuf(sk);
1618 /* Fail only if socket is _under_ its sndbuf.
1619 * In this case we cannot block, so that we have to fail.
1621 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1622 return 1;
1625 /* Alas. Undo changes. */
1626 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1627 atomic_sub(amt, prot->memory_allocated);
1628 return 0;
1630 EXPORT_SYMBOL(__sk_mem_schedule);
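/*
 * Illustrative sketch, not part of the original file: charges are made in
 * whole SK_MEM_QUANTUM units (one page). Assuming, for the numbers only, a
 * 4096-byte quantum, scheduling room for an skb with truesize 6000 means
 *
 *	amt = sk_mem_pages(6000) = 2 quanta
 *	sk_forward_alloc += 2 * 4096 = 8192 bytes
 *	memory_allocated += 2
 *
 * and whatever the skb does not consume stays in sk_forward_alloc for later
 * allocations or is handed back by __sk_mem_reclaim().
 */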
1633 * __sk_mem_reclaim - reclaim memory_allocated
1634 * @sk: socket
1636 void __sk_mem_reclaim(struct sock *sk)
1638 struct proto *prot = sk->sk_prot;
1640 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1641 prot->memory_allocated);
1642 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1644 if (prot->memory_pressure && *prot->memory_pressure &&
1645 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1646 *prot->memory_pressure = 0;
1648 EXPORT_SYMBOL(__sk_mem_reclaim);
1652 * Set of default routines for initialising struct proto_ops when
1653 * the protocol does not support a particular function. In certain
1654 * cases where it makes no sense for a protocol to have a "do nothing"
1655 * function, some default processing is provided.
1658 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1660 return -EOPNOTSUPP;
1662 EXPORT_SYMBOL(sock_no_bind);
1664 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1665 int len, int flags)
1667 return -EOPNOTSUPP;
1669 EXPORT_SYMBOL(sock_no_connect);
1671 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1673 return -EOPNOTSUPP;
1675 EXPORT_SYMBOL(sock_no_socketpair);
1677 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1679 return -EOPNOTSUPP;
1681 EXPORT_SYMBOL(sock_no_accept);
1683 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1684 int *len, int peer)
1686 return -EOPNOTSUPP;
1688 EXPORT_SYMBOL(sock_no_getname);
1690 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1692 return 0;
1694 EXPORT_SYMBOL(sock_no_poll);
1696 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1698 return -EOPNOTSUPP;
1700 EXPORT_SYMBOL(sock_no_ioctl);
1702 int sock_no_listen(struct socket *sock, int backlog)
1704 return -EOPNOTSUPP;
1706 EXPORT_SYMBOL(sock_no_listen);
1708 int sock_no_shutdown(struct socket *sock, int how)
1710 return -EOPNOTSUPP;
1712 EXPORT_SYMBOL(sock_no_shutdown);
1714 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1715 char __user *optval, unsigned int optlen)
1717 return -EOPNOTSUPP;
1719 EXPORT_SYMBOL(sock_no_setsockopt);
1721 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1722 char __user *optval, int __user *optlen)
1724 return -EOPNOTSUPP;
1726 EXPORT_SYMBOL(sock_no_getsockopt);
1728 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1729 size_t len)
1731 return -EOPNOTSUPP;
1733 EXPORT_SYMBOL(sock_no_sendmsg);
1735 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1736 size_t len, int flags)
1738 return -EOPNOTSUPP;
1740 EXPORT_SYMBOL(sock_no_recvmsg);
1742 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1744 /* Mirror missing mmap method error code */
1745 return -ENODEV;
1747 EXPORT_SYMBOL(sock_no_mmap);
1749 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1751 ssize_t res;
1752 struct msghdr msg = {.msg_flags = flags};
1753 struct kvec iov;
1754 char *kaddr = kmap(page);
1755 iov.iov_base = kaddr + offset;
1756 iov.iov_len = size;
1757 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1758 kunmap(page);
1759 return res;
1761 EXPORT_SYMBOL(sock_no_sendpage);
1764 * Default Socket Callbacks
1767 static void sock_def_wakeup(struct sock *sk)
1769 read_lock(&sk->sk_callback_lock);
1770 if (sk_has_sleeper(sk))
1771 wake_up_interruptible_all(sk->sk_sleep);
1772 read_unlock(&sk->sk_callback_lock);
1775 static void sock_def_error_report(struct sock *sk)
1777 read_lock(&sk->sk_callback_lock);
1778 if (sk_has_sleeper(sk))
1779 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1780 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1781 read_unlock(&sk->sk_callback_lock);
1784 static void sock_def_readable(struct sock *sk, int len)
1786 read_lock(&sk->sk_callback_lock);
1787 if (sk_has_sleeper(sk))
1788 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1789 POLLRDNORM | POLLRDBAND);
1790 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1791 read_unlock(&sk->sk_callback_lock);
1794 static void sock_def_write_space(struct sock *sk)
1796 read_lock(&sk->sk_callback_lock);
1798 /* Do not wake up a writer until he can make "significant"
1799 * progress. --DaveM
1801 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1802 if (sk_has_sleeper(sk))
1803 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1804 POLLWRNORM | POLLWRBAND);
1806 /* Should agree with poll, otherwise some programs break */
1807 if (sock_writeable(sk))
1808 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1811 read_unlock(&sk->sk_callback_lock);
1814 static void sock_def_destruct(struct sock *sk)
1816 kfree(sk->sk_protinfo);
1819 void sk_send_sigurg(struct sock *sk)
1821 if (sk->sk_socket && sk->sk_socket->file)
1822 if (send_sigurg(&sk->sk_socket->file->f_owner))
1823 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1825 EXPORT_SYMBOL(sk_send_sigurg);
1827 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1828 unsigned long expires)
1830 if (!mod_timer(timer, expires))
1831 sock_hold(sk);
1833 EXPORT_SYMBOL(sk_reset_timer);
1835 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1837 if (timer_pending(timer) && del_timer(timer))
1838 __sock_put(sk);
1840 EXPORT_SYMBOL(sk_stop_timer);
1842 void sock_init_data(struct socket *sock, struct sock *sk)
1844 skb_queue_head_init(&sk->sk_receive_queue);
1845 skb_queue_head_init(&sk->sk_write_queue);
1846 skb_queue_head_init(&sk->sk_error_queue);
1847 #ifdef CONFIG_NET_DMA
1848 skb_queue_head_init(&sk->sk_async_wait_queue);
1849 #endif
1851 sk->sk_send_head = NULL;
1853 init_timer(&sk->sk_timer);
1855 sk->sk_allocation = GFP_KERNEL;
1856 sk->sk_rcvbuf = sysctl_rmem_default;
1857 sk->sk_sndbuf = sysctl_wmem_default;
1858 sk->sk_state = TCP_CLOSE;
1859 sk_set_socket(sk, sock);
1861 sock_set_flag(sk, SOCK_ZAPPED);
1863 if (sock) {
1864 sk->sk_type = sock->type;
1865 sk->sk_sleep = &sock->wait;
1866 sock->sk = sk;
1867 } else
1868 sk->sk_sleep = NULL;
1870 rwlock_init(&sk->sk_dst_lock);
1871 rwlock_init(&sk->sk_callback_lock);
1872 lockdep_set_class_and_name(&sk->sk_callback_lock,
1873 af_callback_keys + sk->sk_family,
1874 af_family_clock_key_strings[sk->sk_family]);
1876 sk->sk_state_change = sock_def_wakeup;
1877 sk->sk_data_ready = sock_def_readable;
1878 sk->sk_write_space = sock_def_write_space;
1879 sk->sk_error_report = sock_def_error_report;
1880 sk->sk_destruct = sock_def_destruct;
1882 sk->sk_sndmsg_page = NULL;
1883 sk->sk_sndmsg_off = 0;
1885 sk->sk_peercred.pid = 0;
1886 sk->sk_peercred.uid = -1;
1887 sk->sk_peercred.gid = -1;
1888 sk->sk_write_pending = 0;
1889 sk->sk_rcvlowat = 1;
1890 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1891 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1893 sk->sk_stamp = ktime_set(-1L, 0);
1896 * Before updating sk_refcnt, we must commit prior changes to memory
1897 * (Documentation/RCU/rculist_nulls.txt for details)
1899 smp_wmb();
1900 atomic_set(&sk->sk_refcnt, 1);
1901 atomic_set(&sk->sk_drops, 0);
1903 EXPORT_SYMBOL(sock_init_data);
1905 void lock_sock_nested(struct sock *sk, int subclass)
1907 might_sleep();
1908 spin_lock_bh(&sk->sk_lock.slock);
1909 if (sk->sk_lock.owned)
1910 __lock_sock(sk);
1911 sk->sk_lock.owned = 1;
1912 spin_unlock(&sk->sk_lock.slock);
1914 * The sk_lock has mutex_lock() semantics here:
1916 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1917 local_bh_enable();
1919 EXPORT_SYMBOL(lock_sock_nested);
1921 void release_sock(struct sock *sk)
1924 * The sk_lock has mutex_unlock() semantics:
1926 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1928 spin_lock_bh(&sk->sk_lock.slock);
1929 if (sk->sk_backlog.tail)
1930 __release_sock(sk);
1931 sk->sk_lock.owned = 0;
1932 if (waitqueue_active(&sk->sk_lock.wq))
1933 wake_up(&sk->sk_lock.wq);
1934 spin_unlock_bh(&sk->sk_lock.slock);
1936 EXPORT_SYMBOL(release_sock);
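/*
 * Illustrative sketch, not part of the original file: typical protocol-side
 * use of the socket lock. While the lock is owned, packets delivered from
 * softirq context are queued on sk->sk_backlog and replayed by
 * __release_sock() when the owner calls release_sock():
 *
 *	lock_sock(sk);
 *	... update socket state, possibly sleeping ...
 *	release_sock(sk);	// also drains the backlog queue
 */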
1938 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1940 struct timeval tv;
1941 if (!sock_flag(sk, SOCK_TIMESTAMP))
1942 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1943 tv = ktime_to_timeval(sk->sk_stamp);
1944 if (tv.tv_sec == -1)
1945 return -ENOENT;
1946 if (tv.tv_sec == 0) {
1947 sk->sk_stamp = ktime_get_real();
1948 tv = ktime_to_timeval(sk->sk_stamp);
1950 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1952 EXPORT_SYMBOL(sock_get_timestamp);
1954 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1956 struct timespec ts;
1957 if (!sock_flag(sk, SOCK_TIMESTAMP))
1958 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1959 ts = ktime_to_timespec(sk->sk_stamp);
1960 if (ts.tv_sec == -1)
1961 return -ENOENT;
1962 if (ts.tv_sec == 0) {
1963 sk->sk_stamp = ktime_get_real();
1964 ts = ktime_to_timespec(sk->sk_stamp);
1966 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1968 EXPORT_SYMBOL(sock_get_timestampns);
1970 void sock_enable_timestamp(struct sock *sk, int flag)
1972 if (!sock_flag(sk, flag)) {
1973 sock_set_flag(sk, flag);
1975 * we just set one of the two flags which require net
1976 * time stamping, but time stamping might have been on
1977 * already because of the other one
1979 if (!sock_flag(sk,
1980 flag == SOCK_TIMESTAMP ?
1981 SOCK_TIMESTAMPING_RX_SOFTWARE :
1982 SOCK_TIMESTAMP))
1983 net_enable_timestamp();
1988 * Get a socket option on a socket.
1990 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1991 * asynchronous errors should be reported by getsockopt. We assume
1992 * this means if you specify SO_ERROR (otherwise what's the point of it).
1994 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1995 char __user *optval, int __user *optlen)
1997 struct sock *sk = sock->sk;
1999 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2001 EXPORT_SYMBOL(sock_common_getsockopt);
2003 #ifdef CONFIG_COMPAT
2004 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2005 char __user *optval, int __user *optlen)
2007 struct sock *sk = sock->sk;
2009 if (sk->sk_prot->compat_getsockopt != NULL)
2010 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2011 optval, optlen);
2012 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2014 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2015 #endif
2017 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2018 struct msghdr *msg, size_t size, int flags)
2020 struct sock *sk = sock->sk;
2021 int addr_len = 0;
2022 int err;
2024 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2025 flags & ~MSG_DONTWAIT, &addr_len);
2026 if (err >= 0)
2027 msg->msg_namelen = addr_len;
2028 return err;
2030 EXPORT_SYMBOL(sock_common_recvmsg);
2033 * Set socket options on an inet socket.
2035 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2036 char __user *optval, unsigned int optlen)
2038 struct sock *sk = sock->sk;
2040 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2042 EXPORT_SYMBOL(sock_common_setsockopt);
2044 #ifdef CONFIG_COMPAT
2045 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2046 char __user *optval, unsigned int optlen)
2048 struct sock *sk = sock->sk;
2050 if (sk->sk_prot->compat_setsockopt != NULL)
2051 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2052 optval, optlen);
2053 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2055 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2056 #endif
2058 void sk_common_release(struct sock *sk)
2060 if (sk->sk_prot->destroy)
2061 sk->sk_prot->destroy(sk);
2064 * Observation: when sock_common_release is called, processes have
2065 * no access to socket. But net still has.
2066 * Step one, detach it from networking:
2068 * A. Remove from hash tables.
2071 sk->sk_prot->unhash(sk);
2074 * At this point the socket cannot receive new packets, but it is possible
2075 * that some packets are in flight because some CPU is running the receiver and
2076 * did the hash table lookup before we unhashed the socket. They will reach the
2077 * receive queue and will be purged by the socket destructor.
2079 * Also we still have packets pending on the receive queue and probably
2080 * our own packets waiting in device queues. sock_destroy will drain the
2081 * receive queue, but transmitted packets will delay socket destruction
2082 * until the last reference is released.
2085 sock_orphan(sk);
2087 xfrm_sk_free_policy(sk);
2089 sk_refcnt_debug_release(sk);
2090 sock_put(sk);
2092 EXPORT_SYMBOL(sk_common_release);
2094 static DEFINE_RWLOCK(proto_list_lock);
2095 static LIST_HEAD(proto_list);
2097 #ifdef CONFIG_PROC_FS
2098 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2099 struct prot_inuse {
2100 int val[PROTO_INUSE_NR];
2103 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	int cpu = smp_processor_id();
	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
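
/*
 * Illustrative sketch (not part of this file): protocols account their
 * live sockets by calling sock_prot_inuse_add() with +1/-1 from their
 * hash/unhash paths, which is what feeds the "sockets" column of
 * /proc/net/protocols.  Callers typically run with preemption disabled
 * (e.g. under the hash-table lock with BHs off), since the helper uses a
 * per-CPU counter.  The "my_hash"/"my_unhash" names are hypothetical.
 *
 *	static void my_hash(struct sock *sk)
 *	{
 *		... insert sk into the protocol's hash table ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	}
 *
 *	static void my_unhash(struct sock *sk)
 *	{
 *		... remove sk from the protocol's hash table ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *	}
 */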
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	write_unlock(&proto_list_lock);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
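
/*
 * Illustrative sketch (not part of this file): a protocol module fills in
 * a struct proto and hands it to proto_register(), usually asking for a
 * dedicated slab cache for its sockets, and undoes that with
 * proto_unregister() on unload.  The "my_proto"/"struct my_sock"/"my_close"
 * names are hypothetical and only a few of the struct proto fields are
 * shown.
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *		.close		= my_close,
 *		...			(remaining hooks omitted)
 *	};
 *
 *	static int __init my_proto_init(void)
 *	{
 *		return proto_register(&my_proto, 1);
 *	}
 *
 *	static void __exit my_proto_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */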
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
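
/*
 * Illustrative note (not part of this file): with CONFIG_PROC_FS enabled,
 * each network namespace gets a /proc/net/protocols file whose rows follow
 * the header printed by proto_seq_show() above, roughly
 *
 *	<protocol> <size> <sockets> <memory> <press> <maxhdr> <slab> <module>
 *	followed by one y/n flag per struct proto method,
 *
 * with one row per proto currently on proto_list.  Actual values depend on
 * the kernel configuration and on the protocols registered at read time.
 */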