atm: fix info leak via getsockname()
[linux/fpc-iii.git] / net/core/sock.c
blob eafa660832d70f0d3b193aa459cbd84a86d3c9d6
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
127 #include <linux/filter.h>
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
134 * Each address family might have different locking rules, so we have
135 * one slock key per address family:
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
141 * Make lock validator output more readable. (we pre-construct these
142 * strings build-time, so that runtime initialization of socket
143 * locks is fast):
145 static const char *const af_family_key_strings[AF_MAX+1] = {
146 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
147 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
148 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
149 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
153 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
156 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
157 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
158 "sk_lock-AF_IEEE802154",
159 "sk_lock-AF_MAX"
161 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
163 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
164 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
165 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
166 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
167 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
168 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
169 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
170 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
171 "slock-27" , "slock-28" , "slock-AF_CAN" ,
172 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
173 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
174 "slock-AF_IEEE802154",
175 "slock-AF_MAX"
177 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
179 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
180 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
181 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
182 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
183 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
184 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
185 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
186 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
187 "clock-27" , "clock-28" , "clock-AF_CAN" ,
188 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
189 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
190 "clock-AF_IEEE802154",
191 "clock-AF_MAX"
195 * sk_callback_lock locking rules are per-address-family,
196 * so split the lock classes by using a per-AF key:
198 static struct lock_class_key af_callback_keys[AF_MAX];
200 /* Take into consideration the size of the struct sk_buff overhead in the
201 * determination of these values, since that is non-constant across
202 * platforms. This makes socket queueing behavior and performance
203 * not depend upon such differences.
205 #define _SK_MEM_PACKETS 256
206 #define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
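/*
 * Worked example (hypothetical sizes, for orientation only): if
 * sizeof(struct sk_buff) were 256 bytes, _SK_MEM_OVERHEAD would be
 * 256 + 256 = 512 bytes, and SK_WMEM_MAX / SK_RMEM_MAX would come to
 * 512 * 256 = 131072 bytes (128 KiB). The real figure depends on the
 * platform's struct sk_buff size, which is why the overhead is folded
 * in here instead of hard-coding a byte count.
 */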
210 /* Run time adjustable parameters. */
211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
216 /* Maximal space eaten by iovec or ancillary data plus some space */
217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218 EXPORT_SYMBOL(sysctl_optmem_max);
220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
222 struct timeval tv;
224 if (optlen < sizeof(tv))
225 return -EINVAL;
226 if (copy_from_user(&tv, optval, sizeof(tv)))
227 return -EFAULT;
228 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229 return -EDOM;
231 if (tv.tv_sec < 0) {
232 static int warned __read_mostly;
234 *timeo_p = 0;
235 if (warned < 10 && net_ratelimit()) {
236 warned++;
237 printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238 "tries to set negative timeout\n",
239 current->comm, task_pid_nr(current));
241 return 0;
243 *timeo_p = MAX_SCHEDULE_TIMEOUT;
244 if (tv.tv_sec == 0 && tv.tv_usec == 0)
245 return 0;
246 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248 return 0;
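/*
 * Illustrative userspace call that ends up in sock_set_timeout() (sketch,
 * not part of this file; "fd" is a hypothetical socket descriptor):
 *
 *	struct timeval tv = { .tv_sec = 3, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * Per the checks above, tv_usec must lie in [0, USEC_PER_SEC), a negative
 * tv_sec is clamped to an immediate timeout, and {0, 0} selects
 * MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */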
251 static void sock_warn_obsolete_bsdism(const char *name)
253 static int warned;
254 static char warncomm[TASK_COMM_LEN];
255 if (strcmp(warncomm, current->comm) && warned < 5) {
256 strcpy(warncomm, current->comm);
257 printk(KERN_WARNING "process `%s' is using obsolete "
258 "%s SO_BSDCOMPAT\n", warncomm, name);
259 warned++;
263 static void sock_disable_timestamp(struct sock *sk, int flag)
265 if (sock_flag(sk, flag)) {
266 sock_reset_flag(sk, flag);
267 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268 !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269 net_disable_timestamp();
275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
277 int err = 0;
278 int skb_len;
280 /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
281 number of warnings when compiling with -W --ANK
283 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
284 (unsigned)sk->sk_rcvbuf) {
285 err = -ENOMEM;
286 goto out;
289 err = sk_filter(sk, skb);
290 if (err)
291 goto out;
293 if (!sk_rmem_schedule(sk, skb->truesize)) {
294 err = -ENOBUFS;
295 goto out;
298 skb->dev = NULL;
299 skb_set_owner_r(skb, sk);
301 /* Cache the SKB length before we tack it onto the receive
302 * queue. Once it is added it no longer belongs to us and
303 * may be freed by other threads of control pulling packets
304 * from the queue.
306 skb_len = skb->len;
308 skb_queue_tail(&sk->sk_receive_queue, skb);
310 if (!sock_flag(sk, SOCK_DEAD))
311 sk->sk_data_ready(sk, skb_len);
312 out:
313 return err;
315 EXPORT_SYMBOL(sock_queue_rcv_skb);
317 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
319 int rc = NET_RX_SUCCESS;
321 if (sk_filter(sk, skb))
322 goto discard_and_relse;
324 skb->dev = NULL;
326 if (nested)
327 bh_lock_sock_nested(sk);
328 else
329 bh_lock_sock(sk);
330 if (!sock_owned_by_user(sk)) {
332 * trylock + unlock semantics:
334 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
336 rc = sk_backlog_rcv(sk, skb);
338 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
339 } else
340 sk_add_backlog(sk, skb);
341 bh_unlock_sock(sk);
342 out:
343 sock_put(sk);
344 return rc;
345 discard_and_relse:
346 kfree_skb(skb);
347 goto out;
349 EXPORT_SYMBOL(sk_receive_skb);
351 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
353 struct dst_entry *dst = sk->sk_dst_cache;
355 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
356 sk->sk_dst_cache = NULL;
357 dst_release(dst);
358 return NULL;
361 return dst;
363 EXPORT_SYMBOL(__sk_dst_check);
365 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
367 struct dst_entry *dst = sk_dst_get(sk);
369 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
370 sk_dst_reset(sk);
371 dst_release(dst);
372 return NULL;
375 return dst;
377 EXPORT_SYMBOL(sk_dst_check);
379 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
381 int ret = -ENOPROTOOPT;
382 #ifdef CONFIG_NETDEVICES
383 struct net *net = sock_net(sk);
384 char devname[IFNAMSIZ];
385 int index;
387 /* Sorry... */
388 ret = -EPERM;
389 if (!capable(CAP_NET_RAW))
390 goto out;
392 ret = -EINVAL;
393 if (optlen < 0)
394 goto out;
396 /* Bind this socket to a particular device like "eth0",
397 * as specified in the passed interface name. If the
398 * name is "" or the option length is zero the socket
399 * is not bound.
401 if (optlen > IFNAMSIZ - 1)
402 optlen = IFNAMSIZ - 1;
403 memset(devname, 0, sizeof(devname));
405 ret = -EFAULT;
406 if (copy_from_user(devname, optval, optlen))
407 goto out;
409 if (devname[0] == '\0') {
410 index = 0;
411 } else {
412 struct net_device *dev = dev_get_by_name(net, devname);
414 ret = -ENODEV;
415 if (!dev)
416 goto out;
418 index = dev->ifindex;
419 dev_put(dev);
422 lock_sock(sk);
423 sk->sk_bound_dev_if = index;
424 sk_dst_reset(sk);
425 release_sock(sk);
427 ret = 0;
429 out:
430 #endif
432 return ret;
435 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
437 if (valbool)
438 sock_set_flag(sk, bit);
439 else
440 sock_reset_flag(sk, bit);
444 * This is meant for all protocols to use and covers goings on
445 * at the socket level. Everything here is generic.
448 int sock_setsockopt(struct socket *sock, int level, int optname,
449 char __user *optval, unsigned int optlen)
451 struct sock *sk = sock->sk;
452 int val;
453 int valbool;
454 struct linger ling;
455 int ret = 0;
458 * Options without arguments
461 if (optname == SO_BINDTODEVICE)
462 return sock_bindtodevice(sk, optval, optlen);
464 if (optlen < sizeof(int))
465 return -EINVAL;
467 if (get_user(val, (int __user *)optval))
468 return -EFAULT;
470 valbool = val ? 1 : 0;
472 lock_sock(sk);
474 switch (optname) {
475 case SO_DEBUG:
476 if (val && !capable(CAP_NET_ADMIN))
477 ret = -EACCES;
478 else
479 sock_valbool_flag(sk, SOCK_DBG, valbool);
480 break;
481 case SO_REUSEADDR:
482 sk->sk_reuse = valbool;
483 break;
484 case SO_TYPE:
485 case SO_PROTOCOL:
486 case SO_DOMAIN:
487 case SO_ERROR:
488 ret = -ENOPROTOOPT;
489 break;
490 case SO_DONTROUTE:
491 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
492 break;
493 case SO_BROADCAST:
494 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
495 break;
496 case SO_SNDBUF:
497 /* Don't error on this; BSD doesn't, and if you think
498 about it this is right. Otherwise apps have to
499 play 'guess the biggest size' games. RCVBUF/SNDBUF
500 are treated in BSD as hints */
502 if (val > sysctl_wmem_max)
503 val = sysctl_wmem_max;
504 set_sndbuf:
505 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
506 if ((val * 2) < SOCK_MIN_SNDBUF)
507 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
508 else
509 sk->sk_sndbuf = val * 2;
512 * Wake up sending tasks if we
513 * upped the value.
515 sk->sk_write_space(sk);
516 break;
518 case SO_SNDBUFFORCE:
519 if (!capable(CAP_NET_ADMIN)) {
520 ret = -EPERM;
521 break;
523 goto set_sndbuf;
525 case SO_RCVBUF:
526 /* Don't error on this; BSD doesn't, and if you think
527 about it this is right. Otherwise apps have to
528 play 'guess the biggest size' games. RCVBUF/SNDBUF
529 are treated in BSD as hints */
531 if (val > sysctl_rmem_max)
532 val = sysctl_rmem_max;
533 set_rcvbuf:
534 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
536 * We double it on the way in to account for
537 * "struct sk_buff" etc. overhead. Applications
538 * assume that the SO_RCVBUF setting they make will
539 * allow that much actual data to be received on that
540 * socket.
542 * Applications are unaware that "struct sk_buff" and
543 * other overheads allocate from the receive buffer
544 * during socket buffer allocation.
546 * And after considering the possible alternatives,
547 * returning the value we actually used in getsockopt
548 * is the most desirable behavior.
550 if ((val * 2) < SOCK_MIN_RCVBUF)
551 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
552 else
553 sk->sk_rcvbuf = val * 2;
554 break;
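/*
 * Example of the doubling described above (sketch; "fd" is a hypothetical
 * socket descriptor and net.core.rmem_max is assumed large enough):
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);  // val is now 131072
 *
 * The requested value is first capped at sysctl_rmem_max, then doubled
 * (never below SOCK_MIN_RCVBUF), and getsockopt() reports the doubled
 * value actually in use.
 */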
556 case SO_RCVBUFFORCE:
557 if (!capable(CAP_NET_ADMIN)) {
558 ret = -EPERM;
559 break;
561 goto set_rcvbuf;
563 case SO_KEEPALIVE:
564 #ifdef CONFIG_INET
565 if (sk->sk_protocol == IPPROTO_TCP &&
566 sk->sk_type == SOCK_STREAM)
567 tcp_set_keepalive(sk, valbool);
568 #endif
569 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
570 break;
572 case SO_OOBINLINE:
573 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
574 break;
576 case SO_NO_CHECK:
577 sk->sk_no_check = valbool;
578 break;
580 case SO_PRIORITY:
581 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
582 sk->sk_priority = val;
583 else
584 ret = -EPERM;
585 break;
587 case SO_LINGER:
588 if (optlen < sizeof(ling)) {
589 ret = -EINVAL; /* 1003.1g */
590 break;
592 if (copy_from_user(&ling, optval, sizeof(ling))) {
593 ret = -EFAULT;
594 break;
596 if (!ling.l_onoff)
597 sock_reset_flag(sk, SOCK_LINGER);
598 else {
599 #if (BITS_PER_LONG == 32)
600 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
601 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
602 else
603 #endif
604 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
605 sock_set_flag(sk, SOCK_LINGER);
607 break;
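/*
 * Typical userspace use of the SO_LINGER path above (sketch, not part of
 * this file; "fd" is a hypothetical connected socket):
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 *
 * With SOCK_LINGER set, sk_lingertime becomes l_linger * HZ jiffies and a
 * later close() may block up to that long while queued data drains.
 */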
609 case SO_BSDCOMPAT:
610 sock_warn_obsolete_bsdism("setsockopt");
611 break;
613 case SO_PASSCRED:
614 if (valbool)
615 set_bit(SOCK_PASSCRED, &sock->flags);
616 else
617 clear_bit(SOCK_PASSCRED, &sock->flags);
618 break;
620 case SO_TIMESTAMP:
621 case SO_TIMESTAMPNS:
622 if (valbool) {
623 if (optname == SO_TIMESTAMP)
624 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
625 else
626 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
627 sock_set_flag(sk, SOCK_RCVTSTAMP);
628 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
629 } else {
630 sock_reset_flag(sk, SOCK_RCVTSTAMP);
631 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
633 break;
635 case SO_TIMESTAMPING:
636 if (val & ~SOF_TIMESTAMPING_MASK) {
637 ret = -EINVAL;
638 break;
640 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
641 val & SOF_TIMESTAMPING_TX_HARDWARE);
642 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
643 val & SOF_TIMESTAMPING_TX_SOFTWARE);
644 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
645 val & SOF_TIMESTAMPING_RX_HARDWARE);
646 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
647 sock_enable_timestamp(sk,
648 SOCK_TIMESTAMPING_RX_SOFTWARE);
649 else
650 sock_disable_timestamp(sk,
651 SOCK_TIMESTAMPING_RX_SOFTWARE);
652 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
653 val & SOF_TIMESTAMPING_SOFTWARE);
654 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
655 val & SOF_TIMESTAMPING_SYS_HARDWARE);
656 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
657 val & SOF_TIMESTAMPING_RAW_HARDWARE);
658 break;
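/*
 * Illustrative SO_TIMESTAMPING request (sketch; "fd" is a hypothetical
 * socket descriptor):
 *
 *	int val = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
 *
 * Any bit outside SOF_TIMESTAMPING_MASK is rejected with -EINVAL above;
 * accepted flags are mirrored into the corresponding SOCK_TIMESTAMPING_*
 * socket flags, and software RX timestamps are then reported to userspace
 * via SCM_TIMESTAMPING control messages on recvmsg().
 */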
660 case SO_RCVLOWAT:
661 if (val < 0)
662 val = INT_MAX;
663 sk->sk_rcvlowat = val ? : 1;
664 break;
666 case SO_RCVTIMEO:
667 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
668 break;
670 case SO_SNDTIMEO:
671 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
672 break;
674 case SO_ATTACH_FILTER:
675 ret = -EINVAL;
676 if (optlen == sizeof(struct sock_fprog)) {
677 struct sock_fprog fprog;
679 ret = -EFAULT;
680 if (copy_from_user(&fprog, optval, sizeof(fprog)))
681 break;
683 ret = sk_attach_filter(&fprog, sk);
685 break;
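/*
 * Minimal classic-BPF attach that would exercise sk_attach_filter() above
 * (sketch; "fd" is a hypothetical socket descriptor). The single
 * "return 0xffffffff" instruction accepts every packet:
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */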
687 case SO_DETACH_FILTER:
688 ret = sk_detach_filter(sk);
689 break;
691 case SO_PASSSEC:
692 if (valbool)
693 set_bit(SOCK_PASSSEC, &sock->flags);
694 else
695 clear_bit(SOCK_PASSSEC, &sock->flags);
696 break;
697 case SO_MARK:
698 if (!capable(CAP_NET_ADMIN))
699 ret = -EPERM;
700 else
701 sk->sk_mark = val;
702 break;
704 /* We implement the SO_SNDLOWAT etc to
705 not be settable (1003.1g 5.3) */
706 default:
707 ret = -ENOPROTOOPT;
708 break;
710 release_sock(sk);
711 return ret;
713 EXPORT_SYMBOL(sock_setsockopt);
716 int sock_getsockopt(struct socket *sock, int level, int optname,
717 char __user *optval, int __user *optlen)
719 struct sock *sk = sock->sk;
721 union {
722 int val;
723 struct linger ling;
724 struct timeval tm;
725 } v;
727 unsigned int lv = sizeof(int);
728 int len;
730 if (get_user(len, optlen))
731 return -EFAULT;
732 if (len < 0)
733 return -EINVAL;
735 memset(&v, 0, sizeof(v));
737 switch (optname) {
738 case SO_DEBUG:
739 v.val = sock_flag(sk, SOCK_DBG);
740 break;
742 case SO_DONTROUTE:
743 v.val = sock_flag(sk, SOCK_LOCALROUTE);
744 break;
746 case SO_BROADCAST:
747 v.val = !!sock_flag(sk, SOCK_BROADCAST);
748 break;
750 case SO_SNDBUF:
751 v.val = sk->sk_sndbuf;
752 break;
754 case SO_RCVBUF:
755 v.val = sk->sk_rcvbuf;
756 break;
758 case SO_REUSEADDR:
759 v.val = sk->sk_reuse;
760 break;
762 case SO_KEEPALIVE:
763 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
764 break;
766 case SO_TYPE:
767 v.val = sk->sk_type;
768 break;
770 case SO_PROTOCOL:
771 v.val = sk->sk_protocol;
772 break;
774 case SO_DOMAIN:
775 v.val = sk->sk_family;
776 break;
778 case SO_ERROR:
779 v.val = -sock_error(sk);
780 if (v.val == 0)
781 v.val = xchg(&sk->sk_err_soft, 0);
782 break;
784 case SO_OOBINLINE:
785 v.val = !!sock_flag(sk, SOCK_URGINLINE);
786 break;
788 case SO_NO_CHECK:
789 v.val = sk->sk_no_check;
790 break;
792 case SO_PRIORITY:
793 v.val = sk->sk_priority;
794 break;
796 case SO_LINGER:
797 lv = sizeof(v.ling);
798 v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
799 v.ling.l_linger = sk->sk_lingertime / HZ;
800 break;
802 case SO_BSDCOMPAT:
803 sock_warn_obsolete_bsdism("getsockopt");
804 break;
806 case SO_TIMESTAMP:
807 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
808 !sock_flag(sk, SOCK_RCVTSTAMPNS);
809 break;
811 case SO_TIMESTAMPNS:
812 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
813 break;
815 case SO_TIMESTAMPING:
816 v.val = 0;
817 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
818 v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
819 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
820 v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
821 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
822 v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
823 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
824 v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
825 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
826 v.val |= SOF_TIMESTAMPING_SOFTWARE;
827 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
828 v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
829 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
830 v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
831 break;
833 case SO_RCVTIMEO:
834 lv = sizeof(struct timeval);
835 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
836 v.tm.tv_sec = 0;
837 v.tm.tv_usec = 0;
838 } else {
839 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
840 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
842 break;
844 case SO_SNDTIMEO:
845 lv = sizeof(struct timeval);
846 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
847 v.tm.tv_sec = 0;
848 v.tm.tv_usec = 0;
849 } else {
850 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
851 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
853 break;
855 case SO_RCVLOWAT:
856 v.val = sk->sk_rcvlowat;
857 break;
859 case SO_SNDLOWAT:
860 v.val = 1;
861 break;
863 case SO_PASSCRED:
864 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
865 break;
867 case SO_PEERCRED:
868 if (len > sizeof(sk->sk_peercred))
869 len = sizeof(sk->sk_peercred);
870 if (copy_to_user(optval, &sk->sk_peercred, len))
871 return -EFAULT;
872 goto lenout;
874 case SO_PEERNAME:
876 char address[128];
878 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
879 return -ENOTCONN;
880 if (lv < len)
881 return -EINVAL;
882 if (copy_to_user(optval, address, len))
883 return -EFAULT;
884 goto lenout;
887 /* Dubious BSD thing... Probably nobody even uses it, but
888 * the UNIX standard wants it for whatever reason... -DaveM
890 case SO_ACCEPTCONN:
891 v.val = sk->sk_state == TCP_LISTEN;
892 break;
894 case SO_PASSSEC:
895 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
896 break;
898 case SO_PEERSEC:
899 return security_socket_getpeersec_stream(sock, optval, optlen, len);
901 case SO_MARK:
902 v.val = sk->sk_mark;
903 break;
905 default:
906 return -ENOPROTOOPT;
909 if (len > lv)
910 len = lv;
911 if (copy_to_user(optval, &v, len))
912 return -EFAULT;
913 lenout:
914 if (put_user(len, optlen))
915 return -EFAULT;
916 return 0;
920 * Initialize an sk_lock.
922 * (We also register the sk_lock with the lock validator.)
924 static inline void sock_lock_init(struct sock *sk)
926 sock_lock_init_class_and_name(sk,
927 af_family_slock_key_strings[sk->sk_family],
928 af_family_slock_keys + sk->sk_family,
929 af_family_key_strings[sk->sk_family],
930 af_family_keys + sk->sk_family);
934 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
935 * even temporarily, because of RCU lookups. sk_node should also be left as is.
937 static void sock_copy(struct sock *nsk, const struct sock *osk)
939 #ifdef CONFIG_SECURITY_NETWORK
940 void *sptr = nsk->sk_security;
941 #endif
942 BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
943 sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
944 memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
945 osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
946 #ifdef CONFIG_SECURITY_NETWORK
947 nsk->sk_security = sptr;
948 security_sk_clone(osk, nsk);
949 #endif
952 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
953 int family)
955 struct sock *sk;
956 struct kmem_cache *slab;
958 slab = prot->slab;
959 if (slab != NULL) {
960 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
961 if (!sk)
962 return sk;
963 if (priority & __GFP_ZERO) {
965 * caches using SLAB_DESTROY_BY_RCU should leave
966 * sk_node.next unmodified. Special care is taken
967 * when initializing the object to zero.
969 if (offsetof(struct sock, sk_node.next) != 0)
970 memset(sk, 0, offsetof(struct sock, sk_node.next));
971 memset(&sk->sk_node.pprev, 0,
972 prot->obj_size - offsetof(struct sock,
973 sk_node.pprev));
976 else
977 sk = kmalloc(prot->obj_size, priority);
979 if (sk != NULL) {
980 kmemcheck_annotate_bitfield(sk, flags);
982 if (security_sk_alloc(sk, family, priority))
983 goto out_free;
985 if (!try_module_get(prot->owner))
986 goto out_free_sec;
989 return sk;
991 out_free_sec:
992 security_sk_free(sk);
993 out_free:
994 if (slab != NULL)
995 kmem_cache_free(slab, sk);
996 else
997 kfree(sk);
998 return NULL;
1001 static void sk_prot_free(struct proto *prot, struct sock *sk)
1003 struct kmem_cache *slab;
1004 struct module *owner;
1006 owner = prot->owner;
1007 slab = prot->slab;
1009 security_sk_free(sk);
1010 if (slab != NULL)
1011 kmem_cache_free(slab, sk);
1012 else
1013 kfree(sk);
1014 module_put(owner);
1018 * sk_alloc - All socket objects are allocated here
1019 * @net: the applicable net namespace
1020 * @family: protocol family
1021 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1022 * @prot: struct proto associated with this new sock instance
1024 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1025 struct proto *prot)
1027 struct sock *sk;
1029 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1030 if (sk) {
1031 sk->sk_family = family;
1033 * See comment in struct sock definition to understand
1034 * why we need sk_prot_creator -acme
1036 sk->sk_prot = sk->sk_prot_creator = prot;
1037 sock_lock_init(sk);
1038 sock_net_set(sk, get_net(net));
1039 atomic_set(&sk->sk_wmem_alloc, 1);
1042 return sk;
1044 EXPORT_SYMBOL(sk_alloc);
1046 static void __sk_free(struct sock *sk)
1048 struct sk_filter *filter;
1050 if (sk->sk_destruct)
1051 sk->sk_destruct(sk);
1053 filter = rcu_dereference(sk->sk_filter);
1054 if (filter) {
1055 sk_filter_uncharge(sk, filter);
1056 rcu_assign_pointer(sk->sk_filter, NULL);
1059 sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1060 sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1062 if (atomic_read(&sk->sk_omem_alloc))
1063 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1064 __func__, atomic_read(&sk->sk_omem_alloc));
1066 put_net(sock_net(sk));
1067 sk_prot_free(sk->sk_prot_creator, sk);
1070 void sk_free(struct sock *sk)
1073 * We subtract one from sk_wmem_alloc and can tell whether
1074 * some packets are still in some tx queue.
1075 * If the result is not zero, sock_wfree() will call __sk_free(sk) later
1077 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1078 __sk_free(sk);
1080 EXPORT_SYMBOL(sk_free);
1083 * The last sock_put should drop the reference to sk->sk_net. It has already
1084 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1085 * is not an option.
1086 * Take a reference to the socket to remove it from the hash while still _alive_
1087 * and after that destroy it in the context of init_net.
1089 void sk_release_kernel(struct sock *sk)
1091 if (sk == NULL || sk->sk_socket == NULL)
1092 return;
1094 sock_hold(sk);
1095 sock_release(sk->sk_socket);
1096 release_net(sock_net(sk));
1097 sock_net_set(sk, get_net(&init_net));
1098 sock_put(sk);
1100 EXPORT_SYMBOL(sk_release_kernel);
1102 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1104 struct sock *newsk;
1106 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1107 if (newsk != NULL) {
1108 struct sk_filter *filter;
1110 sock_copy(newsk, sk);
1112 /* SANITY */
1113 get_net(sock_net(newsk));
1114 sk_node_init(&newsk->sk_node);
1115 sock_lock_init(newsk);
1116 bh_lock_sock(newsk);
1117 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1119 atomic_set(&newsk->sk_rmem_alloc, 0);
1121 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1123 atomic_set(&newsk->sk_wmem_alloc, 1);
1124 atomic_set(&newsk->sk_omem_alloc, 0);
1125 skb_queue_head_init(&newsk->sk_receive_queue);
1126 skb_queue_head_init(&newsk->sk_write_queue);
1127 #ifdef CONFIG_NET_DMA
1128 skb_queue_head_init(&newsk->sk_async_wait_queue);
1129 #endif
1131 rwlock_init(&newsk->sk_dst_lock);
1132 rwlock_init(&newsk->sk_callback_lock);
1133 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1134 af_callback_keys + newsk->sk_family,
1135 af_family_clock_key_strings[newsk->sk_family]);
1137 newsk->sk_dst_cache = NULL;
1138 newsk->sk_wmem_queued = 0;
1139 newsk->sk_forward_alloc = 0;
1140 newsk->sk_send_head = NULL;
1141 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1143 sock_reset_flag(newsk, SOCK_DONE);
1144 skb_queue_head_init(&newsk->sk_error_queue);
1146 filter = newsk->sk_filter;
1147 if (filter != NULL)
1148 sk_filter_charge(newsk, filter);
1150 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1151 /* It is still raw copy of parent, so invalidate
1152 * destructor and make plain sk_free() */
1153 newsk->sk_destruct = NULL;
1154 sk_free(newsk);
1155 newsk = NULL;
1156 goto out;
1159 newsk->sk_err = 0;
1160 newsk->sk_priority = 0;
1162 * Before updating sk_refcnt, we must commit prior changes to memory
1163 * (Documentation/RCU/rculist_nulls.txt for details)
1165 smp_wmb();
1166 atomic_set(&newsk->sk_refcnt, 2);
1169 * Increment the counter in the same struct proto as the master
1170 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1171 * is the same as sk->sk_prot->socks, as this field was copied
1172 * with memcpy).
1174 * This _changes_ the previous behaviour, where
1175 * tcp_create_openreq_child always was incrementing the
1176 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1177 * to be taken into account in all callers. -acme
1179 sk_refcnt_debug_inc(newsk);
1180 sk_set_socket(newsk, NULL);
1181 newsk->sk_sleep = NULL;
1183 if (newsk->sk_prot->sockets_allocated)
1184 percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1186 if (sock_flag(newsk, SOCK_TIMESTAMP) ||
1187 sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1188 net_enable_timestamp();
1190 out:
1191 return newsk;
1193 EXPORT_SYMBOL_GPL(sk_clone);
1195 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1197 __sk_dst_set(sk, dst);
1198 sk->sk_route_caps = dst->dev->features;
1199 if (sk->sk_route_caps & NETIF_F_GSO)
1200 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1201 if (sk_can_gso(sk)) {
1202 if (dst->header_len) {
1203 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1204 } else {
1205 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1206 sk->sk_gso_max_size = dst->dev->gso_max_size;
1210 EXPORT_SYMBOL_GPL(sk_setup_caps);
1212 void __init sk_init(void)
1214 if (totalram_pages <= 4096) {
1215 sysctl_wmem_max = 32767;
1216 sysctl_rmem_max = 32767;
1217 sysctl_wmem_default = 32767;
1218 sysctl_rmem_default = 32767;
1219 } else if (totalram_pages >= 131072) {
1220 sysctl_wmem_max = 131071;
1221 sysctl_rmem_max = 131071;
1226 * Simple resource managers for sockets.
1231 * Write buffer destructor automatically called from kfree_skb.
1233 void sock_wfree(struct sk_buff *skb)
1235 struct sock *sk = skb->sk;
1236 unsigned int len = skb->truesize;
1238 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1240 * Keep a reference on sk_wmem_alloc; it will be released
1241 * after the sk_write_space() call
1243 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1244 sk->sk_write_space(sk);
1245 len = 1;
1248 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1249 * could not do because of in-flight packets
1251 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1252 __sk_free(sk);
1254 EXPORT_SYMBOL(sock_wfree);
1257 * Read buffer destructor automatically called from kfree_skb.
1259 void sock_rfree(struct sk_buff *skb)
1261 struct sock *sk = skb->sk;
1263 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1264 sk_mem_uncharge(skb->sk, skb->truesize);
1266 EXPORT_SYMBOL(sock_rfree);
1269 int sock_i_uid(struct sock *sk)
1271 int uid;
1273 read_lock(&sk->sk_callback_lock);
1274 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1275 read_unlock(&sk->sk_callback_lock);
1276 return uid;
1278 EXPORT_SYMBOL(sock_i_uid);
1280 unsigned long sock_i_ino(struct sock *sk)
1282 unsigned long ino;
1284 read_lock(&sk->sk_callback_lock);
1285 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1286 read_unlock(&sk->sk_callback_lock);
1287 return ino;
1289 EXPORT_SYMBOL(sock_i_ino);
1292 * Allocate a skb from the socket's send buffer.
1294 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1295 gfp_t priority)
1297 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1298 struct sk_buff *skb = alloc_skb(size, priority);
1299 if (skb) {
1300 skb_set_owner_w(skb, sk);
1301 return skb;
1304 return NULL;
1306 EXPORT_SYMBOL(sock_wmalloc);
1309 * Allocate a skb from the socket's receive buffer.
1311 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1312 gfp_t priority)
1314 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1315 struct sk_buff *skb = alloc_skb(size, priority);
1316 if (skb) {
1317 skb_set_owner_r(skb, sk);
1318 return skb;
1321 return NULL;
1325 * Allocate a memory block from the socket's option memory buffer.
1327 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1329 if ((unsigned)size <= sysctl_optmem_max &&
1330 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1331 void *mem;
1332 /* First do the add, to avoid the race if kmalloc
1333 * might sleep.
1335 atomic_add(size, &sk->sk_omem_alloc);
1336 mem = kmalloc(size, priority);
1337 if (mem)
1338 return mem;
1339 atomic_sub(size, &sk->sk_omem_alloc);
1341 return NULL;
1343 EXPORT_SYMBOL(sock_kmalloc);
1346 * Free an option memory block.
1348 void sock_kfree_s(struct sock *sk, void *mem, int size)
1350 kfree(mem);
1351 atomic_sub(size, &sk->sk_omem_alloc);
1353 EXPORT_SYMBOL(sock_kfree_s);
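/*
 * Typical in-kernel pairing of the two helpers above (sketch): a protocol
 * allocates per-socket option state with sock_kmalloc() and must free it
 * with sock_kfree_s() using the same size, so sk_omem_alloc stays balanced
 * and bounded by sysctl_optmem_max:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (opt) {
 *		...
 *		sock_kfree_s(sk, opt, len);
 *	}
 */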
1355 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1356 I think these locks should be removed for datagram sockets.
1358 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1360 DEFINE_WAIT(wait);
1362 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1363 for (;;) {
1364 if (!timeo)
1365 break;
1366 if (signal_pending(current))
1367 break;
1368 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1369 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1370 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1371 break;
1372 if (sk->sk_shutdown & SEND_SHUTDOWN)
1373 break;
1374 if (sk->sk_err)
1375 break;
1376 timeo = schedule_timeout(timeo);
1378 finish_wait(sk->sk_sleep, &wait);
1379 return timeo;
1384 * Generic send/receive buffer handlers
1387 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1388 unsigned long data_len, int noblock,
1389 int *errcode)
1391 struct sk_buff *skb;
1392 gfp_t gfp_mask;
1393 long timeo;
1394 int err;
1395 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1397 err = -EMSGSIZE;
1398 if (npages > MAX_SKB_FRAGS)
1399 goto failure;
1401 gfp_mask = sk->sk_allocation;
1402 if (gfp_mask & __GFP_WAIT)
1403 gfp_mask |= __GFP_REPEAT;
1405 timeo = sock_sndtimeo(sk, noblock);
1406 while (1) {
1407 err = sock_error(sk);
1408 if (err != 0)
1409 goto failure;
1411 err = -EPIPE;
1412 if (sk->sk_shutdown & SEND_SHUTDOWN)
1413 goto failure;
1415 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1416 skb = alloc_skb(header_len, gfp_mask);
1417 if (skb) {
1418 int i;
1420 /* No pages, we're done... */
1421 if (!data_len)
1422 break;
1424 skb->truesize += data_len;
1425 skb_shinfo(skb)->nr_frags = npages;
1426 for (i = 0; i < npages; i++) {
1427 struct page *page;
1428 skb_frag_t *frag;
1430 page = alloc_pages(sk->sk_allocation, 0);
1431 if (!page) {
1432 err = -ENOBUFS;
1433 skb_shinfo(skb)->nr_frags = i;
1434 kfree_skb(skb);
1435 goto failure;
1438 frag = &skb_shinfo(skb)->frags[i];
1439 frag->page = page;
1440 frag->page_offset = 0;
1441 frag->size = (data_len >= PAGE_SIZE ?
1442 PAGE_SIZE :
1443 data_len);
1444 data_len -= PAGE_SIZE;
1447 /* Full success... */
1448 break;
1450 err = -ENOBUFS;
1451 goto failure;
1453 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1454 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1455 err = -EAGAIN;
1456 if (!timeo)
1457 goto failure;
1458 if (signal_pending(current))
1459 goto interrupted;
1460 timeo = sock_wait_for_wmem(sk, timeo);
1463 skb_set_owner_w(skb, sk);
1464 return skb;
1466 interrupted:
1467 err = sock_intr_errno(timeo);
1468 failure:
1469 *errcode = err;
1470 return NULL;
1472 EXPORT_SYMBOL(sock_alloc_send_pskb);
1474 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1475 int noblock, int *errcode)
1477 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1479 EXPORT_SYMBOL(sock_alloc_send_skb);
1481 static void __lock_sock(struct sock *sk)
1483 DEFINE_WAIT(wait);
1485 for (;;) {
1486 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1487 TASK_UNINTERRUPTIBLE);
1488 spin_unlock_bh(&sk->sk_lock.slock);
1489 schedule();
1490 spin_lock_bh(&sk->sk_lock.slock);
1491 if (!sock_owned_by_user(sk))
1492 break;
1494 finish_wait(&sk->sk_lock.wq, &wait);
1497 static void __release_sock(struct sock *sk)
1499 struct sk_buff *skb = sk->sk_backlog.head;
1501 do {
1502 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1503 bh_unlock_sock(sk);
1505 do {
1506 struct sk_buff *next = skb->next;
1508 skb->next = NULL;
1509 sk_backlog_rcv(sk, skb);
1512 * We are in process context here with softirqs
1513 * disabled, use cond_resched_softirq() to preempt.
1514 * This is safe to do because we've taken the backlog
1515 * queue private:
1517 cond_resched_softirq();
1519 skb = next;
1520 } while (skb != NULL);
1522 bh_lock_sock(sk);
1523 } while ((skb = sk->sk_backlog.head) != NULL);
1527 * sk_wait_data - wait for data to arrive at sk_receive_queue
1528 * @sk: sock to wait on
1529 * @timeo: for how long
1531 * Now socket state including sk->sk_err is changed only under lock,
1532 * hence we may omit checks after joining wait queue.
1533 * We check receive queue before schedule() only as optimization;
1534 * it is very likely that release_sock() added new data.
1536 int sk_wait_data(struct sock *sk, long *timeo)
1538 int rc;
1539 DEFINE_WAIT(wait);
1541 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1542 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1543 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1544 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1545 finish_wait(sk->sk_sleep, &wait);
1546 return rc;
1548 EXPORT_SYMBOL(sk_wait_data);
1551 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1552 * @sk: socket
1553 * @size: memory size to allocate
1554 * @kind: allocation type
1556 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1557 * rmem allocation. This function assumes that protocols which have
1558 * memory_pressure use sk_wmem_queued as write buffer accounting.
1560 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1562 struct proto *prot = sk->sk_prot;
1563 int amt = sk_mem_pages(size);
1564 int allocated;
1566 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1567 allocated = atomic_add_return(amt, prot->memory_allocated);
1569 /* Under limit. */
1570 if (allocated <= prot->sysctl_mem[0]) {
1571 if (prot->memory_pressure && *prot->memory_pressure)
1572 *prot->memory_pressure = 0;
1573 return 1;
1576 /* Under pressure. */
1577 if (allocated > prot->sysctl_mem[1])
1578 if (prot->enter_memory_pressure)
1579 prot->enter_memory_pressure(sk);
1581 /* Over hard limit. */
1582 if (allocated > prot->sysctl_mem[2])
1583 goto suppress_allocation;
1585 /* guarantee minimum buffer size under pressure */
1586 if (kind == SK_MEM_RECV) {
1587 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1588 return 1;
1589 } else { /* SK_MEM_SEND */
1590 if (sk->sk_type == SOCK_STREAM) {
1591 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1592 return 1;
1593 } else if (atomic_read(&sk->sk_wmem_alloc) <
1594 prot->sysctl_wmem[0])
1595 return 1;
1598 if (prot->memory_pressure) {
1599 int alloc;
1601 if (!*prot->memory_pressure)
1602 return 1;
1603 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1604 if (prot->sysctl_mem[2] > alloc *
1605 sk_mem_pages(sk->sk_wmem_queued +
1606 atomic_read(&sk->sk_rmem_alloc) +
1607 sk->sk_forward_alloc))
1608 return 1;
1611 suppress_allocation:
1613 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1614 sk_stream_moderate_sndbuf(sk);
1616 /* Fail only if socket is _under_ its sndbuf.
1617 * In this case we cannot block, so that we have to fail.
1619 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1620 return 1;
1623 /* Alas. Undo changes. */
1624 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1625 atomic_sub(amt, prot->memory_allocated);
1626 return 0;
1628 EXPORT_SYMBOL(__sk_mem_schedule);
1631 * __sk_mem_reclaim - reclaim memory_allocated
1632 * @sk: socket
1634 void __sk_mem_reclaim(struct sock *sk)
1636 struct proto *prot = sk->sk_prot;
1638 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1639 prot->memory_allocated);
1640 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1642 if (prot->memory_pressure && *prot->memory_pressure &&
1643 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1644 *prot->memory_pressure = 0;
1646 EXPORT_SYMBOL(__sk_mem_reclaim);
1650 * Set of default routines for initialising struct proto_ops when
1651 * the protocol does not support a particular function. In certain
1652 * cases where it makes no sense for a protocol to have a "do nothing"
1653 * function, some default processing is provided.
1656 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1658 return -EOPNOTSUPP;
1660 EXPORT_SYMBOL(sock_no_bind);
1662 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1663 int len, int flags)
1665 return -EOPNOTSUPP;
1667 EXPORT_SYMBOL(sock_no_connect);
1669 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1671 return -EOPNOTSUPP;
1673 EXPORT_SYMBOL(sock_no_socketpair);
1675 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1677 return -EOPNOTSUPP;
1679 EXPORT_SYMBOL(sock_no_accept);
1681 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1682 int *len, int peer)
1684 return -EOPNOTSUPP;
1686 EXPORT_SYMBOL(sock_no_getname);
1688 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1690 return 0;
1692 EXPORT_SYMBOL(sock_no_poll);
1694 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1696 return -EOPNOTSUPP;
1698 EXPORT_SYMBOL(sock_no_ioctl);
1700 int sock_no_listen(struct socket *sock, int backlog)
1702 return -EOPNOTSUPP;
1704 EXPORT_SYMBOL(sock_no_listen);
1706 int sock_no_shutdown(struct socket *sock, int how)
1708 return -EOPNOTSUPP;
1710 EXPORT_SYMBOL(sock_no_shutdown);
1712 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1713 char __user *optval, unsigned int optlen)
1715 return -EOPNOTSUPP;
1717 EXPORT_SYMBOL(sock_no_setsockopt);
1719 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1720 char __user *optval, int __user *optlen)
1722 return -EOPNOTSUPP;
1724 EXPORT_SYMBOL(sock_no_getsockopt);
1726 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1727 size_t len)
1729 return -EOPNOTSUPP;
1731 EXPORT_SYMBOL(sock_no_sendmsg);
1733 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1734 size_t len, int flags)
1736 return -EOPNOTSUPP;
1738 EXPORT_SYMBOL(sock_no_recvmsg);
1740 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1742 /* Mirror missing mmap method error code */
1743 return -ENODEV;
1745 EXPORT_SYMBOL(sock_no_mmap);
1747 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1749 ssize_t res;
1750 struct msghdr msg = {.msg_flags = flags};
1751 struct kvec iov;
1752 char *kaddr = kmap(page);
1753 iov.iov_base = kaddr + offset;
1754 iov.iov_len = size;
1755 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1756 kunmap(page);
1757 return res;
1759 EXPORT_SYMBOL(sock_no_sendpage);
1762 * Default Socket Callbacks
1765 static void sock_def_wakeup(struct sock *sk)
1767 read_lock(&sk->sk_callback_lock);
1768 if (sk_has_sleeper(sk))
1769 wake_up_interruptible_all(sk->sk_sleep);
1770 read_unlock(&sk->sk_callback_lock);
1773 static void sock_def_error_report(struct sock *sk)
1775 read_lock(&sk->sk_callback_lock);
1776 if (sk_has_sleeper(sk))
1777 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1778 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1779 read_unlock(&sk->sk_callback_lock);
1782 static void sock_def_readable(struct sock *sk, int len)
1784 read_lock(&sk->sk_callback_lock);
1785 if (sk_has_sleeper(sk))
1786 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1787 POLLRDNORM | POLLRDBAND);
1788 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1789 read_unlock(&sk->sk_callback_lock);
1792 static void sock_def_write_space(struct sock *sk)
1794 read_lock(&sk->sk_callback_lock);
1796 /* Do not wake up a writer until he can make "significant"
1797 * progress. --DaveM
1799 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1800 if (sk_has_sleeper(sk))
1801 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1802 POLLWRNORM | POLLWRBAND);
1804 /* Should agree with poll, otherwise some programs break */
1805 if (sock_writeable(sk))
1806 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1809 read_unlock(&sk->sk_callback_lock);
1812 static void sock_def_destruct(struct sock *sk)
1814 kfree(sk->sk_protinfo);
1817 void sk_send_sigurg(struct sock *sk)
1819 if (sk->sk_socket && sk->sk_socket->file)
1820 if (send_sigurg(&sk->sk_socket->file->f_owner))
1821 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1823 EXPORT_SYMBOL(sk_send_sigurg);
1825 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1826 unsigned long expires)
1828 if (!mod_timer(timer, expires))
1829 sock_hold(sk);
1831 EXPORT_SYMBOL(sk_reset_timer);
1833 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1835 if (timer_pending(timer) && del_timer(timer))
1836 __sock_put(sk);
1838 EXPORT_SYMBOL(sk_stop_timer);
1840 void sock_init_data(struct socket *sock, struct sock *sk)
1842 skb_queue_head_init(&sk->sk_receive_queue);
1843 skb_queue_head_init(&sk->sk_write_queue);
1844 skb_queue_head_init(&sk->sk_error_queue);
1845 #ifdef CONFIG_NET_DMA
1846 skb_queue_head_init(&sk->sk_async_wait_queue);
1847 #endif
1849 sk->sk_send_head = NULL;
1851 init_timer(&sk->sk_timer);
1853 sk->sk_allocation = GFP_KERNEL;
1854 sk->sk_rcvbuf = sysctl_rmem_default;
1855 sk->sk_sndbuf = sysctl_wmem_default;
1856 sk->sk_state = TCP_CLOSE;
1857 sk_set_socket(sk, sock);
1859 sock_set_flag(sk, SOCK_ZAPPED);
1861 if (sock) {
1862 sk->sk_type = sock->type;
1863 sk->sk_sleep = &sock->wait;
1864 sock->sk = sk;
1865 } else
1866 sk->sk_sleep = NULL;
1868 rwlock_init(&sk->sk_dst_lock);
1869 rwlock_init(&sk->sk_callback_lock);
1870 lockdep_set_class_and_name(&sk->sk_callback_lock,
1871 af_callback_keys + sk->sk_family,
1872 af_family_clock_key_strings[sk->sk_family]);
1874 sk->sk_state_change = sock_def_wakeup;
1875 sk->sk_data_ready = sock_def_readable;
1876 sk->sk_write_space = sock_def_write_space;
1877 sk->sk_error_report = sock_def_error_report;
1878 sk->sk_destruct = sock_def_destruct;
1880 sk->sk_sndmsg_page = NULL;
1881 sk->sk_sndmsg_off = 0;
1883 sk->sk_peercred.pid = 0;
1884 sk->sk_peercred.uid = -1;
1885 sk->sk_peercred.gid = -1;
1886 sk->sk_write_pending = 0;
1887 sk->sk_rcvlowat = 1;
1888 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1889 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1891 sk->sk_stamp = ktime_set(-1L, 0);
1894 * Before updating sk_refcnt, we must commit prior changes to memory
1895 * (Documentation/RCU/rculist_nulls.txt for details)
1897 smp_wmb();
1898 atomic_set(&sk->sk_refcnt, 1);
1899 atomic_set(&sk->sk_drops, 0);
1901 EXPORT_SYMBOL(sock_init_data);
1903 void lock_sock_nested(struct sock *sk, int subclass)
1905 might_sleep();
1906 spin_lock_bh(&sk->sk_lock.slock);
1907 if (sk->sk_lock.owned)
1908 __lock_sock(sk);
1909 sk->sk_lock.owned = 1;
1910 spin_unlock(&sk->sk_lock.slock);
1912 * The sk_lock has mutex_lock() semantics here:
1914 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1915 local_bh_enable();
1917 EXPORT_SYMBOL(lock_sock_nested);
1919 void release_sock(struct sock *sk)
1922 * The sk_lock has mutex_unlock() semantics:
1924 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1926 spin_lock_bh(&sk->sk_lock.slock);
1927 if (sk->sk_backlog.tail)
1928 __release_sock(sk);
1929 sk->sk_lock.owned = 0;
1930 if (waitqueue_active(&sk->sk_lock.wq))
1931 wake_up(&sk->sk_lock.wq);
1932 spin_unlock_bh(&sk->sk_lock.slock);
1934 EXPORT_SYMBOL(release_sock);
1936 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1938 struct timeval tv;
1939 if (!sock_flag(sk, SOCK_TIMESTAMP))
1940 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1941 tv = ktime_to_timeval(sk->sk_stamp);
1942 if (tv.tv_sec == -1)
1943 return -ENOENT;
1944 if (tv.tv_sec == 0) {
1945 sk->sk_stamp = ktime_get_real();
1946 tv = ktime_to_timeval(sk->sk_stamp);
1948 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1950 EXPORT_SYMBOL(sock_get_timestamp);
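/*
 * sock_get_timestamp() backs the SIOCGSTAMP ioctl: after receiving a
 * packet, userspace can ask when it arrived (sketch; "fd" is a
 * hypothetical socket descriptor):
 *
 *	struct timeval tv;
 *	ioctl(fd, SIOCGSTAMP, &tv);
 *
 * If timestamping was not yet enabled it is switched on here, so a query
 * made before any packet has been stamped returns -ENOENT.
 */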
1952 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1954 struct timespec ts;
1955 if (!sock_flag(sk, SOCK_TIMESTAMP))
1956 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1957 ts = ktime_to_timespec(sk->sk_stamp);
1958 if (ts.tv_sec == -1)
1959 return -ENOENT;
1960 if (ts.tv_sec == 0) {
1961 sk->sk_stamp = ktime_get_real();
1962 ts = ktime_to_timespec(sk->sk_stamp);
1964 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1966 EXPORT_SYMBOL(sock_get_timestampns);
1968 void sock_enable_timestamp(struct sock *sk, int flag)
1970 if (!sock_flag(sk, flag)) {
1971 sock_set_flag(sk, flag);
1973 * we just set one of the two flags which require net
1974 * time stamping, but time stamping might have been on
1975 * already because of the other one
1977 if (!sock_flag(sk,
1978 flag == SOCK_TIMESTAMP ?
1979 SOCK_TIMESTAMPING_RX_SOFTWARE :
1980 SOCK_TIMESTAMP))
1981 net_enable_timestamp();
1986 * Get a socket option on a socket.
1988 * FIX: POSIX 1003.1g is very ambiguous here. It states that
1989 * asynchronous errors should be reported by getsockopt. We assume
1990 * this means if you specify SO_ERROR (otherwise what's the point of it).
1992 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1993 char __user *optval, int __user *optlen)
1995 struct sock *sk = sock->sk;
1997 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1999 EXPORT_SYMBOL(sock_common_getsockopt);
2001 #ifdef CONFIG_COMPAT
2002 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2003 char __user *optval, int __user *optlen)
2005 struct sock *sk = sock->sk;
2007 if (sk->sk_prot->compat_getsockopt != NULL)
2008 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2009 optval, optlen);
2010 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2012 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2013 #endif
2015 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2016 struct msghdr *msg, size_t size, int flags)
2018 struct sock *sk = sock->sk;
2019 int addr_len = 0;
2020 int err;
2022 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2023 flags & ~MSG_DONTWAIT, &addr_len);
2024 if (err >= 0)
2025 msg->msg_namelen = addr_len;
2026 return err;
2028 EXPORT_SYMBOL(sock_common_recvmsg);
2031 * Set socket options on an inet socket.
2033 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2034 char __user *optval, unsigned int optlen)
2036 struct sock *sk = sock->sk;
2038 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2040 EXPORT_SYMBOL(sock_common_setsockopt);
2042 #ifdef CONFIG_COMPAT
2043 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2044 char __user *optval, unsigned int optlen)
2046 struct sock *sk = sock->sk;
2048 if (sk->sk_prot->compat_setsockopt != NULL)
2049 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2050 optval, optlen);
2051 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2053 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2054 #endif
2056 void sk_common_release(struct sock *sk)
2058 if (sk->sk_prot->destroy)
2059 sk->sk_prot->destroy(sk);
2062 * Observation: when sk_common_release is called, processes have
2063 * no access to the socket, but the network stack still does.
2064 * Step one, detach it from networking:
2066 * A. Remove from hash tables.
2069 sk->sk_prot->unhash(sk);
2072 * At this point the socket cannot receive new packets, but it is possible
2073 * that some packets are in flight because some CPU is running the receiver and
2074 * did the hash table lookup before we unhashed the socket. They will reach the
2075 * receive queue and will be purged by the socket destructor.
2077 * Also we still have packets pending on the receive queue and probably
2078 * our own packets waiting in device queues. sock_destroy will drain the
2079 * receive queue, but transmitted packets will delay socket destruction
2080 * until the last reference is released.
2083 sock_orphan(sk);
2085 xfrm_sk_free_policy(sk);
2087 sk_refcnt_debug_release(sk);
2088 sock_put(sk);
2090 EXPORT_SYMBOL(sk_common_release);
2092 static DEFINE_RWLOCK(proto_list_lock);
2093 static LIST_HEAD(proto_list);
2095 #ifdef CONFIG_PROC_FS
2096 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2097 struct prot_inuse {
2098 int val[PROTO_INUSE_NR];
2101 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2103 #ifdef CONFIG_NET_NS
2104 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2106 int cpu = smp_processor_id();
2107 per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2109 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2111 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2113 int cpu, idx = prot->inuse_idx;
2114 int res = 0;
2116 for_each_possible_cpu(cpu)
2117 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2119 return res >= 0 ? res : 0;
2121 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
static int sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
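/*
 * Slot management for the inuse counters: proto_register() hands each
 * protocol the first free bit in proto_inuse_idx and proto_unregister()
 * releases it again.  The last index is reserved as a shared overflow slot:
 * it is never set, so every protocol registered after the bitmap fills up is
 * counted there, and release_proto_idx() deliberately never clears it.
 */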
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
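/*
 * proto_register() is what a protocol calls from its init path to plug its
 * struct proto into the core.  A rough usage sketch (the names below are
 * made up for illustration, not taken from any real protocol):
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 * Passing alloc_slab == 1 asks the core to create the sock slab cache (and,
 * when rsk_prot/twsk_prot are set, the request-sock and timewait-sock
 * caches) on the protocol's behalf.
 */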
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (prot->rsk_prot->slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_prot != NULL) {
			static const char mask[] = "tw_sock_%s";

			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_HWCACHE_ALIGN |
							prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	write_unlock(&proto_list_lock);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
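/*
 * proto_unregister() undoes proto_register(): it drops the protocol from the
 * registry, releases its inuse slot and destroys any slab caches that were
 * created for it.  Callers (typically a module's exit path) are expected to
 * have closed all sockets of the protocol first, since the caches are torn
 * down here.
 */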
void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);
	release_proto_idx(prot);
	list_del(&prot->node);
	write_unlock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(prot->rsk_prot->slab_name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
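/*
 * /proc/net/protocols: a seq_file that walks proto_list under the read lock
 * and prints one line per registered protocol, plus a 'y'/'n' flag for each
 * optional struct proto method.
 */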
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_lock)
{
	read_lock(&proto_list_lock);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_lock)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
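/*
 * The first call emits the header row; its abbreviated columns (cl, co, di,
 * ...) line up with the proto_method_implemented() flags printed above.
 */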
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
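/*
 * The proc entry is registered per network namespace, so each namespace gets
 * its own /proc/net/protocols view, created and removed by the pernet
 * init/exit callbacks below.
 */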
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	proc_net_remove(net, "protocols");
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */