/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Version:	$Id: sock.c,v 1.86 1999/09/01 08:11:49 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					cleanup.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <linux/ipsec.h>

#include <linux/filter.h>
#define min(a,b)	((a)<(b)?(a):(b))
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;
/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
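/*
 * These knobs are exported as net.core sysctls, visible as
 * /proc/sys/net/core/{wmem_max,rmem_max,wmem_default,rmem_default,
 * optmem_max}, so they can be retuned at run time. An illustrative
 * sketch (the value is just an example):
 *
 *	# echo 262144 > /proc/sys/net/core/wmem_max
 */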
/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	struct sock *sk = sock->sk;

	struct sk_filter *filter;
	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */

	if (optlen < sizeof(int))
		return(-EINVAL);

	err = get_user(val, (int *)optval);
		if (val && !capable(CAP_NET_ADMIN))

		sk->localroute = valbool;

		sk->broadcast = valbool;
		/* Don't error on this; BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;

		sk->sndbuf = max(val*2, 2048);

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		/* Don't error on this; BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;

		/* FIXME: is this lower bound the right one? */
		sk->rcvbuf = max(val*2, 256);
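		/*
		 * Userspace view (an illustrative sketch; 'fd' is a
		 * hypothetical socket descriptor):
		 *
		 *	int val = 65536;
		 *	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
		 *		       &val, sizeof(val)) < 0)
		 *		perror("SO_SNDBUF");
		 *
		 * The request is clamped to sysctl_wmem_max and doubled to
		 * leave room for sk_buff overhead, so a later
		 * getsockopt(SO_SNDBUF) may report a different figure than
		 * was asked for.
		 */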
		if (sk->protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);

		sk->keepopen = valbool;

		sk->urginline = valbool;

		sk->no_check = valbool;

		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))

		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */

		if (copy_from_user(&ling, optval, sizeof(ling)))

		sk->lingertime = ling.l_linger;

		sk->bsdism = valbool;

		sock->passcred = valbool;
#ifdef CONFIG_NETDEVICES
	case SO_BINDTODEVICE:
	{
		char devname[IFNAMSIZ];

		if (!capable(CAP_NET_RAW)) {

		/* Bind this socket to a particular device like "eth0",
		 * as specified in the passed interface name. If the
		 * name is "" or the option length is zero the socket
		 * is not bound.
		 */
		sk->bound_dev_if = 0;

		if (optlen > IFNAMSIZ)

		if (copy_from_user(devname, optval, optlen)) {

		/* Remove any cached route for this socket. */

		if (devname[0] == '\0') {
			sk->bound_dev_if = 0;
		} else {
			struct net_device *dev = dev_get_by_name(devname);

			sk->bound_dev_if = dev->ifindex;
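	/*
	 * Usage sketch (illustrative; the caller needs CAP_NET_RAW):
	 *
	 *	char ifname[IFNAMSIZ] = "eth0";
	 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
	 *		   ifname, sizeof(ifname));
	 *
	 * Passing an empty name (or a zero option length) unbinds the
	 * socket again, as the devname[0] == '\0' branch above shows.
	 */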
	case SO_ATTACH_FILTER:
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			if (copy_from_user(&fprog, optval, sizeof(fprog)))

			ret = sk_attach_filter(&fprog, sk);
		}

	case SO_DETACH_FILTER:
		spin_lock_bh(&sk->lock.slock);

			spin_unlock_bh(&sk->lock.slock);
			sk_filter_release(sk, filter);

		spin_unlock_bh(&sk->lock.slock);
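	/*
	 * Usage sketch (illustrative): attaching a trivial classic-BPF
	 * program that accepts every packet. The sock_filter/sock_fprog
	 * layout and the BPF_STMT helper come from <linux/filter.h>:
	 *
	 *	struct sock_filter code[] = {
	 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
	 *	};
	 *	struct sock_fprog fprog = { 1, code };
	 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
	 *		   &fprog, sizeof(fprog));
	 *
	 * SO_DETACH_FILTER needs no argument beyond the option itself.
	 */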
		/* We implement SO_SNDLOWAT etc. as not settable
		   (1003.1g 5.3) */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	struct sock *sk = sock->sk;

	int lv = sizeof(int), len;

	if (get_user(len, optlen))

		v.val = sk->localroute;

		v.val = sk->broadcast;

		v.val = sk->keepopen;

		v.val = -sock_error(sk);

			v.val = xchg(&sk->err_soft, 0);

		v.val = sk->urginline;

		v.val = sk->no_check;

		v.val = sk->priority;

		v.ling.l_onoff = sk->linger;
		v.ling.l_linger = sk->lingertime;

		lv = sizeof(struct timeval);

		v.val = sock->passcred;

		lv = sizeof(sk->peercred);
		if (copy_to_user((void*)optval, &sk->peercred, len))

		return(-ENOPROTOOPT);

	if (copy_to_user(optval, &v, len))

	if (put_user(len, optlen))
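/*
 * Usage sketch (illustrative): the classic way to harvest the result of
 * a non-blocking connect() once select()/poll() reports the socket
 * writable:
 *
 *	int err = 0, len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *
 * A non-zero 'err' is the pending errno; fetching it also clears it,
 * which is exactly what the sock_error()/xchg() pair above does.
 */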
static kmem_cache_t *sk_cachep;
/*
 *	All socket objects are allocated here. This is for future
 *	usage.
 */
struct sock *sk_alloc(int family, int priority, int zero_it)
{
	struct sock *sk = kmem_cache_alloc(sk_cachep, priority);

		memset(sk, 0, sizeof(struct sock));

void sk_free(struct sock *sk)
{
	struct sk_filter *filter;

		sk_filter_release(sk, filter);

	if (atomic_read(&sk->omem_alloc))
		printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n",
		       atomic_read(&sk->omem_alloc));

	kmem_cache_free(sk_cachep, sk);
}

void __init sk_init(void)
{
	sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
				      SLAB_HWCACHE_ALIGN, 0, 0);
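/*
 * Lifecycle sketch (illustrative): sockets come from and return to the
 * "sock" slab cache created above, so allocation and free are cheap and
 * cache-aligned:
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, 1);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	...
 *	sk_free(sk);
 */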
/*
 *	Simple resource managers for sockets.
 */

/*
 *	Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->wmem_alloc);

/*
 *	Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->rmem_alloc);

void sock_cfree(struct sk_buff *skb)

/*
 *	Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     int priority)
{
	if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);

			skb_set_owner_w(skb, sk);

/*
 *	Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     int priority)
{
	if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);

			skb_set_owner_r(skb, sk);
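/*
 * Accounting sketch (illustrative): an skb obtained here is charged
 * against the socket by skb_set_owner_w(), and sock_wfree() drops the
 * charge when the skb finally dies:
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, len, 0, GFP_KERNEL);
 *	if (skb != NULL)
 *		kfree_skb(skb);
 *
 * The kfree_skb() ends up in sock_wfree(), which subtracts
 * skb->truesize; sock_rmalloc()/sock_rfree() mirror this for the
 * receive direction.
 */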
/*
 *	Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {

		/* First do the add, to avoid the race if kmalloc
		   might sleep.
		 */
		atomic_add(size, &sk->omem_alloc);
		mem = kmalloc(size, priority);

		atomic_sub(size, &sk->omem_alloc);

/*
 *	Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	atomic_sub(size, &sk->omem_alloc);
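/*
 * Usage sketch (illustrative; 'struct my_opt' is a hypothetical
 * per-socket option blob):
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * Note that the size must be handed back on free so that omem_alloc
 * stays balanced against sysctl_optmem_max.
 */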
/* FIXME: this is insane. We are supposed to be controlling how
 * much space we have for data bytes, not packet headers.
 * This really points out that we need a better system for doing the
 * receive buffer. -- erics
 * WARNING: This is currently ONLY used in tcp. If you need it elsewhere
 * this will probably not be what you want. Possibly these two routines
 * should move over to the ipv4 directory.
 */
unsigned long sock_rspace(struct sock *sk)
{
	/* This used to have some bizarre complications in an
	 * attempt to reserve some amount of space. This doesn't
	 * make sense, since the number returned here does not
	 * actually reflect allocated space, but rather the amount
	 * of space we committed to. We gamble that we won't
	 * run out of memory, and returning a smaller number does
	 * not change the gamble. If we lose the gamble tcp still
	 * works, it may just slow down for retransmissions.
	 */
	amt = sk->rcvbuf - atomic_read(&sk->rmem_alloc);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static void sock_wait_for_wmem(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);

	sk->socket->flags &= ~SO_NOSPACE;
	add_wait_queue(sk->sleep, &wait);

		if (signal_pending(current))

		set_current_state(TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)

		if (sk->shutdown & SEND_SHUTDOWN)

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);
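/*
 * For reference, the function above is an instance of the canonical
 * wait-queue sleep pattern (a sketch, not a drop-in):
 *
 *	add_wait_queue(q, &wait);
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition || signal_pending(current))
 *			break;
 *		schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 *	remove_wait_queue(q, &wait);
 *
 * Setting the task state before testing the condition is what closes
 * the race against the waker.
 */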
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    unsigned long fallback, int noblock,
				    int *errcode)
{
	unsigned long try_size = size;

	err = sock_error(sk);

	/*
	 *	We should send SIGPIPE in these cases according to
	 *	1003.1g draft 6.4. If we (the user) did a shutdown()
	 *	call however we should not.
	 *
	 *	Note: This routine isn't just used for datagrams and
	 *	anyway some datagram protocols have a notion of
	 *	close down.
	 */

	if (sk->shutdown & SEND_SHUTDOWN)

		/* The buffer get won't block, or use the atomic queue.
		 * It does still produce annoying "no free page" messages.
		 */
		skb = sock_wmalloc(sk, size, 0, GFP_BUFFER);

		skb = sock_wmalloc(sk, try_size, 0, sk->allocation);

		/*
		 *	This means we have too many buffers for this socket already.
		 */
		sk->socket->flags |= SO_NOSPACE;

		if (signal_pending(current))

		sock_wait_for_wmem(sk);
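/*
 * Usage sketch (illustrative): a datagram protocol's sendmsg path would
 * typically obtain its buffer like this:
 *
 *	int err;
 *	struct sk_buff *skb = sock_alloc_send_skb(sk, len, 0,
 *				msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		return err;
 *
 * With noblock set the call fails fast with -EAGAIN instead of
 * sleeping in sock_wait_for_wmem().
 */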
void __lock_sock(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sk->lock.wq, &wait);

		current->state = TASK_EXCLUSIVE | TASK_UNINTERRUPTIBLE;
		spin_unlock_bh(&sk->lock.slock);
		schedule();
		spin_lock_bh(&sk->lock.slock);

	current->state = TASK_RUNNING;
	remove_wait_queue(&sk->lock.wq, &wait);
}

void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->backlog.head;

	do {
		struct sk_buff *next = skb->next;

		sk->backlog_rcv(sk, skb);
		skb = next;
	} while (skb != NULL);

	sk->backlog.head = sk->backlog.tail = NULL;
}
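/*
 * Locking contract sketch: process context takes the socket lock via
 * lock_sock(), softirq input that races with it is parked on
 * sk->backlog, and release_sock() calls __release_sock() to replay that
 * backlog through sk->backlog_rcv() before anyone else gets in:
 *
 *	lock_sock(sk);
 *	... touch socket state safely ...
 *	release_sock(sk);
 */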
/*
 *	Generic socket manager library. Most of the simpler socket families
 *	use this to manage their socket lists. At some point we should
 *	hash these. By making this generic we get the lot hashed for free.
 *
 *	It is broken by design. All the protocols using it must be fixed. --ANK
 */
rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;

void sklist_remove_socket(struct sock **list, struct sock *sk)
{
	write_lock_bh(&net_big_sklist_lock);

		write_unlock_bh(&net_big_sklist_lock);

	write_unlock_bh(&net_big_sklist_lock);
}

void sklist_insert_socket(struct sock **list, struct sock *sk)
{
	write_lock_bh(&net_big_sklist_lock);

	write_unlock_bh(&net_big_sklist_lock);
}

/*
 *	This is only called from user mode. Thus it protects itself against
 *	interrupt users but doesn't worry about being called during work.
 *	Once it is removed from the queue no interrupt or bottom half will
 *	touch it and we are (fairly 8-) ) safe.
 */
void sklist_destroy_socket(struct sock **list, struct sock *sk);

/*
 *	Handler for deferred kills.
 */

static void sklist_destroy_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	sklist_destroy_socket(NULL, sk);
}

/*
 *	Destroy a socket. We pass NULL for a list if we know the
 *	socket is not on a list.
 */

void sklist_destroy_socket(struct sock **list, struct sock *sk)
{
	sklist_remove_socket(list, sk);

	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)

	if (atomic_read(&sk->wmem_alloc) == 0 &&
	    atomic_read(&sk->rmem_alloc) == 0 &&

		/*
		 *	Someone is still using our buffers; defer the kill.
		 */
		init_timer(&sk->timer);
		sk->timer.expires = jiffies + SOCK_DESTROY_TIME;
		sk->timer.function = sklist_destroy_timer;
		sk->timer.data = (unsigned long)sk;
		add_timer(&sk->timer);
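/*
 * Usage sketch (illustrative; 'my_proto_list' is a hypothetical list
 * head owned by the protocol):
 *
 *	sklist_destroy_socket(&my_proto_list, sk);
 *
 * If buffers are still charged to the socket, the kill is deferred via
 * the timer above and retried after SOCK_DESTROY_TIME jiffies, by which
 * time sock_wfree()/sock_rfree() should have drained the accounting.
 */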
/*
 *	Set of default routines for initialising struct proto_ops when
 *	the protocol does not support a particular function. In certain
 *	cases where it makes no sense for a protocol to have a "do nothing"
 *	function, some default processing is provided.
 */
int sock_no_release(struct socket *sock)

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)

int sock_no_listen(struct socket *sock, int backlog)

int sock_no_shutdown(struct socket *sock, int how)

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char *optval, int optlen)

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char *optval, int *optlen)

/*
 * Note: if you add something that sleeps here then change sock_fcntl()
 *       to do proper fd locking.
 */
int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;

			/*
			 * This is a little restrictive, but it's the only
			 * way to make sure that you can't send a sigurg to
			 * another process.
			 */
			if (current->pgrp != -arg &&
			    current->pid != arg &&
			    !capable(CAP_KILL)) return(-EPERM);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
		    struct scm_cookie *scm)

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags,
		    struct scm_cookie *scm)

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
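/*
 * Illustrative proto_ops table for a hypothetical 'PF_EXAMPLE' family,
 * showing how a minimal protocol plugs unsupported operations with the
 * stubs above (gcc's old-style labelled initialisers, as used elsewhere
 * in kernels of this era):
 *
 *	struct proto_ops example_ops = {
 *		family:		PF_EXAMPLE,
 *		release:	example_release,
 *		bind:		example_bind,
 *		connect:	sock_no_connect,
 *		socketpair:	sock_no_socketpair,
 *		accept:		sock_no_accept,
 *		ioctl:		sock_no_ioctl,
 *		mmap:		sock_no_mmap,
 *	};
 */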
/*
 *	Default Socket Callbacks
 */

void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->callback_lock);

		wake_up_interruptible(sk->sleep);
	read_unlock(&sk->callback_lock);
}

void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->callback_lock);

		wake_up_interruptible(sk->sleep);
	sock_wake_async(sk->socket, 0, POLL_ERR);

	read_unlock(&sk->callback_lock);
}

void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->callback_lock);

		wake_up_interruptible(sk->sleep);
	sock_wake_async(sk->socket, 1, POLL_IN);

	read_unlock(&sk->callback_lock);
}

void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((sk->sleep && waitqueue_active(sk->sleep)) &&
	    ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) {
		wake_up_interruptible(sk->sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sock_wake_async(sk->socket, 2, POLL_OUT);
	}

	read_unlock(&sk->callback_lock);
}
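/*
 * Worked example of the threshold above: with sndbuf at, say, 64KB,
 * (wmem_alloc << 1) <= sndbuf first holds once committed write memory
 * has fallen to 32KB, i.e. the writer is only woken when at least half
 * the send buffer is free again, avoiding a wakeup per freed skb.
 */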
void sock_def_destruct(struct sock *sk)
{
	if (sk->protinfo.destruct_hook)
		kfree(sk->protinfo.destruct_hook);
}

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->receive_queue);
	skb_queue_head_init(&sk->write_queue);
	skb_queue_head_init(&sk->error_queue);

	spin_lock_init(&sk->timer_lock);
	init_timer(&sk->timer);

	sk->allocation = GFP_KERNEL;
	sk->rcvbuf = sysctl_rmem_default;
	sk->sndbuf = sysctl_wmem_default;
	sk->state = TCP_CLOSE;

		sk->type = sock->type;
		sk->sleep = &sock->wait;

	sk->callback_lock = RW_LOCK_UNLOCKED;

	sk->state_change = sock_def_wakeup;
	sk->data_ready = sock_def_readable;
	sk->write_space = sock_def_write_space;
	sk->error_report = sock_def_error_report;
	sk->destruct = sock_def_destruct;

	sk->peercred.pid = 0;
	sk->peercred.uid = -1;
	sk->peercred.gid = -1;

	atomic_set(&sk->refcnt, 1);
}
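/*
 * Usage sketch (illustrative): a family's create routine wires the two
 * objects together with sock_init_data() and may then override the
 * defaults installed above ('my_data_ready' is hypothetical):
 *
 *	sk = sk_alloc(PF_INET, GFP_KERNEL, 1);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->data_ready = my_data_ready;
 */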