net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #include <linux/capability.h>
  93 #include <linux/errno.h>
  94 #include <linux/types.h>
  95 #include <linux/socket.h>
  96 #include <linux/in.h>
  97 #include <linux/kernel.h>
  98 #include <linux/module.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/sched.h>
 102 #include <linux/timer.h>
 103 #include <linux/string.h>
 104 #include <linux/sockios.h>
 105 #include <linux/net.h>
 106 #include <linux/mm.h>
 107 #include <linux/slab.h>
 108 #include <linux/interrupt.h>
 109 #include <linux/poll.h>
 110 #include <linux/tcp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113
 114 #include <asm/uaccess.h>
 115 #include <asm/system.h>
 116
 117 #include <linux/netdevice.h>
 118 #include <net/protocol.h>
 119 #include <linux/skbuff.h>
 120 #include <net/net_namespace.h>
 121 #include <net/request_sock.h>
 122 #include <net/sock.h>
 123 #include <linux/net_tstamp.h>
 124 #include <net/xfrm.h>
 125 #include <linux/ipsec.h>
 126
 127 #include <linux/filter.h>
 128
 129 #ifdef CONFIG_INET
 130 #include <net/tcp.h>
 131 #endif
 132
  133 /*
  134  * Each address family might have different locking rules, so we have
  135  * one lock class key per address family.  Two such arrays are kept,
       * each with AF_MAX entries: af_family_keys and af_family_slock_keys.
  136  */
  137 static struct lock_class_key af_family_keys[AF_MAX];
  138 static struct lock_class_key af_family_slock_keys[AF_MAX];
 139
  140 /*
  141  * Make lock validator output more readable. (We pre-construct these
  142  * strings at build time, so that runtime initialization of socket
  143  * locks is fast.)
  144  */
 145 static const char *af_family_key_strings[AF_MAX+1] = {
 146   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 147   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 148   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 149   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 150   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 151   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 152   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 153   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 154   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 155   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 156   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 157   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 158   "sk_lock-AF_IEEE802154",
 159   "sk_lock-AF_MAX"
 160 };
 161 static const char *af_family_slock_key_strings[AF_MAX+1] = {
 162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 169   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 173   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 174   "slock-AF_IEEE802154",
 175   "slock-AF_MAX"
 176 };
 177 static const char *af_family_clock_key_strings[AF_MAX+1] = {
 178   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 179   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 180   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 181   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 182   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 183   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 184   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 185   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 186   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 187   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 188   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 189   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 190   "clock-AF_IEEE802154",
 191   "clock-AF_MAX"
 192 };
 193
  194 /*
  195  * sk_callback_lock locking rules are per-address-family,
  196  * so split the lock classes by using a per-AF key
       * (one lock_class_key per family, AF_MAX entries in total):
  197  */
  198 static struct lock_class_key af_callback_keys[AF_MAX];
 199
  200 /* Take into consideration the size of the struct sk_buff overhead in the
  201  * determination of these values, since that is non-constant across
  202  * platforms.  This makes socket queueing behavior and performance
  203  * not depend upon such differences.
       *
       * _SK_MEM_PACKETS  - packet count the limits below are sized for.
       * _SK_MEM_OVERHEAD - per-packet cost: sizeof(struct sk_buff) plus
       *                    256 bytes.
       * SK_WMEM_MAX and SK_RMEM_MAX are both _SK_MEM_OVERHEAD multiplied
       * by _SK_MEM_PACKETS; they initialise the wmem/rmem sysctl variables
       * defined right after these macros.
  204  */
  205 #define _SK_MEM_PACKETS         256
  206 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
  207 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  208 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 209
  210 /* Run-time adjustable parameters; each default starts out equal to its maximum. */
  211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
  212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
  213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
  214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
  215
  216 /* Maximal space eaten by iovec or ancillary data plus some space */
  217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
  218 EXPORT_SYMBOL(sysctl_optmem_max);
 219
 220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 221 {
 222         struct timeval tv;
 223
 224         if (optlen < sizeof(tv))
 225                 return -EINVAL;
 226         if (copy_from_user(&tv, optval, sizeof(tv)))
 227                 return -EFAULT;
 228         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 229                 return -EDOM;
 230
 231         if (tv.tv_sec < 0) {
 232                 static int warned __read_mostly;
 233
 234                 *timeo_p = 0;
 235                 if (warned < 10 && net_ratelimit()) {
 236                         warned++;
 237                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 238                                "tries to set negative timeout\n",
 239                                 current->comm, task_pid_nr(current));
 240                 }
 241                 return 0;
 242         }
 243         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 244         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 245                 return 0;
 246         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 247                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 248         return 0;
 249 }
 250
 251 static void sock_warn_obsolete_bsdism(const char *name)
 252 {
 253         static int warned;
 254         static char warncomm[TASK_COMM_LEN];
 255         if (strcmp(warncomm, current->comm) && warned < 5) {
 256                 strcpy(warncomm,  current->comm);
 257                 printk(KERN_WARNING "process `%s' is using obsolete "
 258                        "%s SO_BSDCOMPAT\n", warncomm, name);
 259                 warned++;
 260         }
 261 }
 262
 263 static void sock_disable_timestamp(struct sock *sk, int flag)
 264 {
 265         if (sock_flag(sk, flag)) {
 266                 sock_reset_flag(sk, flag);
 267                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
 268                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
 269                         net_disable_timestamp();
 270                 }
 271         }
 272 }
 273
 274
 275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 276 {
 277         int err = 0;
 278         int skb_len;
 279
 280         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 281            number of warnings when compiling with -W --ANK
 282          */
 283         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 284             (unsigned)sk->sk_rcvbuf) {
 285                 err = -ENOMEM;
 286                 goto out;
 287         }
 288
 289         err = sk_filter(sk, skb);
 290         if (err)
 291                 goto out;
 292
 293         if (!sk_rmem_schedule(sk, skb->truesize)) {
 294                 err = -ENOBUFS;
 295                 goto out;
 296         }
 297
 298         skb->dev = NULL;
 299         skb_set_owner_r(skb, sk);
 300
 301         /* Cache the SKB length before we tack it onto the receive
 302          * queue.  Once it is added it no longer belongs to us and
 303          * may be freed by other threads of control pulling packets
 304          * from the queue.
 305          */
 306         skb_len = skb->len;
 307
 308         skb_queue_tail(&sk->sk_receive_queue, skb);
 309
 310         if (!sock_flag(sk, SOCK_DEAD))
 311                 sk->sk_data_ready(sk, skb_len);
 312 out:
 313         return err;
 314 }
 315 EXPORT_SYMBOL(sock_queue_rcv_skb);
 316
 317 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 318 {
 319         int rc = NET_RX_SUCCESS;
 320
 321         if (sk_filter(sk, skb))
 322                 goto discard_and_relse;
 323
 324         skb->dev = NULL;
 325
 326         if (nested)
 327                 bh_lock_sock_nested(sk);
 328         else
 329                 bh_lock_sock(sk);
 330         if (!sock_owned_by_user(sk)) {
 331                 /*
 332                  * trylock + unlock semantics:
 333                  */
 334                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 335
 336                 rc = sk_backlog_rcv(sk, skb);
 337
 338                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 339         } else
 340                 sk_add_backlog(sk, skb);
 341         bh_unlock_sock(sk);
 342 out:
 343         sock_put(sk);
 344         return rc;
 345 discard_and_relse:
 346         kfree_skb(skb);
 347         goto out;
 348 }
 349 EXPORT_SYMBOL(sk_receive_skb);
 350
 351 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 352 {
 353         struct dst_entry *dst = sk->sk_dst_cache;
 354
 355         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 356                 sk->sk_dst_cache = NULL;
 357                 dst_release(dst);
 358                 return NULL;
 359         }
 360
 361         return dst;
 362 }
 363 EXPORT_SYMBOL(__sk_dst_check);
 364
 365 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 366 {
 367         struct dst_entry *dst = sk_dst_get(sk);
 368
 369         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 370                 sk_dst_reset(sk);
 371                 dst_release(dst);
 372                 return NULL;
 373         }
 374
 375         return dst;
 376 }
 377 EXPORT_SYMBOL(sk_dst_check);
 378
 379 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 380 {
 381         int ret = -ENOPROTOOPT;
 382 #ifdef CONFIG_NETDEVICES
 383         struct net *net = sock_net(sk);
 384         char devname[IFNAMSIZ];
 385         int index;
 386
 387         /* Sorry... */
 388         ret = -EPERM;
 389         if (!capable(CAP_NET_RAW))
 390                 goto out;
 391
 392         ret = -EINVAL;
 393         if (optlen < 0)
 394                 goto out;
 395
 396         /* Bind this socket to a particular device like "eth0",
 397          * as specified in the passed interface name. If the
 398          * name is "" or the option length is zero the socket
 399          * is not bound.
 400          */
 401         if (optlen > IFNAMSIZ - 1)
 402                 optlen = IFNAMSIZ - 1;
 403         memset(devname, 0, sizeof(devname));
 404
 405         ret = -EFAULT;
 406         if (copy_from_user(devname, optval, optlen))
 407                 goto out;
 408
 409         if (devname[0] == '\0') {
 410                 index = 0;
 411         } else {
 412                 struct net_device *dev = dev_get_by_name(net, devname);
 413
 414                 ret = -ENODEV;
 415                 if (!dev)
 416                         goto out;
 417
 418                 index = dev->ifindex;
 419                 dev_put(dev);
 420         }
 421
 422         lock_sock(sk);
 423         sk->sk_bound_dev_if = index;
 424         sk_dst_reset(sk);
 425         release_sock(sk);
 426
 427         ret = 0;
 428
 429 out:
 430 #endif
 431
 432         return ret;
 433 }
 434
 435 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 436 {
 437         if (valbool)
 438                 sock_set_flag(sk, bit);
 439         else
 440                 sock_reset_flag(sk, bit);
 441 }
 442
 443 /*
 444  *      This is meant for all protocols to use and covers goings on
 445  *      at the socket level. Everything here is generic.
 446  */
 447
 448 int sock_setsockopt(struct socket *sock, int level, int optname,
 449                     char __user *optval, int optlen)
 450 {
 451         struct sock *sk = sock->sk;
 452         int val;
 453         int valbool;
 454         struct linger ling;
 455         int ret = 0;
 456
 457         /*
 458          *      Options without arguments
 459          */
 460
 461         if (optname == SO_BINDTODEVICE)
 462                 return sock_bindtodevice(sk, optval, optlen);
 463
 464         if (optlen < sizeof(int))
 465                 return -EINVAL;
 466
 467         if (get_user(val, (int __user *)optval))
 468                 return -EFAULT;
 469
 470         valbool = val ? 1 : 0;
 471
 472         lock_sock(sk);
 473
 474         switch (optname) {
 475         case SO_DEBUG:
 476                 if (val && !capable(CAP_NET_ADMIN))
 477                         ret = -EACCES;
 478                 else
 479                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 480                 break;
 481         case SO_REUSEADDR:
 482                 sk->sk_reuse = valbool;
 483                 break;
 484         case SO_TYPE:
 485         case SO_ERROR:
 486                 ret = -ENOPROTOOPT;
 487                 break;
 488         case SO_DONTROUTE:
 489                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 490                 break;
 491         case SO_BROADCAST:
 492                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 493                 break;
 494         case SO_SNDBUF:
 495                 /* Don't error on this BSD doesn't and if you think
 496                    about it this is right. Otherwise apps have to
 497                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 498                    are treated in BSD as hints */
 499
 500                 if (val > sysctl_wmem_max)
 501                         val = sysctl_wmem_max;
 502 set_sndbuf:
 503                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 504                 if ((val * 2) < SOCK_MIN_SNDBUF)
 505                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 506                 else
 507                         sk->sk_sndbuf = val * 2;
 508
 509                 /*
 510                  *      Wake up sending tasks if we
 511                  *      upped the value.
 512                  */
 513                 sk->sk_write_space(sk);
 514                 break;
 515
 516         case SO_SNDBUFFORCE:
 517                 if (!capable(CAP_NET_ADMIN)) {
 518                         ret = -EPERM;
 519                         break;
 520                 }
 521                 goto set_sndbuf;
 522
 523         case SO_RCVBUF:
 524                 /* Don't error on this BSD doesn't and if you think
 525                    about it this is right. Otherwise apps have to
 526                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 527                    are treated in BSD as hints */
 528
 529                 if (val > sysctl_rmem_max)
 530                         val = sysctl_rmem_max;
 531 set_rcvbuf:
 532                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 533                 /*
 534                  * We double it on the way in to account for
 535                  * "struct sk_buff" etc. overhead.   Applications
 536                  * assume that the SO_RCVBUF setting they make will
 537                  * allow that much actual data to be received on that
 538                  * socket.
 539                  *
 540                  * Applications are unaware that "struct sk_buff" and
 541                  * other overheads allocate from the receive buffer
 542                  * during socket buffer allocation.
 543                  *
 544                  * And after considering the possible alternatives,
 545                  * returning the value we actually used in getsockopt
 546                  * is the most desirable behavior.
 547                  */
 548                 if ((val * 2) < SOCK_MIN_RCVBUF)
 549                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 550                 else
 551                         sk->sk_rcvbuf = val * 2;
 552                 break;
 553
 554         case SO_RCVBUFFORCE:
 555                 if (!capable(CAP_NET_ADMIN)) {
 556                         ret = -EPERM;
 557                         break;
 558                 }
 559                 goto set_rcvbuf;
 560
 561         case SO_KEEPALIVE:
 562 #ifdef CONFIG_INET
 563                 if (sk->sk_protocol == IPPROTO_TCP)
 564                         tcp_set_keepalive(sk, valbool);
 565 #endif
 566                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 567                 break;
 568
 569         case SO_OOBINLINE:
 570                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 571                 break;
 572
 573         case SO_NO_CHECK:
 574                 sk->sk_no_check = valbool;
 575                 break;
 576
 577         case SO_PRIORITY:
 578                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 579                         sk->sk_priority = val;
 580                 else
 581                         ret = -EPERM;
 582                 break;
 583
 584         case SO_LINGER:
 585                 if (optlen < sizeof(ling)) {
 586                         ret = -EINVAL;  /* 1003.1g */
 587                         break;
 588                 }
 589                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 590                         ret = -EFAULT;
 591                         break;
 592                 }
 593                 if (!ling.l_onoff)
 594                         sock_reset_flag(sk, SOCK_LINGER);
 595                 else {
 596 #if (BITS_PER_LONG == 32)
 597                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 598                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 599                         else
 600 #endif
 601                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 602                         sock_set_flag(sk, SOCK_LINGER);
 603                 }
 604                 break;
 605
 606         case SO_BSDCOMPAT:
 607                 sock_warn_obsolete_bsdism("setsockopt");
 608                 break;
 609
 610         case SO_PASSCRED:
 611                 if (valbool)
 612                         set_bit(SOCK_PASSCRED, &sock->flags);
 613                 else
 614                         clear_bit(SOCK_PASSCRED, &sock->flags);
 615                 break;
 616
 617         case SO_TIMESTAMP:
 618         case SO_TIMESTAMPNS:
 619                 if (valbool)  {
 620                         if (optname == SO_TIMESTAMP)
 621                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 622                         else
 623                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 624                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 625                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 626                 } else {
 627                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 628                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 629                 }
 630                 break;
 631
 632         case SO_TIMESTAMPING:
 633                 if (val & ~SOF_TIMESTAMPING_MASK) {
 634                         ret = -EINVAL;
 635                         break;
 636                 }
 637                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 638                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 639                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 640                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 641                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 642                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 643                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 644                         sock_enable_timestamp(sk,
 645                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 646                 else
 647                         sock_disable_timestamp(sk,
 648                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
 649                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 650                                   val & SOF_TIMESTAMPING_SOFTWARE);
 651                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 652                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 653                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 654                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 655                 break;
 656
 657         case SO_RCVLOWAT:
 658                 if (val < 0)
 659                         val = INT_MAX;
 660                 sk->sk_rcvlowat = val ? : 1;
 661                 break;
 662
 663         case SO_RCVTIMEO:
 664                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 665                 break;
 666
 667         case SO_SNDTIMEO:
 668                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 669                 break;
 670
 671         case SO_ATTACH_FILTER:
 672                 ret = -EINVAL;
 673                 if (optlen == sizeof(struct sock_fprog)) {
 674                         struct sock_fprog fprog;
 675
 676                         ret = -EFAULT;
 677                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 678                                 break;
 679
 680                         ret = sk_attach_filter(&fprog, sk);
 681                 }
 682                 break;
 683
 684         case SO_DETACH_FILTER:
 685                 ret = sk_detach_filter(sk);
 686                 break;
 687
 688         case SO_PASSSEC:
 689                 if (valbool)
 690                         set_bit(SOCK_PASSSEC, &sock->flags);
 691                 else
 692                         clear_bit(SOCK_PASSSEC, &sock->flags);
 693                 break;
 694         case SO_MARK:
 695                 if (!capable(CAP_NET_ADMIN))
 696                         ret = -EPERM;
 697                 else
 698                         sk->sk_mark = val;
 699                 break;
 700
 701                 /* We implement the SO_SNDLOWAT etc to
 702                    not be settable (1003.1g 5.3) */
 703         default:
 704                 ret = -ENOPROTOOPT;
 705                 break;
 706         }
 707         release_sock(sk);
 708         return ret;
 709 }
 710 EXPORT_SYMBOL(sock_setsockopt);
 711
 712
 713 int sock_getsockopt(struct socket *sock, int level, int optname,
 714                     char __user *optval, int __user *optlen)
 715 {
 716         struct sock *sk = sock->sk;
 717
 718         union {
 719                 int val;
 720                 struct linger ling;
 721                 struct timeval tm;
 722         } v;
 723
 724         unsigned int lv = sizeof(int);
 725         int len;
 726
 727         if (get_user(len, optlen))
 728                 return -EFAULT;
 729         if (len < 0)
 730                 return -EINVAL;
 731
 732         memset(&v, 0, sizeof(v));
 733
 734         switch (optname) {
 735         case SO_DEBUG:
 736                 v.val = sock_flag(sk, SOCK_DBG);
 737                 break;
 738
 739         case SO_DONTROUTE:
 740                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 741                 break;
 742
 743         case SO_BROADCAST:
 744                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
 745                 break;
 746
 747         case SO_SNDBUF:
 748                 v.val = sk->sk_sndbuf;
 749                 break;
 750
 751         case SO_RCVBUF:
 752                 v.val = sk->sk_rcvbuf;
 753                 break;
 754
 755         case SO_REUSEADDR:
 756                 v.val = sk->sk_reuse;
 757                 break;
 758
 759         case SO_KEEPALIVE:
 760                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 761                 break;
 762
 763         case SO_TYPE:
 764                 v.val = sk->sk_type;
 765                 break;
 766
 767         case SO_ERROR:
 768                 v.val = -sock_error(sk);
 769                 if (v.val == 0)
 770                         v.val = xchg(&sk->sk_err_soft, 0);
 771                 break;
 772
 773         case SO_OOBINLINE:
 774                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
 775                 break;
 776
 777         case SO_NO_CHECK:
 778                 v.val = sk->sk_no_check;
 779                 break;
 780
 781         case SO_PRIORITY:
 782                 v.val = sk->sk_priority;
 783                 break;
 784
 785         case SO_LINGER:
 786                 lv              = sizeof(v.ling);
 787                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 788                 v.ling.l_linger = sk->sk_lingertime / HZ;
 789                 break;
 790
 791         case SO_BSDCOMPAT:
 792                 sock_warn_obsolete_bsdism("getsockopt");
 793                 break;
 794
 795         case SO_TIMESTAMP:
 796                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 797                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 798                 break;
 799
 800         case SO_TIMESTAMPNS:
 801                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 802                 break;
 803
 804         case SO_TIMESTAMPING:
 805                 v.val = 0;
 806                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 807                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 808                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 809                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 810                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 811                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 812                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 813                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 814                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 815                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
 816                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 817                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 818                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 819                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 820                 break;
 821
 822         case SO_RCVTIMEO:
 823                 lv = sizeof(struct timeval);
 824                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 825                         v.tm.tv_sec = 0;
 826                         v.tm.tv_usec = 0;
 827                 } else {
 828                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 829                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 830                 }
 831                 break;
 832
 833         case SO_SNDTIMEO:
 834                 lv = sizeof(struct timeval);
 835                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 836                         v.tm.tv_sec = 0;
 837                         v.tm.tv_usec = 0;
 838                 } else {
 839                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 840                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 841                 }
 842                 break;
 843
 844         case SO_RCVLOWAT:
 845                 v.val = sk->sk_rcvlowat;
 846                 break;
 847
 848         case SO_SNDLOWAT:
 849                 v.val = 1;
 850                 break;
 851
 852         case SO_PASSCRED:
 853                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 854                 break;
 855
 856         case SO_PEERCRED:
 857                 if (len > sizeof(sk->sk_peercred))
 858                         len = sizeof(sk->sk_peercred);
 859                 if (copy_to_user(optval, &sk->sk_peercred, len))
 860                         return -EFAULT;
 861                 goto lenout;
 862
 863         case SO_PEERNAME:
 864         {
 865                 char address[128];
 866
 867                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 868                         return -ENOTCONN;
 869                 if (lv < len)
 870                         return -EINVAL;
 871                 if (copy_to_user(optval, address, len))
 872                         return -EFAULT;
 873                 goto lenout;
 874         }
 875
 876         /* Dubious BSD thing... Probably nobody even uses it, but
 877          * the UNIX standard wants it for whatever reason... -DaveM
 878          */
 879         case SO_ACCEPTCONN:
 880                 v.val = sk->sk_state == TCP_LISTEN;
 881                 break;
 882
 883         case SO_PASSSEC:
 884                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 885                 break;
 886
 887         case SO_PEERSEC:
 888                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
 889
 890         case SO_MARK:
 891                 v.val = sk->sk_mark;
 892                 break;
 893
 894         default:
 895                 return -ENOPROTOOPT;
 896         }
 897
 898         if (len > lv)
 899                 len = lv;
 900         if (copy_to_user(optval, &v, len))
 901                 return -EFAULT;
 902 lenout:
 903         if (put_user(len, optlen))
 904                 return -EFAULT;
 905         return 0;
 906 }
 907
 908 /*
 909  * Initialize an sk_lock.
 910  *
 911  * (We also register the sk_lock with the lock validator.)
 912  */
 913 static inline void sock_lock_init(struct sock *sk)
 914 {
 915         sock_lock_init_class_and_name(sk,
 916                         af_family_slock_key_strings[sk->sk_family],
 917                         af_family_slock_keys + sk->sk_family,
 918                         af_family_key_strings[sk->sk_family],
 919                         af_family_keys + sk->sk_family);
 920 }
 921
 922 /*
 923  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 924  * even temporarly, because of RCU lookups. sk_node should also be left as is.
 925  */
 926 static void sock_copy(struct sock *nsk, const struct sock *osk)
 927 {
 928 #ifdef CONFIG_SECURITY_NETWORK
 929         void *sptr = nsk->sk_security;
 930 #endif
 931         BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
 932                      sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
 933         memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
 934                osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
 935 #ifdef CONFIG_SECURITY_NETWORK
 936         nsk->sk_security = sptr;
 937         security_sk_clone(osk, nsk);
 938 #endif
 939 }
 940
 941 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 942                 int family)
 943 {
 944         struct sock *sk;
 945         struct kmem_cache *slab;
 946
 947         slab = prot->slab;
 948         if (slab != NULL) {
 949                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
 950                 if (!sk)
 951                         return sk;
 952                 if (priority & __GFP_ZERO) {
 953                         /*
 954                          * caches using SLAB_DESTROY_BY_RCU should let
 955                          * sk_node.next un-modified. Special care is taken
 956                          * when initializing object to zero.
 957                          */
 958                         if (offsetof(struct sock, sk_node.next) != 0)
 959                                 memset(sk, 0, offsetof(struct sock, sk_node.next));
 960                         memset(&sk->sk_node.pprev, 0,
 961                                prot->obj_size - offsetof(struct sock,
 962                                                          sk_node.pprev));
 963                 }
 964         }
 965         else
 966                 sk = kmalloc(prot->obj_size, priority);
 967
 968         if (sk != NULL) {
 969                 kmemcheck_annotate_bitfield(sk, flags);
 970
 971                 if (security_sk_alloc(sk, family, priority))
 972                         goto out_free;
 973
 974                 if (!try_module_get(prot->owner))
 975                         goto out_free_sec;
 976         }
 977
 978         return sk;
 979
 980 out_free_sec:
 981         security_sk_free(sk);
 982 out_free:
 983         if (slab != NULL)
 984                 kmem_cache_free(slab, sk);
 985         else
 986                 kfree(sk);
 987         return NULL;
 988 }
 989
 990 static void sk_prot_free(struct proto *prot, struct sock *sk)
 991 {
 992         struct kmem_cache *slab;
 993         struct module *owner;
 994
 995         owner = prot->owner;
 996         slab = prot->slab;
 997
 998         security_sk_free(sk);
 999         if (slab != NULL)
1000                 kmem_cache_free(slab, sk);
1001         else
1002                 kfree(sk);
1003         module_put(owner);
1004 }
1005
1006 /**
1007  *      sk_alloc - All socket objects are allocated here
1008  *      @net: the applicable net namespace
1009  *      @family: protocol family
1010  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1011  *      @prot: struct proto associated with this new sock instance
1012  */
1013 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1014                       struct proto *prot)
1015 {
1016         struct sock *sk;
1017
1018         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1019         if (sk) {
1020                 sk->sk_family = family;
1021                 /*
1022                  * See comment in struct sock definition to understand
1023                  * why we need sk_prot_creator -acme
1024                  */
1025                 sk->sk_prot = sk->sk_prot_creator = prot;
1026                 sock_lock_init(sk);
1027                 sock_net_set(sk, get_net(net));
1028         }
1029
1030         return sk;
1031 }
1032 EXPORT_SYMBOL(sk_alloc);
1033
1034 static void __sk_free(struct sock *sk)
1035 {
1036         struct sk_filter *filter;
1037
1038         if (sk->sk_destruct)
1039                 sk->sk_destruct(sk);
1040
1041         filter = rcu_dereference(sk->sk_filter);
1042         if (filter) {
1043                 sk_filter_uncharge(sk, filter);
1044                 rcu_assign_pointer(sk->sk_filter, NULL);
1045         }
1046
1047         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1048         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1049
1050         if (atomic_read(&sk->sk_omem_alloc))
1051                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1052                        __func__, atomic_read(&sk->sk_omem_alloc));
1053
1054         put_net(sock_net(sk));
1055         sk_prot_free(sk->sk_prot_creator, sk);
1056 }
1057
1058 void sk_free(struct sock *sk)
1059 {
1060         /*
1061          * We substract one from sk_wmem_alloc and can know if
1062          * some packets are still in some tx queue.
1063          * If not null, sock_wfree() will call __sk_free(sk) later
1064          */
1065         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1066                 __sk_free(sk);
1067 }
1068 EXPORT_SYMBOL(sk_free);
1069
1070 /*
1071  * Last sock_put should drop referrence to sk->sk_net. It has already
1072  * been dropped in sk_change_net. Taking referrence to stopping namespace
1073  * is not an option.
1074  * Take referrence to a socket to remove it from hash _alive_ and after that
1075  * destroy it in the context of init_net.
1076  */
1077 void sk_release_kernel(struct sock *sk)
1078 {
1079         if (sk == NULL || sk->sk_socket == NULL)
1080                 return;
1081
1082         sock_hold(sk);
1083         sock_release(sk->sk_socket);
1084         release_net(sock_net(sk));
1085         sock_net_set(sk, get_net(&init_net));
1086         sock_put(sk);
1087 }
1088 EXPORT_SYMBOL(sk_release_kernel);
1089
1090 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1091 {
1092         struct sock *newsk;
1093
1094         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1095         if (newsk != NULL) {
1096                 struct sk_filter *filter;
1097
1098                 sock_copy(newsk, sk);
1099
1100                 /* SANITY */
1101                 get_net(sock_net(newsk));
1102                 sk_node_init(&newsk->sk_node);
1103                 sock_lock_init(newsk);
1104                 bh_lock_sock(newsk);
1105                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1106
1107                 atomic_set(&newsk->sk_rmem_alloc, 0);
1108                 /*
1109                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1110                  */
1111                 atomic_set(&newsk->sk_wmem_alloc, 1);
1112                 atomic_set(&newsk->sk_omem_alloc, 0);
1113                 skb_queue_head_init(&newsk->sk_receive_queue);
1114                 skb_queue_head_init(&newsk->sk_write_queue);
1115 #ifdef CONFIG_NET_DMA
1116                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1117 #endif
1118
1119                 rwlock_init(&newsk->sk_dst_lock);
1120                 rwlock_init(&newsk->sk_callback_lock);
1121                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1122                                 af_callback_keys + newsk->sk_family,
1123                                 af_family_clock_key_strings[newsk->sk_family]);
1124
1125                 newsk->sk_dst_cache     = NULL;
1126                 newsk->sk_wmem_queued   = 0;
1127                 newsk->sk_forward_alloc = 0;
1128                 newsk->sk_send_head     = NULL;
1129                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1130
1131                 sock_reset_flag(newsk, SOCK_DONE);
1132                 skb_queue_head_init(&newsk->sk_error_queue);
1133
1134                 filter = newsk->sk_filter;
1135                 if (filter != NULL)
1136                         sk_filter_charge(newsk, filter);
1137
1138                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1139                         /* It is still raw copy of parent, so invalidate
1140                          * destructor and make plain sk_free() */
1141                         newsk->sk_destruct = NULL;
1142                         sk_free(newsk);
1143                         newsk = NULL;
1144                         goto out;
1145                 }
1146
1147                 newsk->sk_err      = 0;
1148                 newsk->sk_priority = 0;
1149                 /*
1150                  * Before updating sk_refcnt, we must commit prior changes to memory
1151                  * (Documentation/RCU/rculist_nulls.txt for details)
1152                  */
1153                 smp_wmb();
1154                 atomic_set(&newsk->sk_refcnt, 2);
1155
1156                 /*
1157                  * Increment the counter in the same struct proto as the master
1158                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1159                  * is the same as sk->sk_prot->socks, as this field was copied
1160                  * with memcpy).
1161                  *
1162                  * This _changes_ the previous behaviour, where
1163                  * tcp_create_openreq_child always was incrementing the
1164                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1165                  * to be taken into account in all callers. -acme
1166                  */
1167                 sk_refcnt_debug_inc(newsk);
1168                 sk_set_socket(newsk, NULL);
1169                 newsk->sk_sleep  = NULL;
1170
1171                 if (newsk->sk_prot->sockets_allocated)
1172                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1173         }
1174 out:
1175         return newsk;
1176 }
1177 EXPORT_SYMBOL_GPL(sk_clone);
1178
1179 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1180 {
1181         __sk_dst_set(sk, dst);
1182         sk->sk_route_caps = dst->dev->features;
1183         if (sk->sk_route_caps & NETIF_F_GSO)
1184                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1185         if (sk_can_gso(sk)) {
1186                 if (dst->header_len) {
1187                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1188                 } else {
1189                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1190                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1191                 }
1192         }
1193 }
1194 EXPORT_SYMBOL_GPL(sk_setup_caps);
1195
1196 void __init sk_init(void)
1197 {
1198         if (num_physpages <= 4096) {
1199                 sysctl_wmem_max = 32767;
1200                 sysctl_rmem_max = 32767;
1201                 sysctl_wmem_default = 32767;
1202                 sysctl_rmem_default = 32767;
1203         } else if (num_physpages >= 131072) {
1204                 sysctl_wmem_max = 131071;
1205                 sysctl_rmem_max = 131071;
1206         }
1207 }
1208
1209 /*
1210  *      Simple resource managers for sockets.
1211  */
1212
1213
1214 /*
1215  * Write buffer destructor automatically called from kfree_skb.
1216  */
1217 void sock_wfree(struct sk_buff *skb)
1218 {
1219         struct sock *sk = skb->sk;
1220         int res;
1221
1222         /* In case it might be waiting for more memory. */
1223         res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
1224         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1225                 sk->sk_write_space(sk);
1226         /*
1227          * if sk_wmem_alloc reached 0, we are last user and should
1228          * free this sock, as sk_free() call could not do it.
1229          */
1230         if (res == 0)
1231                 __sk_free(sk);
1232 }
1233 EXPORT_SYMBOL(sock_wfree);
1234
1235 /*
1236  * Read buffer destructor automatically called from kfree_skb.
1237  */
1238 void sock_rfree(struct sk_buff *skb)
1239 {
1240         struct sock *sk = skb->sk;
1241
1242         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1243         sk_mem_uncharge(skb->sk, skb->truesize);
1244 }
1245 EXPORT_SYMBOL(sock_rfree);
1246
1247
1248 int sock_i_uid(struct sock *sk)
1249 {
1250         int uid;
1251
1252         read_lock(&sk->sk_callback_lock);
1253         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1254         read_unlock(&sk->sk_callback_lock);
1255         return uid;
1256 }
1257 EXPORT_SYMBOL(sock_i_uid);
1258
1259 unsigned long sock_i_ino(struct sock *sk)
1260 {
1261         unsigned long ino;
1262
1263         read_lock(&sk->sk_callback_lock);
1264         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1265         read_unlock(&sk->sk_callback_lock);
1266         return ino;
1267 }
1268 EXPORT_SYMBOL(sock_i_ino);
1269
1270 /*
1271  * Allocate a skb from the socket's send buffer.
1272  */
1273 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1274                              gfp_t priority)
1275 {
1276         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1277                 struct sk_buff *skb = alloc_skb(size, priority);
1278                 if (skb) {
1279                         skb_set_owner_w(skb, sk);
1280                         return skb;
1281                 }
1282         }
1283         return NULL;
1284 }
1285 EXPORT_SYMBOL(sock_wmalloc);
1286
1287 /*
1288  * Allocate a skb from the socket's receive buffer.
1289  */
1290 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1291                              gfp_t priority)
1292 {
1293         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1294                 struct sk_buff *skb = alloc_skb(size, priority);
1295                 if (skb) {
1296                         skb_set_owner_r(skb, sk);
1297                         return skb;
1298                 }
1299         }
1300         return NULL;
1301 }
1302
1303 /*
1304  * Allocate a memory block from the socket's option memory buffer.
1305  */
1306 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1307 {
1308         if ((unsigned)size <= sysctl_optmem_max &&
1309             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1310                 void *mem;
1311                 /* First do the add, to avoid the race if kmalloc
1312                  * might sleep.
1313                  */
1314                 atomic_add(size, &sk->sk_omem_alloc);
1315                 mem = kmalloc(size, priority);
1316                 if (mem)
1317                         return mem;
1318                 atomic_sub(size, &sk->sk_omem_alloc);
1319         }
1320         return NULL;
1321 }
1322 EXPORT_SYMBOL(sock_kmalloc);
1323
1324 /*
1325  * Free an option memory block.
1326  */
1327 void sock_kfree_s(struct sock *sk, void *mem, int size)
1328 {
1329         kfree(mem);
1330         atomic_sub(size, &sk->sk_omem_alloc);
1331 }
1332 EXPORT_SYMBOL(sock_kfree_s);
1333
1334 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1335    I think, these locks should be removed for datagram sockets.
1336  */
1337 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1338 {
1339         DEFINE_WAIT(wait);
1340
1341         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1342         for (;;) {
1343                 if (!timeo)
1344                         break;
1345                 if (signal_pending(current))
1346                         break;
1347                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1348                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1349                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1350                         break;
1351                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1352                         break;
1353                 if (sk->sk_err)
1354                         break;
1355                 timeo = schedule_timeout(timeo);
1356         }
1357         finish_wait(sk->sk_sleep, &wait);
1358         return timeo;
1359 }
1360
1361
1362 /*
1363  *      Generic send/receive buffer handlers
1364  */
1365
1366 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1367                                      unsigned long data_len, int noblock,
1368                                      int *errcode)
1369 {
1370         struct sk_buff *skb;
1371         gfp_t gfp_mask;
1372         long timeo;
1373         int err;
1374
1375         gfp_mask = sk->sk_allocation;
1376         if (gfp_mask & __GFP_WAIT)
1377                 gfp_mask |= __GFP_REPEAT;
1378
1379         timeo = sock_sndtimeo(sk, noblock);
1380         while (1) {
1381                 err = sock_error(sk);
1382                 if (err != 0)
1383                         goto failure;
1384
1385                 err = -EPIPE;
1386                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1387                         goto failure;
1388
1389                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1390                         skb = alloc_skb(header_len, gfp_mask);
1391                         if (skb) {
1392                                 int npages;
1393                                 int i;
1394
1395                                 /* No pages, we're done... */
1396                                 if (!data_len)
1397                                         break;
1398
1399                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1400                                 skb->truesize += data_len;
1401                                 skb_shinfo(skb)->nr_frags = npages;
1402                                 for (i = 0; i < npages; i++) {
1403                                         struct page *page;
1404                                         skb_frag_t *frag;
1405
1406                                         page = alloc_pages(sk->sk_allocation, 0);
1407                                         if (!page) {
1408                                                 err = -ENOBUFS;
1409                                                 skb_shinfo(skb)->nr_frags = i;
1410                                                 kfree_skb(skb);
1411                                                 goto failure;
1412                                         }
1413
1414                                         frag = &skb_shinfo(skb)->frags[i];
1415                                         frag->page = page;
1416                                         frag->page_offset = 0;
1417                                         frag->size = (data_len >= PAGE_SIZE ?
1418                                                       PAGE_SIZE :
1419                                                       data_len);
1420                                         data_len -= PAGE_SIZE;
1421                                 }
1422
1423                                 /* Full success... */
1424                                 break;
1425                         }
1426                         err = -ENOBUFS;
1427                         goto failure;
1428                 }
1429                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1430                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1431                 err = -EAGAIN;
1432                 if (!timeo)
1433                         goto failure;
1434                 if (signal_pending(current))
1435                         goto interrupted;
1436                 timeo = sock_wait_for_wmem(sk, timeo);
1437         }
1438
1439         skb_set_owner_w(skb, sk);
1440         return skb;
1441
1442 interrupted:
1443         err = sock_intr_errno(timeo);
1444 failure:
1445         *errcode = err;
1446         return NULL;
1447 }
1448 EXPORT_SYMBOL(sock_alloc_send_pskb);
1449
/*
 * Allocate a send skb of @size bytes without any paged data; a thin
 * front end to sock_alloc_send_pskb() with a data length of zero.
 * Returns the skb, or NULL with *@errcode set by sock_alloc_send_pskb().
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long no_paged_data = 0;

        return sock_alloc_send_pskb(sk, size, no_paged_data, noblock,
                                    errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1456
1457 static void __lock_sock(struct sock *sk)
1458 {
1459         DEFINE_WAIT(wait);
1460
1461         for (;;) {
1462                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1463                                         TASK_UNINTERRUPTIBLE);
1464                 spin_unlock_bh(&sk->sk_lock.slock);
1465                 schedule();
1466                 spin_lock_bh(&sk->sk_lock.slock);
1467                 if (!sock_owned_by_user(sk))
1468                         break;
1469         }
1470         finish_wait(&sk->sk_lock.wq, &wait);
1471 }
1472
1473 static void __release_sock(struct sock *sk)
1474 {
1475         struct sk_buff *skb = sk->sk_backlog.head;
1476
1477         do {
1478                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1479                 bh_unlock_sock(sk);
1480
1481                 do {
1482                         struct sk_buff *next = skb->next;
1483
1484                         skb->next = NULL;
1485                         sk_backlog_rcv(sk, skb);
1486
1487                         /*
1488                          * We are in process context here with softirqs
1489                          * disabled, use cond_resched_softirq() to preempt.
1490                          * This is safe to do because we've taken the backlog
1491                          * queue private:
1492                          */
1493                         cond_resched_softirq();
1494
1495                         skb = next;
1496                 } while (skb != NULL);
1497
1498                 bh_lock_sock(sk);
1499         } while ((skb = sk->sk_backlog.head) != NULL);
1500 }
1501
1502 /**
1503  * sk_wait_data - wait for data to arrive at sk_receive_queue
1504  * @sk:    sock to wait on
1505  * @timeo: for how long
1506  *
1507  * Now socket state including sk->sk_err is changed only under lock,
1508  * hence we may omit checks after joining wait queue.
1509  * We check receive queue before schedule() only as optimization;
1510  * it is very likely that release_sock() added new data.
1511  */
1512 int sk_wait_data(struct sock *sk, long *timeo)
1513 {
1514         int rc;
1515         DEFINE_WAIT(wait);
1516
1517         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1518         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1519         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1520         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1521         finish_wait(sk->sk_sleep, &wait);
1522         return rc;
1523 }
1524 EXPORT_SYMBOL(sk_wait_data);
1525
1526 /**
1527  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1528  *      @sk: socket
1529  *      @size: memory size to allocate
1530  *      @kind: allocation type
1531  *
1532  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1533  *      rmem allocation. This function assumes that protocols which have
1534  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1535  */
1536 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1537 {
1538         struct proto *prot = sk->sk_prot;
1539         int amt = sk_mem_pages(size);
1540         int allocated;
1541
1542         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1543         allocated = atomic_add_return(amt, prot->memory_allocated);
1544
1545         /* Under limit. */
1546         if (allocated <= prot->sysctl_mem[0]) {
1547                 if (prot->memory_pressure && *prot->memory_pressure)
1548                         *prot->memory_pressure = 0;
1549                 return 1;
1550         }
1551
1552         /* Under pressure. */
1553         if (allocated > prot->sysctl_mem[1])
1554                 if (prot->enter_memory_pressure)
1555                         prot->enter_memory_pressure(sk);
1556
1557         /* Over hard limit. */
1558         if (allocated > prot->sysctl_mem[2])
1559                 goto suppress_allocation;
1560
1561         /* guarantee minimum buffer size under pressure */
1562         if (kind == SK_MEM_RECV) {
1563                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1564                         return 1;
1565         } else { /* SK_MEM_SEND */
1566                 if (sk->sk_type == SOCK_STREAM) {
1567                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1568                                 return 1;
1569                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1570                            prot->sysctl_wmem[0])
1571                                 return 1;
1572         }
1573
1574         if (prot->memory_pressure) {
1575                 int alloc;
1576
1577                 if (!*prot->memory_pressure)
1578                         return 1;
1579                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1580                 if (prot->sysctl_mem[2] > alloc *
1581                     sk_mem_pages(sk->sk_wmem_queued +
1582                                  atomic_read(&sk->sk_rmem_alloc) +
1583                                  sk->sk_forward_alloc))
1584                         return 1;
1585         }
1586
1587 suppress_allocation:
1588
1589         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1590                 sk_stream_moderate_sndbuf(sk);
1591
1592                 /* Fail only if socket is _under_ its sndbuf.
1593                  * In this case we cannot block, so that we have to fail.
1594                  */
1595                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1596                         return 1;
1597         }
1598
1599         /* Alas. Undo changes. */
1600         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1601         atomic_sub(amt, prot->memory_allocated);
1602         return 0;
1603 }
1604 EXPORT_SYMBOL(__sk_mem_schedule);
1605
1606 /**
1607  *      __sk_reclaim - reclaim memory_allocated
1608  *      @sk: socket
1609  */
1610 void __sk_mem_reclaim(struct sock *sk)
1611 {
1612         struct proto *prot = sk->sk_prot;
1613
1614         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1615                    prot->memory_allocated);
1616         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1617
1618         if (prot->memory_pressure && *prot->memory_pressure &&
1619             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1620                 *prot->memory_pressure = 0;
1621 }
1622 EXPORT_SYMBOL(__sk_mem_reclaim);
1623
1624
1625 /*
1626  * Set of default routines for initialising struct proto_ops when
1627  * the protocol does not support a particular function. In certain
1628  * cases where it makes no sense for a protocol to have a "do nothing"
1629  * function, some default processing is provided.
1630  */
1631
/* Default bind handler for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
1636 EXPORT_SYMBOL(sock_no_bind);
1637
/* Default connect handler for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
1643 EXPORT_SYMBOL(sock_no_connect);
1644
/* Default socketpair handler for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
1649 EXPORT_SYMBOL(sock_no_socketpair);
1650
/* Default accept handler for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}
1655 EXPORT_SYMBOL(sock_no_accept);
1656
/* Default getname handler for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}
1662 EXPORT_SYMBOL(sock_no_getname);
1663
/*
 * Default poll handler for protocols without poll support. Unlike the other
 * sock_no_*() stubs it does not return an error code: it always returns 0.
 */
unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
        return 0;
}
1668 EXPORT_SYMBOL(sock_no_poll);
1669
/* Default ioctl handler for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
1674 EXPORT_SYMBOL(sock_no_ioctl);
1675
/* Default listen handler for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
1680 EXPORT_SYMBOL(sock_no_listen);
1681
/* Default shutdown handler for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
1686 EXPORT_SYMBOL(sock_no_shutdown);
1687
/* Default setsockopt handler for protocols without setsockopt support: always -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
{
        return -EOPNOTSUPP;
}
1693 EXPORT_SYMBOL(sock_no_setsockopt);
1694
/* Default getsockopt handler for protocols without getsockopt support: always -EOPNOTSUPP. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}
1700 EXPORT_SYMBOL(sock_no_getsockopt);
1701
/* Default sendmsg handler for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}
1707 EXPORT_SYMBOL(sock_no_sendmsg);
1708
/* Default recvmsg handler for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}
1714 EXPORT_SYMBOL(sock_no_recvmsg);
1715
/*
 * Default mmap handler for protocols without mmap support. Returns -ENODEV
 * rather than -EOPNOTSUPP like the other sock_no_*() stubs.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
1721 EXPORT_SYMBOL(sock_no_mmap);
1722
1723 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1724 {
1725         ssize_t res;
1726         struct msghdr msg = {.msg_flags = flags};
1727         struct kvec iov;
1728         char *kaddr = kmap(page);
1729         iov.iov_base = kaddr + offset;
1730         iov.iov_len = size;
1731         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1732         kunmap(page);
1733         return res;
1734 }
1735 EXPORT_SYMBOL(sock_no_sendpage);
1736
1737 /*
1738  *      Default Socket Callbacks
1739  */
1740
/*
 * Default sk_state_change callback (installed by sock_init_data()).
 * Under the read side of sk_callback_lock, wakes every interruptible
 * waiter on sk->sk_sleep, but only if sk_has_sleeper() reports one.
 */
static void sock_def_wakeup(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk_has_sleeper(sk))
                wake_up_interruptible_all(sk->sk_sleep);
        read_unlock(&sk->sk_callback_lock);
}
1748
/*
 * Default sk_error_report callback (installed by sock_init_data()).
 * Under the read side of sk_callback_lock, wakes interruptible waiters with
 * POLLERR if there are sleepers, then unconditionally calls
 * sk_wake_async(SOCK_WAKE_IO, POLL_ERR).
 */
static void sock_def_error_report(struct sock *sk)
{
        read_lock(&sk->sk_callback_lock);
        if (sk_has_sleeper(sk))
                wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
        sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
}
1757
/*
 * Default sk_data_ready callback (installed by sock_init_data()).
 * Under the read side of sk_callback_lock, wakes interruptible waiters with
 * POLLIN | POLLRDNORM | POLLRDBAND if there are sleepers, then
 * unconditionally calls sk_wake_async(SOCK_WAKE_WAITD, POLL_IN).
 * @len is not used by this default implementation.
 */
static void sock_def_readable(struct sock *sk, int len)
{
        read_lock(&sk->sk_callback_lock);
        if (sk_has_sleeper(sk))
                wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
                                                POLLRDNORM | POLLRDBAND);
        sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        read_unlock(&sk->sk_callback_lock);
}
1767
1768 static void sock_def_write_space(struct sock *sk)
1769 {
1770         read_lock(&sk->sk_callback_lock);
1771
1772         /* Do not wake up a writer until he can make "significant"
1773          * progress.  --DaveM
1774          */
1775         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1776                 if (sk_has_sleeper(sk))
1777                         wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1778                                                 POLLWRNORM | POLLWRBAND);
1779
1780                 /* Should agree with poll, otherwise some programs break */
1781                 if (sock_writeable(sk))
1782                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1783         }
1784
1785         read_unlock(&sk->sk_callback_lock);
1786 }
1787
/*
 * Default sk_destruct callback (installed by sock_init_data()):
 * frees sk->sk_protinfo with kfree().
 */
static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}
1792
1793 void sk_send_sigurg(struct sock *sk)
1794 {
1795         if (sk->sk_socket && sk->sk_socket->file)
1796                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1797                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1798 }
1799 EXPORT_SYMBOL(sk_send_sigurg);
1800
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken with sock_hold() only when mod_timer()
 * returns 0 (per the timer API: the timer was not already pending), so a
 * running timer holds exactly one reference. See sk_stop_timer() for the
 * matching release.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
1807 EXPORT_SYMBOL(sk_reset_timer);
1808
/*
 * Cancel @timer for @sk.
 *
 * If the timer is pending and del_timer() actually removed it, the socket
 * reference associated with the timer is dropped via __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_pending(timer))
                return;

        if (del_timer(timer))
                __sock_put(sk);
}
1814 EXPORT_SYMBOL(sk_stop_timer);
1815
/*
 * Initialise the generic part of a freshly allocated sock.
 *
 * @sock: owning socket, or NULL if the sock is not attached to one
 * @sk:   sock to initialise
 *
 * Sets up the skb queues, timer, default buffer sizes, locks, default
 * callbacks and timeouts, and finally publishes the sock by setting its
 * reference count to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
        skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        /* Defaults: GFP_KERNEL allocations, sysctl buffer sizes, TCP_CLOSE state. */
        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /*
         * With a socket: inherit its type, sleep on its wait queue and link
         * socket -> sock. Without one there is no wait queue to sleep on.
         */
        if (sock) {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
        } else
                sk->sk_sleep    =       NULL;

        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
        /* Give sk_callback_lock a per-address-family lockdep class and name. */
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Install the default callbacks defined earlier in this file. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_sndmsg_page      =       NULL;
        sk->sk_sndmsg_off       =       0;

        sk->sk_peercred.pid     =       0;
        sk->sk_peercred.uid     =       -1;
        sk->sk_peercred.gid     =       -1;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        /*
         * tv_sec == -1 is the value sock_get_timestamp() and
         * sock_get_timestampns() report as -ENOENT.
         */
        sk->sk_stamp = ktime_set(-1L, 0);

        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        atomic_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_wmem_alloc, 1);
        atomic_set(&sk->sk_drops, 0);
}
1878 EXPORT_SYMBOL(sock_init_data);
1879
/*
 * Take ownership of the socket lock (sk->sk_lock.owned) from a context
 * that may sleep.
 *
 * @sk:       socket to lock
 * @subclass: lockdep subclass handed to mutex_acquire()
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        /*
         * Already owned by someone else: hand over to __lock_sock()
         * (defined elsewhere; presumably waits until the owner releases).
         */
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /*
         * Plain spin_unlock(), not spin_unlock_bh(): bottom halves stay
         * disabled until the local_bh_enable() at the end.
         */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
1894 EXPORT_SYMBOL(lock_sock_nested);
1895
/*
 * Give up ownership of the socket lock taken by lock_sock_nested().
 *
 * With sk_lock.slock held (bottom halves disabled) it lets
 * __release_sock() handle a non-empty backlog, clears sk_lock.owned and
 * wakes anyone waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        /*
         * The sk_lock has mutex_unlock() semantics:
         */
        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL backlog tail means there is queued backlog to hand off. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);
        sk->sk_lock.owned = 0;
        /* Only call wake_up() when the wait queue actually has waiters. */
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
1911 EXPORT_SYMBOL(release_sock);
1912
1913 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1914 {
1915         struct timeval tv;
1916         if (!sock_flag(sk, SOCK_TIMESTAMP))
1917                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1918         tv = ktime_to_timeval(sk->sk_stamp);
1919         if (tv.tv_sec == -1)
1920                 return -ENOENT;
1921         if (tv.tv_sec == 0) {
1922                 sk->sk_stamp = ktime_get_real();
1923                 tv = ktime_to_timeval(sk->sk_stamp);
1924         }
1925         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1926 }
1927 EXPORT_SYMBOL(sock_get_timestamp);
1928
1929 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1930 {
1931         struct timespec ts;
1932         if (!sock_flag(sk, SOCK_TIMESTAMP))
1933                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1934         ts = ktime_to_timespec(sk->sk_stamp);
1935         if (ts.tv_sec == -1)
1936                 return -ENOENT;
1937         if (ts.tv_sec == 0) {
1938                 sk->sk_stamp = ktime_get_real();
1939                 ts = ktime_to_timespec(sk->sk_stamp);
1940         }
1941         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1942 }
1943 EXPORT_SYMBOL(sock_get_timestampns);
1944
1945 void sock_enable_timestamp(struct sock *sk, int flag)
1946 {
1947         if (!sock_flag(sk, flag)) {
1948                 sock_set_flag(sk, flag);
1949                 /*
1950                  * we just set one of the two flags which require net
1951                  * time stamping, but time stamping might have been on
1952                  * already because of the other one
1953                  */
1954                 if (!sock_flag(sk,
1955                                 flag == SOCK_TIMESTAMP ?
1956                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
1957                                 SOCK_TIMESTAMP))
1958                         net_enable_timestamp();
1959         }
1960 }
1961
/*
 *      Get a socket option on a socket.
 *
 *      Forwards the request unchanged to the protocol's getsockopt method
 *      (sk->sk_prot->getsockopt) and returns its result.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
1976 EXPORT_SYMBOL(sock_common_getsockopt);
1977
1978 #ifdef CONFIG_COMPAT
1979 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1980                                   char __user *optval, int __user *optlen)
1981 {
1982         struct sock *sk = sock->sk;
1983
1984         if (sk->sk_prot->compat_getsockopt != NULL)
1985                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1986                                                       optval, optlen);
1987         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1988 }
1989 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1990 #endif
1991
1992 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1993                         struct msghdr *msg, size_t size, int flags)
1994 {
1995         struct sock *sk = sock->sk;
1996         int addr_len = 0;
1997         int err;
1998
1999         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2000                                    flags & ~MSG_DONTWAIT, &addr_len);
2001         if (err >= 0)
2002                 msg->msg_namelen = addr_len;
2003         return err;
2004 }
2005 EXPORT_SYMBOL(sock_common_recvmsg);
2006
/*
 *      Set socket options on an inet socket.
 *
 *      Forwards the request unchanged to the protocol's setsockopt method
 *      (sk->sk_prot->setsockopt) and returns its result.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
2017 EXPORT_SYMBOL(sock_common_setsockopt);
2018
2019 #ifdef CONFIG_COMPAT
2020 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2021                                   char __user *optval, int optlen)
2022 {
2023         struct sock *sk = sock->sk;
2024
2025         if (sk->sk_prot->compat_setsockopt != NULL)
2026                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2027                                                       optval, optlen);
2028         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2029 }
2030 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2031 #endif
2032
/*
 * Common release path for a sock: run the protocol's destroy hook (if any),
 * unhash the sock, orphan it, free its xfrm policy and drop a reference.
 */
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to the socket any more. But the network stack still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs the
         * receiver and did the hash table lookup before we unhashed the
         * socket. They will reach the receive queue and will be purged by the
         * socket destructor.
         *
         * Also we still have packets pending on the receive queue and
         * probably our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sk_refcnt_debug_release(sk);
        sock_put(sk);
}
2067 EXPORT_SYMBOL(sk_common_release);
2068
/*
 * List of registered protocols (linked through struct proto's node member)
 * and the rwlock protecting it. Written by proto_register() and
 * proto_unregister(); read by the /proc "protocols" seq_file iterator.
 */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
2071
2072 #ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One set of in-use counters, indexed by a protocol's inuse_idx. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of inuse_idx slots handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2079
2080 #ifdef CONFIG_NET_NS
2081 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2082 {
2083         int cpu = smp_processor_id();
2084         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2085 }
2086 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2087
2088 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2089 {
2090         int cpu, idx = prot->inuse_idx;
2091         int res = 0;
2092
2093         for_each_possible_cpu(cpu)
2094                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2095
2096         return res >= 0 ? res : 0;
2097 }
2098 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2099
2100 static int sock_inuse_init_net(struct net *net)
2101 {
2102         net->core.inuse = alloc_percpu(struct prot_inuse);
2103         return net->core.inuse ? 0 : -ENOMEM;
2104 }
2105
/* Per-namespace exit: free the per-cpu in-use counters of @net. */
static void sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.inuse);
}
2110
/* Namespace hooks that allocate/free the per-cpu in-use counters. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
2115
/*
 * Boot-time registration of net_inuse_ops. Failure to register is fatal
 * (panic); otherwise returns 0.
 */
static __init int net_inuse_init(void)
{
        if (register_pernet_subsys(&net_inuse_ops))
                panic("Cannot initialize net inuse counters");

        return 0;
}
2123
2124 core_initcall(net_inuse_init);
2125 #else
/* Without CONFIG_NET_NS: a single global set of per-cpu in-use counters. */
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2127
/*
 * Add @val to the current CPU's in-use counter of @prot.
 * @net is ignored in this (non-namespace) variant.
 */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
        __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
}
2132 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2133
2134 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2135 {
2136         int cpu, idx = prot->inuse_idx;
2137         int res = 0;
2138
2139         for_each_possible_cpu(cpu)
2140                 res += per_cpu(prot_inuse, cpu).val[idx];
2141
2142         return res >= 0 ? res : 0;
2143 }
2144 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2145 #endif
2146
2147 static void assign_proto_idx(struct proto *prot)
2148 {
2149         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2150
2151         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2152                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2153                 return;
2154         }
2155
2156         set_bit(prot->inuse_idx, proto_inuse_idx);
2157 }
2158
2159 static void release_proto_idx(struct proto *prot)
2160 {
2161         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2162                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2163 }
2164 #else
/* No-op when CONFIG_PROC_FS is off: no in-use counter slots are tracked. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2168
/* No-op when CONFIG_PROC_FS is off: no in-use counter slots are tracked. */
static inline void release_proto_idx(struct proto *prot)
{
}
2172 #endif
2173
2174 int proto_register(struct proto *prot, int alloc_slab)
2175 {
2176         if (alloc_slab) {
2177                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2178                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2179                                         NULL);
2180
2181                 if (prot->slab == NULL) {
2182                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2183                                prot->name);
2184                         goto out;
2185                 }
2186
2187                 if (prot->rsk_prot != NULL) {
2188                         static const char mask[] = "request_sock_%s";
2189
2190                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2191                         if (prot->rsk_prot->slab_name == NULL)
2192                                 goto out_free_sock_slab;
2193
2194                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2195                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2196                                                                  prot->rsk_prot->obj_size, 0,
2197                                                                  SLAB_HWCACHE_ALIGN, NULL);
2198
2199                         if (prot->rsk_prot->slab == NULL) {
2200                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2201                                        prot->name);
2202                                 goto out_free_request_sock_slab_name;
2203                         }
2204                 }
2205
2206                 if (prot->twsk_prot != NULL) {
2207                         static const char mask[] = "tw_sock_%s";
2208
2209                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2210
2211                         if (prot->twsk_prot->twsk_slab_name == NULL)
2212                                 goto out_free_request_sock_slab;
2213
2214                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2215                         prot->twsk_prot->twsk_slab =
2216                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2217                                                   prot->twsk_prot->twsk_obj_size,
2218                                                   0,
2219                                                   SLAB_HWCACHE_ALIGN |
2220                                                         prot->slab_flags,
2221                                                   NULL);
2222                         if (prot->twsk_prot->twsk_slab == NULL)
2223                                 goto out_free_timewait_sock_slab_name;
2224                 }
2225         }
2226
2227         write_lock(&proto_list_lock);
2228         list_add(&prot->node, &proto_list);
2229         assign_proto_idx(prot);
2230         write_unlock(&proto_list_lock);
2231         return 0;
2232
2233 out_free_timewait_sock_slab_name:
2234         kfree(prot->twsk_prot->twsk_slab_name);
2235 out_free_request_sock_slab:
2236         if (prot->rsk_prot && prot->rsk_prot->slab) {
2237                 kmem_cache_destroy(prot->rsk_prot->slab);
2238                 prot->rsk_prot->slab = NULL;
2239         }
2240 out_free_request_sock_slab_name:
2241         kfree(prot->rsk_prot->slab_name);
2242 out_free_sock_slab:
2243         kmem_cache_destroy(prot->slab);
2244         prot->slab = NULL;
2245 out:
2246         return -ENOBUFS;
2247 }
2248 EXPORT_SYMBOL(proto_register);
2249
/*
 * Undo proto_register(): release the protocol's in-use counter slot, unlink
 * it from proto_list and destroy whichever slab caches exist for it.
 */
void proto_unregister(struct proto *prot)
{
        write_lock(&proto_list_lock);
        release_proto_idx(prot);
        list_del(&prot->node);
        write_unlock(&proto_list_lock);

        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }

        /*
         * The request sock cache name was kmalloc'ed by proto_register();
         * it is freed only after the cache using it has been destroyed.
         */
        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                kfree(prot->rsk_prot->slab_name);
                prot->rsk_prot->slab = NULL;
        }

        /* Same for the timewait sock cache and its name. */
        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
                kfree(prot->twsk_prot->twsk_slab_name);
                prot->twsk_prot->twsk_slab = NULL;
        }
}
2274 EXPORT_SYMBOL(proto_unregister);
2275
2276 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_lock for reading and position the
 * iterator at *pos within proto_list (the list head itself is included,
 * which proto_seq_show() uses to print the column header).
 * The lock is dropped in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_lock)
{
        read_lock(&proto_list_lock);
        return seq_list_start_head(&proto_list, *pos);
}
2283
/* seq_file ->next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}
2288
/* seq_file ->stop: drop the read lock taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_lock)
{
        read_unlock(&proto_list_lock);
}
2294
/*
 * Map a method pointer to the flag character shown in /proc "protocols":
 * 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
        if (method != NULL)
                return 'y';

        return 'n';
}
2299
/*
 * Print one line of the "protocols" table for @proto: name, object size,
 * sockets in use (for the seq_file's namespace), allocated memory (-1 if
 * the protocol has no memory_allocated counter), memory pressure state
 * ("NI" if the protocol has no memory_pressure flag), max header, whether
 * it has a slab cache, owning module, and one y/n column per method in the
 * order given by the header printed in proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->sendpage),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
2332
2333 static int proto_seq_show(struct seq_file *seq, void *v)
2334 {
2335         if (v == &proto_list)
2336                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2337                            "protocol",
2338                            "size",
2339                            "sockets",
2340                            "memory",
2341                            "press",
2342                            "maxhdr",
2343                            "slab",
2344                            "module",
2345                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2346         else
2347                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2348         return 0;
2349 }
2350
/* seq_file iterator over proto_list backing the "protocols" proc file. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
2357
/*
 * open() for the "protocols" proc file: set up a namespace-aware seq_file
 * using proto_seq_ops with a struct seq_net_private as private data.
 */
static int proto_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &proto_seq_ops,
                            sizeof(struct seq_net_private));
}
2363
/* File operations of the "protocols" proc file (seq_file based). */
static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
2371
2372 static __net_init int proto_init_net(struct net *net)
2373 {
2374         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2375                 return -ENOMEM;
2376
2377         return 0;
2378 }
2379
/* Per-namespace exit: remove the "protocols" proc entry of @net. */
static __net_exit void proto_exit_net(struct net *net)
{
        proc_net_remove(net, "protocols");
}
2384
2385
/* Namespace hooks that create/remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
2390
/*
 * Boot-time registration of proto_net_ops; returns the result of
 * register_pernet_subsys().
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}
2395
2396 subsys_initcall(proto_init);
2397
2398 #endif /* PROC_FS */