net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <asm/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134
 135 #include <linux/filter.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #ifdef CONFIG_INET
 140 #include <net/tcp.h>
 141 #endif
 142
 143 #include <net/busy_poll.h>
 144
/* Held while walking proto_list (see the memcg init/destroy helpers below). */
static DEFINE_MUTEX(proto_list_mutex);
/* List of struct proto entries, linked through their ->node member. */
static LIST_HEAD(proto_list);
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capbility to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socke was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
 196
 197 #ifdef CONFIG_MEMCG_KMEM
 198 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 199 {
 200         struct proto *proto;
 201         int ret = 0;
 202
 203         mutex_lock(&proto_list_mutex);
 204         list_for_each_entry(proto, &proto_list, node) {
 205                 if (proto->init_cgroup) {
 206                         ret = proto->init_cgroup(memcg, ss);
 207                         if (ret)
 208                                 goto out;
 209                 }
 210         }
 211
 212         mutex_unlock(&proto_list_mutex);
 213         return ret;
 214 out:
 215         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 216                 if (proto->destroy_cgroup)
 217                         proto->destroy_cgroup(memcg);
 218         mutex_unlock(&proto_list_mutex);
 219         return ret;
 220 }
 221
 222 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 223 {
 224         struct proto *proto;
 225
 226         mutex_lock(&proto_list_mutex);
 227         list_for_each_entry_reverse(proto, &proto_list, node)
 228                 if (proto->destroy_cgroup)
 229                         proto->destroy_cgroup(memcg);
 230         mutex_unlock(&proto_list_mutex);
 231 }
 232 #endif
 233
/*
 * Each address family might have different locking rules, so we have
 * one lock class key and one slock class key per address family
 * (AF_MAX entries in each array):
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
 240
 241 #if defined(CONFIG_MEMCG_KMEM)
 242 struct static_key memcg_socket_limit_enabled;
 243 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 244 #endif
 245
 246 /*
 247  * Make lock validator output more readable. (we pre-construct these
 248  * strings build-time, so that runtime initialization of socket
 249  * locks is fast):
 250  */
 251 static const char *const af_family_key_strings[AF_MAX+1] = {
 252   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 253   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 254   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 255   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 256   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 257   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 258   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 259   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 260   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 261   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 262   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 263   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 264   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 265   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 266 };
 267 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 268   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 269   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 270   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 271   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 272   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 273   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 274   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 275   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 276   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 277   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 278   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 279   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 280   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 281   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 282 };
 283 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 284   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 285   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 286   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 287   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 288   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 289   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 290   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 291   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 292   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 293   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 294   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 295   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 296   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 297   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 298 };
 299
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key
 * (AF_MAX entries):
 */
static struct lock_class_key af_callback_keys[AF_MAX];
 305
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits come out as _SK_MEM_PACKETS times SKB_TRUESIZE(256).
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
/* Caps applied to SO_SNDBUF / SO_RCVBUF requests in sock_setsockopt(). */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults start out equal to the maxima above and are not exported. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Incremented by sk_set_memalloc() and decremented by sk_clear_memalloc(). */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 330
 331 /**
 332  * sk_set_memalloc - sets %SOCK_MEMALLOC
 333  * @sk: socket to set it on
 334  *
 335  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 336  * It's the responsibility of the admin to adjust min_free_kbytes
 337  * to meet the requirements
 338  */
 339 void sk_set_memalloc(struct sock *sk)
 340 {
 341         sock_set_flag(sk, SOCK_MEMALLOC);
 342         sk->sk_allocation |= __GFP_MEMALLOC;
 343         static_key_slow_inc(&memalloc_socks);
 344 }
 345 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 346
 347 void sk_clear_memalloc(struct sock *sk)
 348 {
 349         sock_reset_flag(sk, SOCK_MEMALLOC);
 350         sk->sk_allocation &= ~__GFP_MEMALLOC;
 351         static_key_slow_dec(&memalloc_socks);
 352
 353         /*
 354          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 355          * progress of swapping. However, if SOCK_MEMALLOC is cleared while
 356          * it has rmem allocations there is a risk that the user of the
 357          * socket cannot make forward progress due to exceeding the rmem
 358          * limits. By rights, sk_clear_memalloc() should only be called
 359          * on sockets being torn down but warn and reset the accounting if
 360          * that assumption breaks.
 361          */
 362         if (WARN_ON(sk->sk_forward_alloc))
 363                 sk_mem_reclaim(sk);
 364 }
 365 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 366
 367 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 368 {
 369         int ret;
 370         unsigned long pflags = current->flags;
 371
 372         /* these should have been dropped before queueing */
 373         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 374
 375         current->flags |= PF_MEMALLOC;
 376         ret = sk->sk_backlog_rcv(sk, skb);
 377         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 378
 379         return ret;
 380 }
 381 EXPORT_SYMBOL(__sk_backlog_rcv);
 382
 383 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 384 {
 385         struct timeval tv;
 386
 387         if (optlen < sizeof(tv))
 388                 return -EINVAL;
 389         if (copy_from_user(&tv, optval, sizeof(tv)))
 390                 return -EFAULT;
 391         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 392                 return -EDOM;
 393
 394         if (tv.tv_sec < 0) {
 395                 static int warned __read_mostly;
 396
 397                 *timeo_p = 0;
 398                 if (warned < 10 && net_ratelimit()) {
 399                         warned++;
 400                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 401                                 __func__, current->comm, task_pid_nr(current));
 402                 }
 403                 return 0;
 404         }
 405         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 406         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 407                 return 0;
 408         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 409                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 410         return 0;
 411 }
 412
 413 static void sock_warn_obsolete_bsdism(const char *name)
 414 {
 415         static int warned;
 416         static char warncomm[TASK_COMM_LEN];
 417         if (strcmp(warncomm, current->comm) && warned < 5) {
 418                 strcpy(warncomm,  current->comm);
 419                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 420                         warncomm, name);
 421                 warned++;
 422         }
 423 }
 424
 425 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 426
 427 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 428 {
 429         if (sk->sk_flags & flags) {
 430                 sk->sk_flags &= ~flags;
 431                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 432                         net_disable_timestamp();
 433         }
 434 }
 435
 436
 437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 438 {
 439         int err;
 440         int skb_len;
 441         unsigned long flags;
 442         struct sk_buff_head *list = &sk->sk_receive_queue;
 443
 444         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 445                 atomic_inc(&sk->sk_drops);
 446                 trace_sock_rcvqueue_full(sk, skb);
 447                 return -ENOMEM;
 448         }
 449
 450         err = sk_filter(sk, skb);
 451         if (err)
 452                 return err;
 453
 454         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 455                 atomic_inc(&sk->sk_drops);
 456                 return -ENOBUFS;
 457         }
 458
 459         skb->dev = NULL;
 460         skb_set_owner_r(skb, sk);
 461
 462         /* Cache the SKB length before we tack it onto the receive
 463          * queue.  Once it is added it no longer belongs to us and
 464          * may be freed by other threads of control pulling packets
 465          * from the queue.
 466          */
 467         skb_len = skb->len;
 468
 469         /* we escape from rcu protected region, make sure we dont leak
 470          * a norefcounted dst
 471          */
 472         skb_dst_force(skb);
 473
 474         spin_lock_irqsave(&list->lock, flags);
 475         skb->dropcount = atomic_read(&sk->sk_drops);
 476         __skb_queue_tail(list, skb);
 477         spin_unlock_irqrestore(&list->lock, flags);
 478
 479         if (!sock_flag(sk, SOCK_DEAD))
 480                 sk->sk_data_ready(sk, skb_len);
 481         return 0;
 482 }
 483 EXPORT_SYMBOL(sock_queue_rcv_skb);
 484
 485 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 486 {
 487         int rc = NET_RX_SUCCESS;
 488
 489         if (sk_filter(sk, skb))
 490                 goto discard_and_relse;
 491
 492         skb->dev = NULL;
 493
 494         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 495                 atomic_inc(&sk->sk_drops);
 496                 goto discard_and_relse;
 497         }
 498         if (nested)
 499                 bh_lock_sock_nested(sk);
 500         else
 501                 bh_lock_sock(sk);
 502         if (!sock_owned_by_user(sk)) {
 503                 /*
 504                  * trylock + unlock semantics:
 505                  */
 506                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 507
 508                 rc = sk_backlog_rcv(sk, skb);
 509
 510                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 511         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 512                 bh_unlock_sock(sk);
 513                 atomic_inc(&sk->sk_drops);
 514                 goto discard_and_relse;
 515         }
 516
 517         bh_unlock_sock(sk);
 518 out:
 519         sock_put(sk);
 520         return rc;
 521 discard_and_relse:
 522         kfree_skb(skb);
 523         goto out;
 524 }
 525 EXPORT_SYMBOL(sk_receive_skb);
 526
/* Exported wrapper: calls sk_tx_queue_clear() on @sk and nothing else. */
void sk_reset_txq(struct sock *sk)
{
        sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);
 532
 533 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 534 {
 535         struct dst_entry *dst = __sk_dst_get(sk);
 536
 537         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 538                 sk_tx_queue_clear(sk);
 539                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 540                 dst_release(dst);
 541                 return NULL;
 542         }
 543
 544         return dst;
 545 }
 546 EXPORT_SYMBOL(__sk_dst_check);
 547
 548 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 549 {
 550         struct dst_entry *dst = sk_dst_get(sk);
 551
 552         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 553                 sk_dst_reset(sk);
 554                 dst_release(dst);
 555                 return NULL;
 556         }
 557
 558         return dst;
 559 }
 560 EXPORT_SYMBOL(sk_dst_check);
 561
 562 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 563                                 int optlen)
 564 {
 565         int ret = -ENOPROTOOPT;
 566 #ifdef CONFIG_NETDEVICES
 567         struct net *net = sock_net(sk);
 568         char devname[IFNAMSIZ];
 569         int index;
 570
 571         /* Sorry... */
 572         ret = -EPERM;
 573         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 574                 goto out;
 575
 576         ret = -EINVAL;
 577         if (optlen < 0)
 578                 goto out;
 579
 580         /* Bind this socket to a particular device like "eth0",
 581          * as specified in the passed interface name. If the
 582          * name is "" or the option length is zero the socket
 583          * is not bound.
 584          */
 585         if (optlen > IFNAMSIZ - 1)
 586                 optlen = IFNAMSIZ - 1;
 587         memset(devname, 0, sizeof(devname));
 588
 589         ret = -EFAULT;
 590         if (copy_from_user(devname, optval, optlen))
 591                 goto out;
 592
 593         index = 0;
 594         if (devname[0] != '\0') {
 595                 struct net_device *dev;
 596
 597                 rcu_read_lock();
 598                 dev = dev_get_by_name_rcu(net, devname);
 599                 if (dev)
 600                         index = dev->ifindex;
 601                 rcu_read_unlock();
 602                 ret = -ENODEV;
 603                 if (!dev)
 604                         goto out;
 605         }
 606
 607         lock_sock(sk);
 608         sk->sk_bound_dev_if = index;
 609         sk_dst_reset(sk);
 610         release_sock(sk);
 611
 612         ret = 0;
 613
 614 out:
 615 #endif
 616
 617         return ret;
 618 }
 619
 620 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 621                                 int __user *optlen, int len)
 622 {
 623         int ret = -ENOPROTOOPT;
 624 #ifdef CONFIG_NETDEVICES
 625         struct net *net = sock_net(sk);
 626         char devname[IFNAMSIZ];
 627
 628         if (sk->sk_bound_dev_if == 0) {
 629                 len = 0;
 630                 goto zero;
 631         }
 632
 633         ret = -EINVAL;
 634         if (len < IFNAMSIZ)
 635                 goto out;
 636
 637         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 638         if (ret)
 639                 goto out;
 640
 641         len = strlen(devname) + 1;
 642
 643         ret = -EFAULT;
 644         if (copy_to_user(optval, devname, len))
 645                 goto out;
 646
 647 zero:
 648         ret = -EFAULT;
 649         if (put_user(len, optlen))
 650                 goto out;
 651
 652         ret = 0;
 653
 654 out:
 655 #endif
 656
 657         return ret;
 658 }
 659
 660 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 661 {
 662         if (valbool)
 663                 sock_set_flag(sk, bit);
 664         else
 665                 sock_reset_flag(sk, bit);
 666 }
 667
 668 /*
 669  *      This is meant for all protocols to use and covers goings on
 670  *      at the socket level. Everything here is generic.
 671  */
 672
 673 int sock_setsockopt(struct socket *sock, int level, int optname,
 674                     char __user *optval, unsigned int optlen)
 675 {
 676         struct sock *sk = sock->sk;
 677         int val;
 678         int valbool;
 679         struct linger ling;
 680         int ret = 0;
 681
 682         /*
 683          *      Options without arguments
 684          */
 685
 686         if (optname == SO_BINDTODEVICE)
 687                 return sock_setbindtodevice(sk, optval, optlen);
 688
 689         if (optlen < sizeof(int))
 690                 return -EINVAL;
 691
 692         if (get_user(val, (int __user *)optval))
 693                 return -EFAULT;
 694
 695         valbool = val ? 1 : 0;
 696
 697         lock_sock(sk);
 698
 699         switch (optname) {
 700         case SO_DEBUG:
 701                 if (val && !capable(CAP_NET_ADMIN))
 702                         ret = -EACCES;
 703                 else
 704                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 705                 break;
 706         case SO_REUSEADDR:
 707                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 708                 break;
 709         case SO_REUSEPORT:
 710                 sk->sk_reuseport = valbool;
 711                 break;
 712         case SO_TYPE:
 713         case SO_PROTOCOL:
 714         case SO_DOMAIN:
 715         case SO_ERROR:
 716                 ret = -ENOPROTOOPT;
 717                 break;
 718         case SO_DONTROUTE:
 719                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 720                 break;
 721         case SO_BROADCAST:
 722                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 723                 break;
 724         case SO_SNDBUF:
 725                 /* Don't error on this BSD doesn't and if you think
 726                  * about it this is right. Otherwise apps have to
 727                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 728                  * are treated in BSD as hints
 729                  */
 730                 val = min_t(u32, val, sysctl_wmem_max);
 731 set_sndbuf:
 732                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 733                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 734                 /* Wake up sending tasks if we upped the value. */
 735                 sk->sk_write_space(sk);
 736                 break;
 737
 738         case SO_SNDBUFFORCE:
 739                 if (!capable(CAP_NET_ADMIN)) {
 740                         ret = -EPERM;
 741                         break;
 742                 }
 743                 goto set_sndbuf;
 744
 745         case SO_RCVBUF:
 746                 /* Don't error on this BSD doesn't and if you think
 747                  * about it this is right. Otherwise apps have to
 748                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 749                  * are treated in BSD as hints
 750                  */
 751                 val = min_t(u32, val, sysctl_rmem_max);
 752 set_rcvbuf:
 753                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 754                 /*
 755                  * We double it on the way in to account for
 756                  * "struct sk_buff" etc. overhead.   Applications
 757                  * assume that the SO_RCVBUF setting they make will
 758                  * allow that much actual data to be received on that
 759                  * socket.
 760                  *
 761                  * Applications are unaware that "struct sk_buff" and
 762                  * other overheads allocate from the receive buffer
 763                  * during socket buffer allocation.
 764                  *
 765                  * And after considering the possible alternatives,
 766                  * returning the value we actually used in getsockopt
 767                  * is the most desirable behavior.
 768                  */
 769                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 770                 break;
 771
 772         case SO_RCVBUFFORCE:
 773                 if (!capable(CAP_NET_ADMIN)) {
 774                         ret = -EPERM;
 775                         break;
 776                 }
 777                 goto set_rcvbuf;
 778
 779         case SO_KEEPALIVE:
 780 #ifdef CONFIG_INET
 781                 if (sk->sk_protocol == IPPROTO_TCP &&
 782                     sk->sk_type == SOCK_STREAM)
 783                         tcp_set_keepalive(sk, valbool);
 784 #endif
 785                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 786                 break;
 787
 788         case SO_OOBINLINE:
 789                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 790                 break;
 791
 792         case SO_NO_CHECK:
 793                 sk->sk_no_check = valbool;
 794                 break;
 795
 796         case SO_PRIORITY:
 797                 if ((val >= 0 && val <= 6) ||
 798                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 799                         sk->sk_priority = val;
 800                 else
 801                         ret = -EPERM;
 802                 break;
 803
 804         case SO_LINGER:
 805                 if (optlen < sizeof(ling)) {
 806                         ret = -EINVAL;  /* 1003.1g */
 807                         break;
 808                 }
 809                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 810                         ret = -EFAULT;
 811                         break;
 812                 }
 813                 if (!ling.l_onoff)
 814                         sock_reset_flag(sk, SOCK_LINGER);
 815                 else {
 816 #if (BITS_PER_LONG == 32)
 817                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 818                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 819                         else
 820 #endif
 821                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 822                         sock_set_flag(sk, SOCK_LINGER);
 823                 }
 824                 break;
 825
 826         case SO_BSDCOMPAT:
 827                 sock_warn_obsolete_bsdism("setsockopt");
 828                 break;
 829
 830         case SO_PASSCRED:
 831                 if (valbool)
 832                         set_bit(SOCK_PASSCRED, &sock->flags);
 833                 else
 834                         clear_bit(SOCK_PASSCRED, &sock->flags);
 835                 break;
 836
 837         case SO_TIMESTAMP:
 838         case SO_TIMESTAMPNS:
 839                 if (valbool)  {
 840                         if (optname == SO_TIMESTAMP)
 841                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 842                         else
 843                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 844                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 845                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 846                 } else {
 847                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 848                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 849                 }
 850                 break;
 851
 852         case SO_TIMESTAMPING:
 853                 if (val & ~SOF_TIMESTAMPING_MASK) {
 854                         ret = -EINVAL;
 855                         break;
 856                 }
 857                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 858                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 859                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 860                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 861                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 862                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 863                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 864                         sock_enable_timestamp(sk,
 865                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 866                 else
 867                         sock_disable_timestamp(sk,
 868                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 869                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 870                                   val & SOF_TIMESTAMPING_SOFTWARE);
 871                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 872                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 873                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 874                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 875                 break;
 876
 877         case SO_RCVLOWAT:
 878                 if (val < 0)
 879                         val = INT_MAX;
 880                 sk->sk_rcvlowat = val ? : 1;
 881                 break;
 882
 883         case SO_RCVTIMEO:
 884                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 885                 break;
 886
 887         case SO_SNDTIMEO:
 888                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 889                 break;
 890
 891         case SO_ATTACH_FILTER:
 892                 ret = -EINVAL;
 893                 if (optlen == sizeof(struct sock_fprog)) {
 894                         struct sock_fprog fprog;
 895
 896                         ret = -EFAULT;
 897                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 898                                 break;
 899
 900                         ret = sk_attach_filter(&fprog, sk);
 901                 }
 902                 break;
 903
 904         case SO_DETACH_FILTER:
 905                 ret = sk_detach_filter(sk);
 906                 break;
 907
 908         case SO_LOCK_FILTER:
 909                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 910                         ret = -EPERM;
 911                 else
 912                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 913                 break;
 914
 915         case SO_PASSSEC:
 916                 if (valbool)
 917                         set_bit(SOCK_PASSSEC, &sock->flags);
 918                 else
 919                         clear_bit(SOCK_PASSSEC, &sock->flags);
 920                 break;
 921         case SO_MARK:
 922                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 923                         ret = -EPERM;
 924                 else
 925                         sk->sk_mark = val;
 926                 break;
 927
 928                 /* We implement the SO_SNDLOWAT etc to
 929                    not be settable (1003.1g 5.3) */
 930         case SO_RXQ_OVFL:
 931                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 932                 break;
 933
 934         case SO_WIFI_STATUS:
 935                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 936                 break;
 937
 938         case SO_PEEK_OFF:
 939                 if (sock->ops->set_peek_off)
 940                         ret = sock->ops->set_peek_off(sk, val);
 941                 else
 942                         ret = -EOPNOTSUPP;
 943                 break;
 944
 945         case SO_NOFCS:
 946                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 947                 break;
 948
 949         case SO_SELECT_ERR_QUEUE:
 950                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 951                 break;
 952
 953 #ifdef CONFIG_NET_RX_BUSY_POLL
 954         case SO_BUSY_POLL:
 955                 /* allow unprivileged users to decrease the value */
 956                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 957                         ret = -EPERM;
 958                 else {
 959                         if (val < 0)
 960                                 ret = -EINVAL;
 961                         else
 962                                 sk->sk_ll_usec = val;
 963                 }
 964                 break;
 965 #endif
 966         default:
 967                 ret = -ENOPROTOOPT;
 968                 break;
 969         }
 970         release_sock(sk);
 971         return ret;
 972 }
 973 EXPORT_SYMBOL(sock_setsockopt);
 974
 975
 976 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 977                    struct ucred *ucred)
 978 {
 979         ucred->pid = pid_vnr(pid);
 980         ucred->uid = ucred->gid = -1;
 981         if (cred) {
 982                 struct user_namespace *current_ns = current_user_ns();
 983
 984                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
 985                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
 986         }
 987 }
 988 EXPORT_SYMBOL_GPL(cred_to_ucred);
 989
 990 int sock_getsockopt(struct socket *sock, int level, int optname,
 991                     char __user *optval, int __user *optlen)
 992 {
 993         struct sock *sk = sock->sk;
 994
 995         union {
 996                 int val;
 997                 struct linger ling;
 998                 struct timeval tm;
 999         } v;
1000
1001         int lv = sizeof(int);
1002         int len;
1003
1004         if (get_user(len, optlen))
1005                 return -EFAULT;
1006         if (len < 0)
1007                 return -EINVAL;
1008
1009         memset(&v, 0, sizeof(v));
1010
1011         switch (optname) {
1012         case SO_DEBUG:
1013                 v.val = sock_flag(sk, SOCK_DBG);
1014                 break;
1015
1016         case SO_DONTROUTE:
1017                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1018                 break;
1019
1020         case SO_BROADCAST:
1021                 v.val = sock_flag(sk, SOCK_BROADCAST);
1022                 break;
1023
1024         case SO_SNDBUF:
1025                 v.val = sk->sk_sndbuf;
1026                 break;
1027
1028         case SO_RCVBUF:
1029                 v.val = sk->sk_rcvbuf;
1030                 break;
1031
1032         case SO_REUSEADDR:
1033                 v.val = sk->sk_reuse;
1034                 break;
1035
1036         case SO_REUSEPORT:
1037                 v.val = sk->sk_reuseport;
1038                 break;
1039
1040         case SO_KEEPALIVE:
1041                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1042                 break;
1043
1044         case SO_TYPE:
1045                 v.val = sk->sk_type;
1046                 break;
1047
1048         case SO_PROTOCOL:
1049                 v.val = sk->sk_protocol;
1050                 break;
1051
1052         case SO_DOMAIN:
1053                 v.val = sk->sk_family;
1054                 break;
1055
1056         case SO_ERROR:
1057                 v.val = -sock_error(sk);
1058                 if (v.val == 0)
1059                         v.val = xchg(&sk->sk_err_soft, 0);
1060                 break;
1061
1062         case SO_OOBINLINE:
1063                 v.val = sock_flag(sk, SOCK_URGINLINE);
1064                 break;
1065
1066         case SO_NO_CHECK:
1067                 v.val = sk->sk_no_check;
1068                 break;
1069
1070         case SO_PRIORITY:
1071                 v.val = sk->sk_priority;
1072                 break;
1073
1074         case SO_LINGER:
1075                 lv              = sizeof(v.ling);
1076                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1077                 v.ling.l_linger = sk->sk_lingertime / HZ;
1078                 break;
1079
1080         case SO_BSDCOMPAT:
1081                 sock_warn_obsolete_bsdism("getsockopt");
1082                 break;
1083
1084         case SO_TIMESTAMP:
1085                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1086                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1087                 break;
1088
1089         case SO_TIMESTAMPNS:
1090                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1091                 break;
1092
1093         case SO_TIMESTAMPING:
1094                 v.val = 0;
1095                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1096                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1097                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1098                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1099                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1100                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1101                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1102                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1103                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1104                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
1105                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1106                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1107                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1108                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1109                 break;
1110
1111         case SO_RCVTIMEO:
1112                 lv = sizeof(struct timeval);
1113                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1114                         v.tm.tv_sec = 0;
1115                         v.tm.tv_usec = 0;
1116                 } else {
1117                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1118                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1119                 }
1120                 break;
1121
1122         case SO_SNDTIMEO:
1123                 lv = sizeof(struct timeval);
1124                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1125                         v.tm.tv_sec = 0;
1126                         v.tm.tv_usec = 0;
1127                 } else {
1128                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1129                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1130                 }
1131                 break;
1132
1133         case SO_RCVLOWAT:
1134                 v.val = sk->sk_rcvlowat;
1135                 break;
1136
1137         case SO_SNDLOWAT:
1138                 v.val = 1;
1139                 break;
1140
1141         case SO_PASSCRED:
1142                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1143                 break;
1144
1145         case SO_PEERCRED:
1146         {
1147                 struct ucred peercred;
1148                 if (len > sizeof(peercred))
1149                         len = sizeof(peercred);
1150                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1151                 if (copy_to_user(optval, &peercred, len))
1152                         return -EFAULT;
1153                 goto lenout;
1154         }
1155
1156         case SO_PEERNAME:
1157         {
1158                 char address[128];
1159
1160                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1161                         return -ENOTCONN;
1162                 if (lv < len)
1163                         return -EINVAL;
1164                 if (copy_to_user(optval, address, len))
1165                         return -EFAULT;
1166                 goto lenout;
1167         }
1168
1169         /* Dubious BSD thing... Probably nobody even uses it, but
1170          * the UNIX standard wants it for whatever reason... -DaveM
1171          */
1172         case SO_ACCEPTCONN:
1173                 v.val = sk->sk_state == TCP_LISTEN;
1174                 break;
1175
1176         case SO_PASSSEC:
1177                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1178                 break;
1179
1180         case SO_PEERSEC:
1181                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1182
1183         case SO_MARK:
1184                 v.val = sk->sk_mark;
1185                 break;
1186
1187         case SO_RXQ_OVFL:
1188                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1189                 break;
1190
1191         case SO_WIFI_STATUS:
1192                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1193                 break;
1194
1195         case SO_PEEK_OFF:
1196                 if (!sock->ops->set_peek_off)
1197                         return -EOPNOTSUPP;
1198
1199                 v.val = sk->sk_peek_off;
1200                 break;
1201         case SO_NOFCS:
1202                 v.val = sock_flag(sk, SOCK_NOFCS);
1203                 break;
1204
1205         case SO_BINDTODEVICE:
1206                 return sock_getbindtodevice(sk, optval, optlen, len);
1207
1208         case SO_GET_FILTER:
1209                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1210                 if (len < 0)
1211                         return len;
1212
1213                 goto lenout;
1214
1215         case SO_LOCK_FILTER:
1216                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1217                 break;
1218
1219         case SO_SELECT_ERR_QUEUE:
1220                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1221                 break;
1222
1223 #ifdef CONFIG_NET_RX_BUSY_POLL
1224         case SO_BUSY_POLL:
1225                 v.val = sk->sk_ll_usec;
1226                 break;
1227 #endif
1228
1229         default:
1230                 return -ENOPROTOOPT;
1231         }
1232
1233         if (len > lv)
1234                 len = lv;
1235         if (copy_to_user(optval, &v, len))
1236                 return -EFAULT;
1237 lenout:
1238         if (put_user(len, optlen))
1239                 return -EFAULT;
1240         return 0;
1241 }
1242
1243 /*
1244  * Initialize an sk_lock.
1245  *
1246  * (We also register the sk_lock with the lock validator.)
1247  */
1248 static inline void sock_lock_init(struct sock *sk)
1249 {
1250         sock_lock_init_class_and_name(sk,
1251                         af_family_slock_key_strings[sk->sk_family],
1252                         af_family_slock_keys + sk->sk_family,
1253                         af_family_key_strings[sk->sk_family],
1254                         af_family_keys + sk->sk_family);
1255 }
1256
1257 /*
1258  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1259  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1260  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1261  */
1262 static void sock_copy(struct sock *nsk, const struct sock *osk)
1263 {
1264 #ifdef CONFIG_SECURITY_NETWORK
1265         void *sptr = nsk->sk_security;
1266 #endif
1267         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1268
1269         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1270                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1271
1272 #ifdef CONFIG_SECURITY_NETWORK
1273         nsk->sk_security = sptr;
1274         security_sk_clone(osk, nsk);
1275 #endif
1276 }
1277
1278 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1279 {
1280         unsigned long nulls1, nulls2;
1281
1282         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1283         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1284         if (nulls1 > nulls2)
1285                 swap(nulls1, nulls2);
1286
1287         if (nulls1 != 0)
1288                 memset((char *)sk, 0, nulls1);
1289         memset((char *)sk + nulls1 + sizeof(void *), 0,
1290                nulls2 - nulls1 - sizeof(void *));
1291         memset((char *)sk + nulls2 + sizeof(void *), 0,
1292                size - nulls2 - sizeof(void *));
1293 }
1294 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1295
1296 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1297                 int family)
1298 {
1299         struct sock *sk;
1300         struct kmem_cache *slab;
1301
1302         slab = prot->slab;
1303         if (slab != NULL) {
1304                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1305                 if (!sk)
1306                         return sk;
1307                 if (priority & __GFP_ZERO) {
1308                         if (prot->clear_sk)
1309                                 prot->clear_sk(sk, prot->obj_size);
1310                         else
1311                                 sk_prot_clear_nulls(sk, prot->obj_size);
1312                 }
1313         } else
1314                 sk = kmalloc(prot->obj_size, priority);
1315
1316         if (sk != NULL) {
1317                 kmemcheck_annotate_bitfield(sk, flags);
1318
1319                 if (security_sk_alloc(sk, family, priority))
1320                         goto out_free;
1321
1322                 if (!try_module_get(prot->owner))
1323                         goto out_free_sec;
1324                 sk_tx_queue_clear(sk);
1325         }
1326
1327         return sk;
1328
1329 out_free_sec:
1330         security_sk_free(sk);
1331 out_free:
1332         if (slab != NULL)
1333                 kmem_cache_free(slab, sk);
1334         else
1335                 kfree(sk);
1336         return NULL;
1337 }
1338
1339 static void sk_prot_free(struct proto *prot, struct sock *sk)
1340 {
1341         struct kmem_cache *slab;
1342         struct module *owner;
1343
1344         owner = prot->owner;
1345         slab = prot->slab;
1346
1347         security_sk_free(sk);
1348         if (slab != NULL)
1349                 kmem_cache_free(slab, sk);
1350         else
1351                 kfree(sk);
1352         module_put(owner);
1353 }
1354
1355 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1356 void sock_update_classid(struct sock *sk)
1357 {
1358         u32 classid;
1359
1360         classid = task_cls_classid(current);
1361         if (classid != sk->sk_classid)
1362                 sk->sk_classid = classid;
1363 }
1364 EXPORT_SYMBOL(sock_update_classid);
1365 #endif
1366
1367 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1368 void sock_update_netprioidx(struct sock *sk)
1369 {
1370         if (in_interrupt())
1371                 return;
1372
1373         sk->sk_cgrp_prioidx = task_netprioidx(current);
1374 }
1375 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1376 #endif
1377
1378 /**
1379  *      sk_alloc - All socket objects are allocated here
1380  *      @net: the applicable net namespace
1381  *      @family: protocol family
1382  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1383  *      @prot: struct proto associated with this new sock instance
1384  */
1385 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1386                       struct proto *prot)
1387 {
1388         struct sock *sk;
1389
1390         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1391         if (sk) {
1392                 sk->sk_family = family;
1393                 /*
1394                  * See comment in struct sock definition to understand
1395                  * why we need sk_prot_creator -acme
1396                  */
1397                 sk->sk_prot = sk->sk_prot_creator = prot;
1398                 sock_lock_init(sk);
1399                 sock_net_set(sk, get_net(net));
1400                 atomic_set(&sk->sk_wmem_alloc, 1);
1401
1402                 sock_update_classid(sk);
1403                 sock_update_netprioidx(sk);
1404         }
1405
1406         return sk;
1407 }
1408 EXPORT_SYMBOL(sk_alloc);
1409
1410 static void __sk_free(struct sock *sk)
1411 {
1412         struct sk_filter *filter;
1413
1414         if (sk->sk_destruct)
1415                 sk->sk_destruct(sk);
1416
1417         filter = rcu_dereference_check(sk->sk_filter,
1418                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1419         if (filter) {
1420                 sk_filter_uncharge(sk, filter);
1421                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1422         }
1423
1424         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1425
1426         if (atomic_read(&sk->sk_omem_alloc))
1427                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1428                          __func__, atomic_read(&sk->sk_omem_alloc));
1429
1430         if (sk->sk_peer_cred)
1431                 put_cred(sk->sk_peer_cred);
1432         put_pid(sk->sk_peer_pid);
1433         put_net(sock_net(sk));
1434         sk_prot_free(sk->sk_prot_creator, sk);
1435 }
1436
1437 void sk_free(struct sock *sk)
1438 {
1439         /*
1440          * We subtract one from sk_wmem_alloc and can know if
1441          * some packets are still in some tx queue.
1442          * If not null, sock_wfree() will call __sk_free(sk) later
1443          */
1444         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1445                 __sk_free(sk);
1446 }
1447 EXPORT_SYMBOL(sk_free);
1448
1449 /*
1450  * Last sock_put should drop reference to sk->sk_net. It has already
1451  * been dropped in sk_change_net. Taking reference to stopping namespace
1452  * is not an option.
1453  * Take reference to a socket to remove it from hash _alive_ and after that
1454  * destroy it in the context of init_net.
1455  */
1456 void sk_release_kernel(struct sock *sk)
1457 {
1458         if (sk == NULL || sk->sk_socket == NULL)
1459                 return;
1460
1461         sock_hold(sk);
1462         sock_release(sk->sk_socket);
1463         release_net(sock_net(sk));
1464         sock_net_set(sk, get_net(&init_net));
1465         sock_put(sk);
1466 }
1467 EXPORT_SYMBOL(sk_release_kernel);
1468
1469 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1470 {
1471         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1472                 sock_update_memcg(newsk);
1473 }
1474
1475 /**
1476  *      sk_clone_lock - clone a socket, and lock its clone
1477  *      @sk: the socket to clone
1478  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1479  *
1480  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1481  */
1482 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1483 {
1484         struct sock *newsk;
1485
1486         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1487         if (newsk != NULL) {
1488                 struct sk_filter *filter;
1489
1490                 sock_copy(newsk, sk);
1491
1492                 /* SANITY */
1493                 get_net(sock_net(newsk));
1494                 sk_node_init(&newsk->sk_node);
1495                 sock_lock_init(newsk);
1496                 bh_lock_sock(newsk);
1497                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1498                 newsk->sk_backlog.len = 0;
1499
1500                 atomic_set(&newsk->sk_rmem_alloc, 0);
1501                 /*
1502                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1503                  */
1504                 atomic_set(&newsk->sk_wmem_alloc, 1);
1505                 atomic_set(&newsk->sk_omem_alloc, 0);
1506                 skb_queue_head_init(&newsk->sk_receive_queue);
1507                 skb_queue_head_init(&newsk->sk_write_queue);
1508 #ifdef CONFIG_NET_DMA
1509                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1510 #endif
1511
1512                 spin_lock_init(&newsk->sk_dst_lock);
1513                 rwlock_init(&newsk->sk_callback_lock);
1514                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1515                                 af_callback_keys + newsk->sk_family,
1516                                 af_family_clock_key_strings[newsk->sk_family]);
1517
1518                 newsk->sk_dst_cache     = NULL;
1519                 newsk->sk_wmem_queued   = 0;
1520                 newsk->sk_forward_alloc = 0;
1521                 newsk->sk_send_head     = NULL;
1522                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1523
1524                 sock_reset_flag(newsk, SOCK_DONE);
1525                 skb_queue_head_init(&newsk->sk_error_queue);
1526
1527                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1528                 if (filter != NULL)
1529                         sk_filter_charge(newsk, filter);
1530
1531                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1532                         /* It is still raw copy of parent, so invalidate
1533                          * destructor and make plain sk_free() */
1534                         newsk->sk_destruct = NULL;
1535                         bh_unlock_sock(newsk);
1536                         sk_free(newsk);
1537                         newsk = NULL;
1538                         goto out;
1539                 }
1540
1541                 newsk->sk_err      = 0;
1542                 newsk->sk_priority = 0;
1543                 /*
1544                  * Before updating sk_refcnt, we must commit prior changes to memory
1545                  * (Documentation/RCU/rculist_nulls.txt for details)
1546                  */
1547                 smp_wmb();
1548                 atomic_set(&newsk->sk_refcnt, 2);
1549
1550                 /*
1551                  * Increment the counter in the same struct proto as the master
1552                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1553                  * is the same as sk->sk_prot->socks, as this field was copied
1554                  * with memcpy).
1555                  *
1556                  * This _changes_ the previous behaviour, where
1557                  * tcp_create_openreq_child always was incrementing the
1558                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1559                  * to be taken into account in all callers. -acme
1560                  */
1561                 sk_refcnt_debug_inc(newsk);
1562                 sk_set_socket(newsk, NULL);
1563                 newsk->sk_wq = NULL;
1564
1565                 sk_update_clone(sk, newsk);
1566
1567                 if (newsk->sk_prot->sockets_allocated)
1568                         sk_sockets_allocated_inc(newsk);
1569
1570                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1571                         net_enable_timestamp();
1572         }
1573 out:
1574         return newsk;
1575 }
1576 EXPORT_SYMBOL_GPL(sk_clone_lock);
1577
1578 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1579 {
1580         __sk_dst_set(sk, dst);
1581         sk->sk_route_caps = dst->dev->features;
1582         if (sk->sk_route_caps & NETIF_F_GSO)
1583                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1584         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1585         if (sk_can_gso(sk)) {
1586                 if (dst->header_len) {
1587                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1588                 } else {
1589                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1590                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1591                         sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1592                 }
1593         }
1594 }
1595 EXPORT_SYMBOL_GPL(sk_setup_caps);
1596
1597 /*
1598  *      Simple resource managers for sockets.
1599  */
1600
1601
1602 /*
1603  * Write buffer destructor automatically called from kfree_skb.
1604  */
1605 void sock_wfree(struct sk_buff *skb)
1606 {
1607         struct sock *sk = skb->sk;
1608         unsigned int len = skb->truesize;
1609
1610         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1611                 /*
1612                  * Keep a reference on sk_wmem_alloc, this will be released
1613                  * after sk_write_space() call
1614                  */
1615                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1616                 sk->sk_write_space(sk);
1617                 len = 1;
1618         }
1619         /*
1620          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1621          * could not do because of in-flight packets
1622          */
1623         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1624                 __sk_free(sk);
1625 }
1626 EXPORT_SYMBOL(sock_wfree);
1627
1628 void skb_orphan_partial(struct sk_buff *skb)
1629 {
1630         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1631          * so we do not completely orphan skb, but transfert all
1632          * accounted bytes but one, to avoid unexpected reorders.
1633          */
1634         if (skb->destructor == sock_wfree
1635 #ifdef CONFIG_INET
1636             || skb->destructor == tcp_wfree
1637 #endif
1638                 ) {
1639                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1640                 skb->truesize = 1;
1641         } else {
1642                 skb_orphan(skb);
1643         }
1644 }
1645 EXPORT_SYMBOL(skb_orphan_partial);
1646
1647 /*
1648  * Read buffer destructor automatically called from kfree_skb.
1649  */
1650 void sock_rfree(struct sk_buff *skb)
1651 {
1652         struct sock *sk = skb->sk;
1653         unsigned int len = skb->truesize;
1654
1655         atomic_sub(len, &sk->sk_rmem_alloc);
1656         sk_mem_uncharge(sk, len);
1657 }
1658 EXPORT_SYMBOL(sock_rfree);
1659
1660 void sock_edemux(struct sk_buff *skb)
1661 {
1662         struct sock *sk = skb->sk;
1663
1664 #ifdef CONFIG_INET
1665         if (sk->sk_state == TCP_TIME_WAIT)
1666                 inet_twsk_put(inet_twsk(sk));
1667         else
1668 #endif
1669                 sock_put(sk);
1670 }
1671 EXPORT_SYMBOL(sock_edemux);
1672
1673 kuid_t sock_i_uid(struct sock *sk)
1674 {
1675         kuid_t uid;
1676
1677         read_lock_bh(&sk->sk_callback_lock);
1678         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1679         read_unlock_bh(&sk->sk_callback_lock);
1680         return uid;
1681 }
1682 EXPORT_SYMBOL(sock_i_uid);
1683
1684 unsigned long sock_i_ino(struct sock *sk)
1685 {
1686         unsigned long ino;
1687
1688         read_lock_bh(&sk->sk_callback_lock);
1689         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1690         read_unlock_bh(&sk->sk_callback_lock);
1691         return ino;
1692 }
1693 EXPORT_SYMBOL(sock_i_ino);
1694
1695 /*
1696  * Allocate a skb from the socket's send buffer.
1697  */
1698 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1699                              gfp_t priority)
1700 {
1701         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1702                 struct sk_buff *skb = alloc_skb(size, priority);
1703                 if (skb) {
1704                         skb_set_owner_w(skb, sk);
1705                         return skb;
1706                 }
1707         }
1708         return NULL;
1709 }
1710 EXPORT_SYMBOL(sock_wmalloc);
1711
1712 /*
1713  * Allocate a skb from the socket's receive buffer.
1714  */
1715 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1716                              gfp_t priority)
1717 {
1718         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1719                 struct sk_buff *skb = alloc_skb(size, priority);
1720                 if (skb) {
1721                         skb_set_owner_r(skb, sk);
1722                         return skb;
1723                 }
1724         }
1725         return NULL;
1726 }
1727
1728 /*
1729  * Allocate a memory block from the socket's option memory buffer.
1730  */
1731 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1732 {
1733         if ((unsigned int)size <= sysctl_optmem_max &&
1734             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1735                 void *mem;
1736                 /* First do the add, to avoid the race if kmalloc
1737                  * might sleep.
1738                  */
1739                 atomic_add(size, &sk->sk_omem_alloc);
1740                 mem = kmalloc(size, priority);
1741                 if (mem)
1742                         return mem;
1743                 atomic_sub(size, &sk->sk_omem_alloc);
1744         }
1745         return NULL;
1746 }
1747 EXPORT_SYMBOL(sock_kmalloc);
1748
1749 /*
1750  * Free an option memory block.
1751  */
1752 void sock_kfree_s(struct sock *sk, void *mem, int size)
1753 {
1754         kfree(mem);
1755         atomic_sub(size, &sk->sk_omem_alloc);
1756 }
1757 EXPORT_SYMBOL(sock_kfree_s);
1758
1759 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1760    I think, these locks should be removed for datagram sockets.
1761  */
1762 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1763 {
1764         DEFINE_WAIT(wait);
1765
1766         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1767         for (;;) {
1768                 if (!timeo)
1769                         break;
1770                 if (signal_pending(current))
1771                         break;
1772                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1773                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1774                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1775                         break;
1776                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1777                         break;
1778                 if (sk->sk_err)
1779                         break;
1780                 timeo = schedule_timeout(timeo);
1781         }
1782         finish_wait(sk_sleep(sk), &wait);
1783         return timeo;
1784 }
1785
1786
1787 /*
1788  *      Generic send/receive buffer handlers
1789  */
1790
1791 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1792                                      unsigned long data_len, int noblock,
1793                                      int *errcode, int max_page_order)
1794 {
1795         struct sk_buff *skb = NULL;
1796         unsigned long chunk;
1797         gfp_t gfp_mask;
1798         long timeo;
1799         int err;
1800         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1801         struct page *page;
1802         int i;
1803
1804         err = -EMSGSIZE;
1805         if (npages > MAX_SKB_FRAGS)
1806                 goto failure;
1807
1808         timeo = sock_sndtimeo(sk, noblock);
1809         while (!skb) {
1810                 err = sock_error(sk);
1811                 if (err != 0)
1812                         goto failure;
1813
1814                 err = -EPIPE;
1815                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1816                         goto failure;
1817
1818                 if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1819                         set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1820                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1821                         err = -EAGAIN;
1822                         if (!timeo)
1823                                 goto failure;
1824                         if (signal_pending(current))
1825                                 goto interrupted;
1826                         timeo = sock_wait_for_wmem(sk, timeo);
1827                         continue;
1828                 }
1829
1830                 err = -ENOBUFS;
1831                 gfp_mask = sk->sk_allocation;
1832                 if (gfp_mask & __GFP_WAIT)
1833                         gfp_mask |= __GFP_REPEAT;
1834
1835                 skb = alloc_skb(header_len, gfp_mask);
1836                 if (!skb)
1837                         goto failure;
1838
1839                 skb->truesize += data_len;
1840
1841                 for (i = 0; npages > 0; i++) {
1842                         int order = max_page_order;
1843
1844                         while (order) {
1845                                 if (npages >= 1 << order) {
1846                                         page = alloc_pages(sk->sk_allocation |
1847                                                            __GFP_COMP |
1848                                                            __GFP_NOWARN |
1849                                                            __GFP_NORETRY,
1850                                                            order);
1851                                         if (page)
1852                                                 goto fill_page;
1853                                 }
1854                                 order--;
1855                         }
1856                         page = alloc_page(sk->sk_allocation);
1857                         if (!page)
1858                                 goto failure;
1859 fill_page:
1860                         chunk = min_t(unsigned long, data_len,
1861                                       PAGE_SIZE << order);
1862                         skb_fill_page_desc(skb, i, page, 0, chunk);
1863                         data_len -= chunk;
1864                         npages -= 1 << order;
1865                 }
1866         }
1867
1868         skb_set_owner_w(skb, sk);
1869         return skb;
1870
1871 interrupted:
1872         err = sock_intr_errno(timeo);
1873 failure:
1874         kfree_skb(skb);
1875         *errcode = err;
1876         return NULL;
1877 }
1878 EXPORT_SYMBOL(sock_alloc_send_pskb);
1879
/*
 * Allocate a purely linear send skb of @size bytes: a thin wrapper
 * around sock_alloc_send_pskb() with no paged data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
1885 EXPORT_SYMBOL(sock_alloc_send_skb);
1886
1887 /* On 32bit arches, an skb frag is limited to 2^15 */
1888 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1889
1890 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1891 {
1892         int order;
1893
1894         if (pfrag->page) {
1895                 if (atomic_read(&pfrag->page->_count) == 1) {
1896                         pfrag->offset = 0;
1897                         return true;
1898                 }
1899                 if (pfrag->offset < pfrag->size)
1900                         return true;
1901                 put_page(pfrag->page);
1902         }
1903
1904         /* We restrict high order allocations to users that can afford to wait */
1905         order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1906
1907         do {
1908                 gfp_t gfp = sk->sk_allocation;
1909
1910                 if (order)
1911                         gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
1912                 pfrag->page = alloc_pages(gfp, order);
1913                 if (likely(pfrag->page)) {
1914                         pfrag->offset = 0;
1915                         pfrag->size = PAGE_SIZE << order;
1916                         return true;
1917                 }
1918         } while (--order >= 0);
1919
1920         sk_enter_memory_pressure(sk);
1921         sk_stream_moderate_sndbuf(sk);
1922         return false;
1923 }
1924 EXPORT_SYMBOL(sk_page_frag_refill);
1925
1926 static void __lock_sock(struct sock *sk)
1927         __releases(&sk->sk_lock.slock)
1928         __acquires(&sk->sk_lock.slock)
1929 {
1930         DEFINE_WAIT(wait);
1931
1932         for (;;) {
1933                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1934                                         TASK_UNINTERRUPTIBLE);
1935                 spin_unlock_bh(&sk->sk_lock.slock);
1936                 schedule();
1937                 spin_lock_bh(&sk->sk_lock.slock);
1938                 if (!sock_owned_by_user(sk))
1939                         break;
1940         }
1941         finish_wait(&sk->sk_lock.wq, &wait);
1942 }
1943
1944 static void __release_sock(struct sock *sk)
1945         __releases(&sk->sk_lock.slock)
1946         __acquires(&sk->sk_lock.slock)
1947 {
1948         struct sk_buff *skb = sk->sk_backlog.head;
1949
1950         do {
1951                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1952                 bh_unlock_sock(sk);
1953
1954                 do {
1955                         struct sk_buff *next = skb->next;
1956
1957                         prefetch(next);
1958                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1959                         skb->next = NULL;
1960                         sk_backlog_rcv(sk, skb);
1961
1962                         /*
1963                          * We are in process context here with softirqs
1964                          * disabled, use cond_resched_softirq() to preempt.
1965                          * This is safe to do because we've taken the backlog
1966                          * queue private:
1967                          */
1968                         cond_resched_softirq();
1969
1970                         skb = next;
1971                 } while (skb != NULL);
1972
1973                 bh_lock_sock(sk);
1974         } while ((skb = sk->sk_backlog.head) != NULL);
1975
1976         /*
1977          * Doing the zeroing here guarantee we can not loop forever
1978          * while a wild producer attempts to flood us.
1979          */
1980         sk->sk_backlog.len = 0;
1981 }
1982
1983 /**
1984  * sk_wait_data - wait for data to arrive at sk_receive_queue
1985  * @sk:    sock to wait on
1986  * @timeo: for how long
1987  *
1988  * Now socket state including sk->sk_err is changed only under lock,
1989  * hence we may omit checks after joining wait queue.
1990  * We check receive queue before schedule() only as optimization;
1991  * it is very likely that release_sock() added new data.
1992  */
1993 int sk_wait_data(struct sock *sk, long *timeo)
1994 {
1995         int rc;
1996         DEFINE_WAIT(wait);
1997
1998         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1999         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2000         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
2001         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2002         finish_wait(sk_sleep(sk), &wait);
2003         return rc;
2004 }
2005 EXPORT_SYMBOL(sk_wait_data);
2006
2007 /**
2008  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2009  *      @sk: socket
2010  *      @size: memory size to allocate
2011  *      @kind: allocation type
2012  *
2013  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2014  *      rmem allocation. This function assumes that protocols which have
2015  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2016  */
2017 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2018 {
2019         struct proto *prot = sk->sk_prot;
2020         int amt = sk_mem_pages(size);
2021         long allocated;
2022         int parent_status = UNDER_LIMIT;
2023
2024         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2025
2026         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2027
2028         /* Under limit. */
2029         if (parent_status == UNDER_LIMIT &&
2030                         allocated <= sk_prot_mem_limits(sk, 0)) {
2031                 sk_leave_memory_pressure(sk);
2032                 return 1;
2033         }
2034
2035         /* Under pressure. (we or our parents) */
2036         if ((parent_status > SOFT_LIMIT) ||
2037                         allocated > sk_prot_mem_limits(sk, 1))
2038                 sk_enter_memory_pressure(sk);
2039
2040         /* Over hard limit (we or our parents) */
2041         if ((parent_status == OVER_LIMIT) ||
2042                         (allocated > sk_prot_mem_limits(sk, 2)))
2043                 goto suppress_allocation;
2044
2045         /* guarantee minimum buffer size under pressure */
2046         if (kind == SK_MEM_RECV) {
2047                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2048                         return 1;
2049
2050         } else { /* SK_MEM_SEND */
2051                 if (sk->sk_type == SOCK_STREAM) {
2052                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2053                                 return 1;
2054                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2055                            prot->sysctl_wmem[0])
2056                                 return 1;
2057         }
2058
2059         if (sk_has_memory_pressure(sk)) {
2060                 int alloc;
2061
2062                 if (!sk_under_memory_pressure(sk))
2063                         return 1;
2064                 alloc = sk_sockets_allocated_read_positive(sk);
2065                 if (sk_prot_mem_limits(sk, 2) > alloc *
2066                     sk_mem_pages(sk->sk_wmem_queued +
2067                                  atomic_read(&sk->sk_rmem_alloc) +
2068                                  sk->sk_forward_alloc))
2069                         return 1;
2070         }
2071
2072 suppress_allocation:
2073
2074         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2075                 sk_stream_moderate_sndbuf(sk);
2076
2077                 /* Fail only if socket is _under_ its sndbuf.
2078                  * In this case we cannot block, so that we have to fail.
2079                  */
2080                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2081                         return 1;
2082         }
2083
2084         trace_sock_exceed_buf_limit(sk, prot, allocated);
2085
2086         /* Alas. Undo changes. */
2087         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2088
2089         sk_memory_allocated_sub(sk, amt);
2090
2091         return 0;
2092 }
2093 EXPORT_SYMBOL(__sk_mem_schedule);
2094
2095 /**
2096  *      __sk_reclaim - reclaim memory_allocated
2097  *      @sk: socket
2098  */
2099 void __sk_mem_reclaim(struct sock *sk)
2100 {
2101         sk_memory_allocated_sub(sk,
2102                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2103         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2104
2105         if (sk_under_memory_pressure(sk) &&
2106             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2107                 sk_leave_memory_pressure(sk);
2108 }
2109 EXPORT_SYMBOL(__sk_mem_reclaim);
2110
2111
2112 /*
2113  * Set of default routines for initialising struct proto_ops when
2114  * the protocol does not support a particular function. In certain
2115  * cases where it makes no sense for a protocol to have a "do nothing"
2116  * function, some default processing is provided.
2117  */
2118
/* Default bind() for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
2123 EXPORT_SYMBOL(sock_no_bind);
2124
/* Default connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
2130 EXPORT_SYMBOL(sock_no_connect);
2131
/* Default socketpair() for protocols without pair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
2136 EXPORT_SYMBOL(sock_no_socketpair);
2137
/* Default accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}
2142 EXPORT_SYMBOL(sock_no_accept);
2143
/*
 * Default getname() for protocols without address lookup support:
 * always -EOPNOTSUPP; @saddr and @len are left untouched.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}
2149 EXPORT_SYMBOL(sock_no_getname);
2150
2151 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2152 {
2153         return 0;
2154 }
2155 EXPORT_SYMBOL(sock_no_poll);
2156
/* Default ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
2161 EXPORT_SYMBOL(sock_no_ioctl);
2162
/* Default listen() for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
2167 EXPORT_SYMBOL(sock_no_listen);
2168
/* Default shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
2173 EXPORT_SYMBOL(sock_no_shutdown);
2174
2175 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2176                     char __user *optval, unsigned int optlen)
2177 {
2178         return -EOPNOTSUPP;
2179 }
2180 EXPORT_SYMBOL(sock_no_setsockopt);
2181
2182 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2183                     char __user *optval, int __user *optlen)
2184 {
2185         return -EOPNOTSUPP;
2186 }
2187 EXPORT_SYMBOL(sock_no_getsockopt);
2188
/* Default sendmsg() for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}
2194 EXPORT_SYMBOL(sock_no_sendmsg);
2195
/* Default recvmsg() for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}
2201 EXPORT_SYMBOL(sock_no_recvmsg);
2202
/*
 * Default mmap() for protocols without mmap support.  Returns -ENODEV,
 * mirroring the error code of a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        return -ENODEV;
}
2208 EXPORT_SYMBOL(sock_no_mmap);
2209
2210 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2211 {
2212         ssize_t res;
2213         struct msghdr msg = {.msg_flags = flags};
2214         struct kvec iov;
2215         char *kaddr = kmap(page);
2216         iov.iov_base = kaddr + offset;
2217         iov.iov_len = size;
2218         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2219         kunmap(page);
2220         return res;
2221 }
2222 EXPORT_SYMBOL(sock_no_sendpage);
2223
2224 /*
2225  *      Default Socket Callbacks
2226  */
2227
2228 static void sock_def_wakeup(struct sock *sk)
2229 {
2230         struct socket_wq *wq;
2231
2232         rcu_read_lock();
2233         wq = rcu_dereference(sk->sk_wq);
2234         if (wq_has_sleeper(wq))
2235                 wake_up_interruptible_all(&wq->wait);
2236         rcu_read_unlock();
2237 }
2238
2239 static void sock_def_error_report(struct sock *sk)
2240 {
2241         struct socket_wq *wq;
2242
2243         rcu_read_lock();
2244         wq = rcu_dereference(sk->sk_wq);
2245         if (wq_has_sleeper(wq))
2246                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2247         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2248         rcu_read_unlock();
2249 }
2250
2251 static void sock_def_readable(struct sock *sk, int len)
2252 {
2253         struct socket_wq *wq;
2254
2255         rcu_read_lock();
2256         wq = rcu_dereference(sk->sk_wq);
2257         if (wq_has_sleeper(wq))
2258                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2259                                                 POLLRDNORM | POLLRDBAND);
2260         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2261         rcu_read_unlock();
2262 }
2263
2264 static void sock_def_write_space(struct sock *sk)
2265 {
2266         struct socket_wq *wq;
2267
2268         rcu_read_lock();
2269
2270         /* Do not wake up a writer until he can make "significant"
2271          * progress.  --DaveM
2272          */
2273         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2274                 wq = rcu_dereference(sk->sk_wq);
2275                 if (wq_has_sleeper(wq))
2276                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2277                                                 POLLWRNORM | POLLWRBAND);
2278
2279                 /* Should agree with poll, otherwise some programs break */
2280                 if (sock_writeable(sk))
2281                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2282         }
2283
2284         rcu_read_unlock();
2285 }
2286
2287 static void sock_def_destruct(struct sock *sk)
2288 {
2289         kfree(sk->sk_protinfo);
2290 }
2291
2292 void sk_send_sigurg(struct sock *sk)
2293 {
2294         if (sk->sk_socket && sk->sk_socket->file)
2295                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2296                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2297 }
2298 EXPORT_SYMBOL(sk_send_sigurg);
2299
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken
 * whenever mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
2306 EXPORT_SYMBOL(sk_reset_timer);
2307
/*
 * Cancel @timer.  The reference on @sk is dropped with __sock_put()
 * whenever del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
2313 EXPORT_SYMBOL(sk_stop_timer);
2314
2315 void sock_init_data(struct socket *sock, struct sock *sk)
2316 {
2317         skb_queue_head_init(&sk->sk_receive_queue);
2318         skb_queue_head_init(&sk->sk_write_queue);
2319         skb_queue_head_init(&sk->sk_error_queue);
2320 #ifdef CONFIG_NET_DMA
2321         skb_queue_head_init(&sk->sk_async_wait_queue);
2322 #endif
2323
2324         sk->sk_send_head        =       NULL;
2325
2326         init_timer(&sk->sk_timer);
2327
2328         sk->sk_allocation       =       GFP_KERNEL;
2329         sk->sk_rcvbuf           =       sysctl_rmem_default;
2330         sk->sk_sndbuf           =       sysctl_wmem_default;
2331         sk->sk_state            =       TCP_CLOSE;
2332         sk_set_socket(sk, sock);
2333
2334         sock_set_flag(sk, SOCK_ZAPPED);
2335
2336         if (sock) {
2337                 sk->sk_type     =       sock->type;
2338                 sk->sk_wq       =       sock->wq;
2339                 sock->sk        =       sk;
2340         } else
2341                 sk->sk_wq       =       NULL;
2342
2343         spin_lock_init(&sk->sk_dst_lock);
2344         rwlock_init(&sk->sk_callback_lock);
2345         lockdep_set_class_and_name(&sk->sk_callback_lock,
2346                         af_callback_keys + sk->sk_family,
2347                         af_family_clock_key_strings[sk->sk_family]);
2348
2349         sk->sk_state_change     =       sock_def_wakeup;
2350         sk->sk_data_ready       =       sock_def_readable;
2351         sk->sk_write_space      =       sock_def_write_space;
2352         sk->sk_error_report     =       sock_def_error_report;
2353         sk->sk_destruct         =       sock_def_destruct;
2354
2355         sk->sk_frag.page        =       NULL;
2356         sk->sk_frag.offset      =       0;
2357         sk->sk_peek_off         =       -1;
2358
2359         sk->sk_peer_pid         =       NULL;
2360         sk->sk_peer_cred        =       NULL;
2361         sk->sk_write_pending    =       0;
2362         sk->sk_rcvlowat         =       1;
2363         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2364         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2365
2366         sk->sk_stamp = ktime_set(-1L, 0);
2367
2368 #ifdef CONFIG_NET_RX_BUSY_POLL
2369         sk->sk_napi_id          =       0;
2370         sk->sk_ll_usec          =       sysctl_net_busy_read;
2371 #endif
2372
2373         sk->sk_pacing_rate = ~0U;
2374         /*
2375          * Before updating sk_refcnt, we must commit prior changes to memory
2376          * (Documentation/RCU/rculist_nulls.txt for details)
2377          */
2378         smp_wmb();
2379         atomic_set(&sk->sk_refcnt, 1);
2380         atomic_set(&sk->sk_drops, 0);
2381 }
2382 EXPORT_SYMBOL(sock_init_data);
2383
2384 void lock_sock_nested(struct sock *sk, int subclass)
2385 {
2386         might_sleep();
2387         spin_lock_bh(&sk->sk_lock.slock);
2388         if (sk->sk_lock.owned)
2389                 __lock_sock(sk);
2390         sk->sk_lock.owned = 1;
2391         spin_unlock(&sk->sk_lock.slock);
2392         /*
2393          * The sk_lock has mutex_lock() semantics here:
2394          */
2395         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2396         local_bh_enable();
2397 }
2398 EXPORT_SYMBOL(lock_sock_nested);
2399
2400 void release_sock(struct sock *sk)
2401 {
2402         /*
2403          * The sk_lock has mutex_unlock() semantics:
2404          */
2405         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2406
2407         spin_lock_bh(&sk->sk_lock.slock);
2408         if (sk->sk_backlog.tail)
2409                 __release_sock(sk);
2410
2411         /* Warning : release_cb() might need to release sk ownership,
2412          * ie call sock_release_ownership(sk) before us.
2413          */
2414         if (sk->sk_prot->release_cb)
2415                 sk->sk_prot->release_cb(sk);
2416
2417         sock_release_ownership(sk);
2418         if (waitqueue_active(&sk->sk_lock.wq))
2419                 wake_up(&sk->sk_lock.wq);
2420         spin_unlock_bh(&sk->sk_lock.slock);
2421 }
2422 EXPORT_SYMBOL(release_sock);
2423
2424 /**
2425  * lock_sock_fast - fast version of lock_sock
2426  * @sk: socket
2427  *
2428  * This version should be used for very small section, where process wont block
2429  * return false if fast path is taken
2430  *   sk_lock.slock locked, owned = 0, BH disabled
2431  * return true if slow path is taken
2432  *   sk_lock.slock unlocked, owned = 1, BH enabled
2433  */
2434 bool lock_sock_fast(struct sock *sk)
2435 {
2436         might_sleep();
2437         spin_lock_bh(&sk->sk_lock.slock);
2438
2439         if (!sk->sk_lock.owned)
2440                 /*
2441                  * Note : We must disable BH
2442                  */
2443                 return false;
2444
2445         __lock_sock(sk);
2446         sk->sk_lock.owned = 1;
2447         spin_unlock(&sk->sk_lock.slock);
2448         /*
2449          * The sk_lock has mutex_lock() semantics here:
2450          */
2451         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2452         local_bh_enable();
2453         return true;
2454 }
2455 EXPORT_SYMBOL(lock_sock_fast);
2456
2457 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2458 {
2459         struct timeval tv;
2460         if (!sock_flag(sk, SOCK_TIMESTAMP))
2461                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2462         tv = ktime_to_timeval(sk->sk_stamp);
2463         if (tv.tv_sec == -1)
2464                 return -ENOENT;
2465         if (tv.tv_sec == 0) {
2466                 sk->sk_stamp = ktime_get_real();
2467                 tv = ktime_to_timeval(sk->sk_stamp);
2468         }
2469         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2470 }
2471 EXPORT_SYMBOL(sock_get_timestamp);
2472
2473 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2474 {
2475         struct timespec ts;
2476         if (!sock_flag(sk, SOCK_TIMESTAMP))
2477                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2478         ts = ktime_to_timespec(sk->sk_stamp);
2479         if (ts.tv_sec == -1)
2480                 return -ENOENT;
2481         if (ts.tv_sec == 0) {
2482                 sk->sk_stamp = ktime_get_real();
2483                 ts = ktime_to_timespec(sk->sk_stamp);
2484         }
2485         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2486 }
2487 EXPORT_SYMBOL(sock_get_timestampns);
2488
2489 void sock_enable_timestamp(struct sock *sk, int flag)
2490 {
2491         if (!sock_flag(sk, flag)) {
2492                 unsigned long previous_flags = sk->sk_flags;
2493
2494                 sock_set_flag(sk, flag);
2495                 /*
2496                  * we just set one of the two flags which require net
2497                  * time stamping, but time stamping might have been on
2498                  * already because of the other one
2499                  */
2500                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2501                         net_enable_timestamp();
2502         }
2503 }
2504
2505 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2506                        int level, int type)
2507 {
2508         struct sock_exterr_skb *serr;
2509         struct sk_buff *skb, *skb2;
2510         int copied, err;
2511
2512         err = -EAGAIN;
2513         skb = skb_dequeue(&sk->sk_error_queue);
2514         if (skb == NULL)
2515                 goto out;
2516
2517         copied = skb->len;
2518         if (copied > len) {
2519                 msg->msg_flags |= MSG_TRUNC;
2520                 copied = len;
2521         }
2522         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2523         if (err)
2524                 goto out_free_skb;
2525
2526         sock_recv_timestamp(msg, sk, skb);
2527
2528         serr = SKB_EXT_ERR(skb);
2529         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2530
2531         msg->msg_flags |= MSG_ERRQUEUE;
2532         err = copied;
2533
2534         /* Reset and regenerate socket error */
2535         spin_lock_bh(&sk->sk_error_queue.lock);
2536         sk->sk_err = 0;
2537         if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2538                 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2539                 spin_unlock_bh(&sk->sk_error_queue.lock);
2540                 sk->sk_error_report(sk);
2541         } else
2542                 spin_unlock_bh(&sk->sk_error_queue.lock);
2543
2544 out_free_skb:
2545         kfree_skb(skb);
2546 out:
2547         return err;
2548 }
2549 EXPORT_SYMBOL(sock_recv_errqueue);
2550
2551 /*
2552  *      Get a socket option on an socket.
2553  *
2554  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2555  *      asynchronous errors should be reported by getsockopt. We assume
2556  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2557  */
2558 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2559                            char __user *optval, int __user *optlen)
2560 {
2561         struct sock *sk = sock->sk;
2562
2563         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2564 }
2565 EXPORT_SYMBOL(sock_common_getsockopt);
2566
2567 #ifdef CONFIG_COMPAT
2568 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2569                                   char __user *optval, int __user *optlen)
2570 {
2571         struct sock *sk = sock->sk;
2572
2573         if (sk->sk_prot->compat_getsockopt != NULL)
2574                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2575                                                       optval, optlen);
2576         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2577 }
2578 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2579 #endif
2580
2581 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2582                         struct msghdr *msg, size_t size, int flags)
2583 {
2584         struct sock *sk = sock->sk;
2585         int addr_len = 0;
2586         int err;
2587
2588         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2589                                    flags & ~MSG_DONTWAIT, &addr_len);
2590         if (err >= 0)
2591                 msg->msg_namelen = addr_len;
2592         return err;
2593 }
2594 EXPORT_SYMBOL(sock_common_recvmsg);
2595
2596 /*
2597  *      Set socket options on an inet socket.
2598  */
2599 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2600                            char __user *optval, unsigned int optlen)
2601 {
2602         struct sock *sk = sock->sk;
2603
2604         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2605 }
2606 EXPORT_SYMBOL(sock_common_setsockopt);
2607
2608 #ifdef CONFIG_COMPAT
2609 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2610                                   char __user *optval, unsigned int optlen)
2611 {
2612         struct sock *sk = sock->sk;
2613
2614         if (sk->sk_prot->compat_setsockopt != NULL)
2615                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2616                                                       optval, optlen);
2617         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2618 }
2619 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2620 #endif
2621
2622 void sk_common_release(struct sock *sk)
2623 {
2624         if (sk->sk_prot->destroy)
2625                 sk->sk_prot->destroy(sk);
2626
2627         /*
2628          * Observation: when sock_common_release is called, processes have
2629          * no access to socket. But net still has.
2630          * Step one, detach it from networking:
2631          *
2632          * A. Remove from hash tables.
2633          */
2634
2635         sk->sk_prot->unhash(sk);
2636
2637         /*
2638          * In this point socket cannot receive new packets, but it is possible
2639          * that some packets are in flight because some CPU runs receiver and
2640          * did hash table lookup before we unhashed socket. They will achieve
2641          * receive queue and will be purged by socket destructor.
2642          *
2643          * Also we still have packets pending on receive queue and probably,
2644          * our own packets waiting in device queues. sock_destroy will drain
2645          * receive queue, but transmitted packets will delay socket destruction
2646          * until the last reference will be released.
2647          */
2648
2649         sock_orphan(sk);
2650
2651         xfrm_sk_free_policy(sk);
2652
2653         sk_refcnt_debug_release(sk);
2654
2655         if (sk->sk_frag.page) {
2656                 put_page(sk->sk_frag.page);
2657                 sk->sk_frag.page = NULL;
2658         }
2659
2660         sock_put(sk);
2661 }
2662 EXPORT_SYMBOL(sk_common_release);
2663
2664 #ifdef CONFIG_PROC_FS
/* Number of slots in each in-use counter array and in the index bitmap. */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One array of in-use counters, indexed by a protocol's inuse_idx. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of counter indexes currently handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2671
2672 #ifdef CONFIG_NET_NS
2673 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2674 {
2675         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2676 }
2677 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2678
2679 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2680 {
2681         int cpu, idx = prot->inuse_idx;
2682         int res = 0;
2683
2684         for_each_possible_cpu(cpu)
2685                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2686
2687         return res >= 0 ? res : 0;
2688 }
2689 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2690
2691 static int __net_init sock_inuse_init_net(struct net *net)
2692 {
2693         net->core.inuse = alloc_percpu(struct prot_inuse);
2694         return net->core.inuse ? 0 : -ENOMEM;
2695 }
2696
2697 static void __net_exit sock_inuse_exit_net(struct net *net)
2698 {
2699         free_percpu(net->core.inuse);
2700 }
2701
/*
 * Per-namespace hooks that allocate and free the in-use counters;
 * registered by net_inuse_init().
 */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
2706
2707 static __init int net_inuse_init(void)
2708 {
2709         if (register_pernet_subsys(&net_inuse_ops))
2710                 panic("Cannot initialize net inuse counters");
2711
2712         return 0;
2713 }
2714
2715 core_initcall(net_inuse_init);
2716 #else
/* Without CONFIG_NET_NS: a single per-CPU set of in-use counters. */
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2718
2719 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2720 {
2721         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2722 }
2723 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2724
2725 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2726 {
2727         int cpu, idx = prot->inuse_idx;
2728         int res = 0;
2729
2730         for_each_possible_cpu(cpu)
2731                 res += per_cpu(prot_inuse, cpu).val[idx];
2732
2733         return res >= 0 ? res : 0;
2734 }
2735 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2736 #endif
2737
2738 static void assign_proto_idx(struct proto *prot)
2739 {
2740         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2741
2742         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2743                 pr_err("PROTO_INUSE_NR exhausted\n");
2744                 return;
2745         }
2746
2747         set_bit(prot->inuse_idx, proto_inuse_idx);
2748 }
2749
2750 static void release_proto_idx(struct proto *prot)
2751 {
2752         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2753                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2754 }
2755 #else
/* Stub for builds without CONFIG_PROC_FS: no in-use index to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
        /* intentionally empty */
}
2759
/* Stub for builds without CONFIG_PROC_FS: no in-use index to release. */
static inline void release_proto_idx(struct proto *prot)
{
        /* intentionally empty */
}
2763 #endif
2764
2765 int proto_register(struct proto *prot, int alloc_slab)
2766 {
2767         if (alloc_slab) {
2768                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2769                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2770                                         NULL);
2771
2772                 if (prot->slab == NULL) {
2773                         pr_crit("%s: Can't create sock SLAB cache!\n",
2774                                 prot->name);
2775                         goto out;
2776                 }
2777
2778                 if (prot->rsk_prot != NULL) {
2779                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2780                         if (prot->rsk_prot->slab_name == NULL)
2781                                 goto out_free_sock_slab;
2782
2783                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2784                                                                  prot->rsk_prot->obj_size, 0,
2785                                                                  SLAB_HWCACHE_ALIGN, NULL);
2786
2787                         if (prot->rsk_prot->slab == NULL) {
2788                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2789                                         prot->name);
2790                                 goto out_free_request_sock_slab_name;
2791                         }
2792                 }
2793
2794                 if (prot->twsk_prot != NULL) {
2795                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2796
2797                         if (prot->twsk_prot->twsk_slab_name == NULL)
2798                                 goto out_free_request_sock_slab;
2799
2800                         prot->twsk_prot->twsk_slab =
2801                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2802                                                   prot->twsk_prot->twsk_obj_size,
2803                                                   0,
2804                                                   SLAB_HWCACHE_ALIGN |
2805                                                         prot->slab_flags,
2806                                                   NULL);
2807                         if (prot->twsk_prot->twsk_slab == NULL)
2808                                 goto out_free_timewait_sock_slab_name;
2809                 }
2810         }
2811
2812         mutex_lock(&proto_list_mutex);
2813         list_add(&prot->node, &proto_list);
2814         assign_proto_idx(prot);
2815         mutex_unlock(&proto_list_mutex);
2816         return 0;
2817
2818 out_free_timewait_sock_slab_name:
2819         kfree(prot->twsk_prot->twsk_slab_name);
2820 out_free_request_sock_slab:
2821         if (prot->rsk_prot && prot->rsk_prot->slab) {
2822                 kmem_cache_destroy(prot->rsk_prot->slab);
2823                 prot->rsk_prot->slab = NULL;
2824         }
2825 out_free_request_sock_slab_name:
2826         if (prot->rsk_prot)
2827                 kfree(prot->rsk_prot->slab_name);
2828 out_free_sock_slab:
2829         kmem_cache_destroy(prot->slab);
2830         prot->slab = NULL;
2831 out:
2832         return -ENOBUFS;
2833 }
2834 EXPORT_SYMBOL(proto_register);
2835
2836 void proto_unregister(struct proto *prot)
2837 {
2838         mutex_lock(&proto_list_mutex);
2839         release_proto_idx(prot);
2840         list_del(&prot->node);
2841         mutex_unlock(&proto_list_mutex);
2842
2843         if (prot->slab != NULL) {
2844                 kmem_cache_destroy(prot->slab);
2845                 prot->slab = NULL;
2846         }
2847
2848         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2849                 kmem_cache_destroy(prot->rsk_prot->slab);
2850                 kfree(prot->rsk_prot->slab_name);
2851                 prot->rsk_prot->slab = NULL;
2852         }
2853
2854         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2855                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2856                 kfree(prot->twsk_prot->twsk_slab_name);
2857                 prot->twsk_prot->twsk_slab = NULL;
2858         }
2859 }
2860 EXPORT_SYMBOL(proto_unregister);
2861
2862 #ifdef CONFIG_PROC_FS
2863 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2864         __acquires(proto_list_mutex)
2865 {
2866         mutex_lock(&proto_list_mutex);
2867         return seq_list_start_head(&proto_list, *pos);
2868 }
2869
2870 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2871 {
2872         return seq_list_next(v, &proto_list, pos);
2873 }
2874
2875 static void proto_seq_stop(struct seq_file *seq, void *v)
2876         __releases(proto_list_mutex)
2877 {
2878         mutex_unlock(&proto_list_mutex);
2879 }
2880
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
2885 static long sock_prot_memory_allocated(struct proto *proto)
2886 {
2887         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2888 }
2889
2890 static char *sock_prot_memory_pressure(struct proto *proto)
2891 {
2892         return proto->memory_pressure != NULL ?
2893         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2894 }
2895
2896 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2897 {
2898
2899         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2900                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2901                    proto->name,
2902                    proto->obj_size,
2903                    sock_prot_inuse_get(seq_file_net(seq), proto),
2904                    sock_prot_memory_allocated(proto),
2905                    sock_prot_memory_pressure(proto),
2906                    proto->max_header,
2907                    proto->slab == NULL ? "no" : "yes",
2908                    module_name(proto->owner),
2909                    proto_method_implemented(proto->close),
2910                    proto_method_implemented(proto->connect),
2911                    proto_method_implemented(proto->disconnect),
2912                    proto_method_implemented(proto->accept),
2913                    proto_method_implemented(proto->ioctl),
2914                    proto_method_implemented(proto->init),
2915                    proto_method_implemented(proto->destroy),
2916                    proto_method_implemented(proto->shutdown),
2917                    proto_method_implemented(proto->setsockopt),
2918                    proto_method_implemented(proto->getsockopt),
2919                    proto_method_implemented(proto->sendmsg),
2920                    proto_method_implemented(proto->recvmsg),
2921                    proto_method_implemented(proto->sendpage),
2922                    proto_method_implemented(proto->bind),
2923                    proto_method_implemented(proto->backlog_rcv),
2924                    proto_method_implemented(proto->hash),
2925                    proto_method_implemented(proto->unhash),
2926                    proto_method_implemented(proto->get_port),
2927                    proto_method_implemented(proto->enter_memory_pressure));
2928 }
2929
2930 static int proto_seq_show(struct seq_file *seq, void *v)
2931 {
2932         if (v == &proto_list)
2933                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2934                            "protocol",
2935                            "size",
2936                            "sockets",
2937                            "memory",
2938                            "press",
2939                            "maxhdr",
2940                            "slab",
2941                            "module",
2942                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2943         else
2944                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2945         return 0;
2946 }
2947
/* seq_file iterator over proto_list, used by proto_seq_open(). */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
2954
2955 static int proto_seq_open(struct inode *inode, struct file *file)
2956 {
2957         return seq_open_net(inode, file, &proto_seq_ops,
2958                             sizeof(struct seq_net_private));
2959 }
2960
/* File operations of the "protocols" proc entry created in proto_init_net(). */
static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
2968
2969 static __net_init int proto_init_net(struct net *net)
2970 {
2971         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2972                 return -ENOMEM;
2973
2974         return 0;
2975 }
2976
2977 static __net_exit void proto_exit_net(struct net *net)
2978 {
2979         remove_proc_entry("protocols", net->proc_net);
2980 }
2981
2982
/*
 * Per-namespace hooks that create and remove the "protocols" proc entry;
 * registered by proto_init().
 */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
2987
2988 static int __init proto_init(void)
2989 {
2990         return register_pernet_subsys(&proto_net_ops);
2991 }
2992
2993 subsys_initcall(proto_init);
2994
2995 #endif /* PROC_FS */