net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <asm/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #include <net/tcp.h>
 142 #include <net/busy_poll.h>
 143
 144 static DEFINE_MUTEX(proto_list_mutex);
 145 static LIST_HEAD(proto_list);
 146
 147 /**
 148  * sk_ns_capable - General socket capability test
 149  * @sk: Socket to use a capability on or through
 150  * @user_ns: The user namespace of the capability to use
 151  * @cap: The capability to use
 152  *
 153  * Test to see if the opener of the socket had when the socket was
 154  * created and the current process has the capability @cap in the user
 155  * namespace @user_ns.
 156  */
 157 bool sk_ns_capable(const struct sock *sk,
 158                    struct user_namespace *user_ns, int cap)
 159 {
 160         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161                 ns_capable(user_ns, cap);
 162 }
 163 EXPORT_SYMBOL(sk_ns_capable);
 164
 165 /**
 166  * sk_capable - Socket global capability test
 167  * @sk: Socket to use a capability on or through
 168  * @cap: The global capability to use
 169  *
 170  * Test to see if the opener of the socket had when the socket was
 171  * created and the current process has the capability @cap in all user
 172  * namespaces.
 173  */
 174 bool sk_capable(const struct sock *sk, int cap)
 175 {
 176         return sk_ns_capable(sk, &init_user_ns, cap);
 177 }
 178 EXPORT_SYMBOL(sk_capable);
 179
 180 /**
 181  * sk_net_capable - Network namespace socket capability test
 182  * @sk: Socket to use a capability on or through
 183  * @cap: The capability to use
 184  *
 185  * Test to see if the opener of the socket had when the socket was created
 186  * and the current process has the capability @cap over the network namespace
 187  * the socket is a member of.
 188  */
 189 bool sk_net_capable(const struct sock *sk, int cap)
 190 {
 191         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192 }
 193 EXPORT_SYMBOL(sk_net_capable);
 194
 195 /*
 196  * Each address family might have different locking rules, so we have
 197  * one slock key per address family:
 198  */
 199 static struct lock_class_key af_family_keys[AF_MAX];
 200 static struct lock_class_key af_family_slock_keys[AF_MAX];
 201
 202 /*
 203  * Make lock validator output more readable. (we pre-construct these
 204  * strings build-time, so that runtime initialization of socket
 205  * locks is fast):
 206  */
 207 static const char *const af_family_key_strings[AF_MAX+1] = {
 208   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 209   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 210   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 211   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 212   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 213   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 214   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 215   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 216   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 217   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 218   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 219   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 220   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 221   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 222   "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
 223 };
 224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 225   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 226   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 227   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 228   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 229   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 230   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 231   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 232   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 233   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 234   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 235   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 236   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 237   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 238   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 239   "slock-AF_QIPCRTR", "slock-AF_MAX"
 240 };
 241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 242   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 243   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 244   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 245   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 246   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 247   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 248   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 249   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 250   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 251   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 252   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 253   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 254   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 255   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 256   "clock-AF_QIPCRTR", "clock-AF_MAX"
 257 };
 258
 259 /*
 260  * sk_callback_lock locking rules are per-address-family,
 261  * so split the lock classes by using a per-AF key:
 262  */
 263 static struct lock_class_key af_callback_keys[AF_MAX];
 264
 265 /* Take into consideration the size of the struct sk_buff overhead in the
 266  * determination of these values, since that is non-constant across
 267  * platforms.  This makes socket queueing behavior and performance
 268  * not depend upon such differences.
 269  */
 270 #define _SK_MEM_PACKETS         256
 271 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 272 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 273 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 274
 275 /* Run time adjustable parameters. */
 276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 277 EXPORT_SYMBOL(sysctl_wmem_max);
 278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 279 EXPORT_SYMBOL(sysctl_rmem_max);
 280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 282
 283 /* Maximal space eaten by iovec or ancillary data plus some space */
 284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 285 EXPORT_SYMBOL(sysctl_optmem_max);
 286
 287 int sysctl_tstamp_allow_data __read_mostly = 1;
 288
 289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 290 EXPORT_SYMBOL_GPL(memalloc_socks);
 291
 292 /**
 293  * sk_set_memalloc - sets %SOCK_MEMALLOC
 294  * @sk: socket to set it on
 295  *
 296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 297  * It's the responsibility of the admin to adjust min_free_kbytes
 298  * to meet the requirements
 299  */
 300 void sk_set_memalloc(struct sock *sk)
 301 {
 302         sock_set_flag(sk, SOCK_MEMALLOC);
 303         sk->sk_allocation |= __GFP_MEMALLOC;
 304         static_key_slow_inc(&memalloc_socks);
 305 }
 306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 307
 308 void sk_clear_memalloc(struct sock *sk)
 309 {
 310         sock_reset_flag(sk, SOCK_MEMALLOC);
 311         sk->sk_allocation &= ~__GFP_MEMALLOC;
 312         static_key_slow_dec(&memalloc_socks);
 313
 314         /*
 315          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 316          * progress of swapping. SOCK_MEMALLOC may be cleared while
 317          * it has rmem allocations due to the last swapfile being deactivated
 318          * but there is a risk that the socket is unusable due to exceeding
 319          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 320          */
 321         sk_mem_reclaim(sk);
 322 }
 323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 324
 325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 326 {
 327         int ret;
 328         unsigned long pflags = current->flags;
 329
 330         /* these should have been dropped before queueing */
 331         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 332
 333         current->flags |= PF_MEMALLOC;
 334         ret = sk->sk_backlog_rcv(sk, skb);
 335         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 336
 337         return ret;
 338 }
 339 EXPORT_SYMBOL(__sk_backlog_rcv);
 340
 341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 342 {
 343         struct timeval tv;
 344
 345         if (optlen < sizeof(tv))
 346                 return -EINVAL;
 347         if (copy_from_user(&tv, optval, sizeof(tv)))
 348                 return -EFAULT;
 349         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 350                 return -EDOM;
 351
 352         if (tv.tv_sec < 0) {
 353                 static int warned __read_mostly;
 354
 355                 *timeo_p = 0;
 356                 if (warned < 10 && net_ratelimit()) {
 357                         warned++;
 358                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 359                                 __func__, current->comm, task_pid_nr(current));
 360                 }
 361                 return 0;
 362         }
 363         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 364         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 365                 return 0;
 366         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 367                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 368         return 0;
 369 }
 370
 371 static void sock_warn_obsolete_bsdism(const char *name)
 372 {
 373         static int warned;
 374         static char warncomm[TASK_COMM_LEN];
 375         if (strcmp(warncomm, current->comm) && warned < 5) {
 376                 strcpy(warncomm,  current->comm);
 377                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 378                         warncomm, name);
 379                 warned++;
 380         }
 381 }
 382
 383 static bool sock_needs_netstamp(const struct sock *sk)
 384 {
 385         switch (sk->sk_family) {
 386         case AF_UNSPEC:
 387         case AF_UNIX:
 388                 return false;
 389         default:
 390                 return true;
 391         }
 392 }
 393
 394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 395 {
 396         if (sk->sk_flags & flags) {
 397                 sk->sk_flags &= ~flags;
 398                 if (sock_needs_netstamp(sk) &&
 399                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 400                         net_disable_timestamp();
 401         }
 402 }
 403
 404
 405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 406 {
 407         unsigned long flags;
 408         struct sk_buff_head *list = &sk->sk_receive_queue;
 409
 410         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 411                 atomic_inc(&sk->sk_drops);
 412                 trace_sock_rcvqueue_full(sk, skb);
 413                 return -ENOMEM;
 414         }
 415
 416         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 417                 atomic_inc(&sk->sk_drops);
 418                 return -ENOBUFS;
 419         }
 420
 421         skb->dev = NULL;
 422         skb_set_owner_r(skb, sk);
 423
 424         /* we escape from rcu protected region, make sure we dont leak
 425          * a norefcounted dst
 426          */
 427         skb_dst_force(skb);
 428
 429         spin_lock_irqsave(&list->lock, flags);
 430         sock_skb_set_dropcount(sk, skb);
 431         __skb_queue_tail(list, skb);
 432         spin_unlock_irqrestore(&list->lock, flags);
 433
 434         if (!sock_flag(sk, SOCK_DEAD))
 435                 sk->sk_data_ready(sk);
 436         return 0;
 437 }
 438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 439
 440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441 {
 442         int err;
 443
 444         err = sk_filter(sk, skb);
 445         if (err)
 446                 return err;
 447
 448         return __sock_queue_rcv_skb(sk, skb);
 449 }
 450 EXPORT_SYMBOL(sock_queue_rcv_skb);
 451
 452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 453                      const int nested, unsigned int trim_cap, bool refcounted)
 454 {
 455         int rc = NET_RX_SUCCESS;
 456
 457         if (sk_filter_trim_cap(sk, skb, trim_cap))
 458                 goto discard_and_relse;
 459
 460         skb->dev = NULL;
 461
 462         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 463                 atomic_inc(&sk->sk_drops);
 464                 goto discard_and_relse;
 465         }
 466         if (nested)
 467                 bh_lock_sock_nested(sk);
 468         else
 469                 bh_lock_sock(sk);
 470         if (!sock_owned_by_user(sk)) {
 471                 /*
 472                  * trylock + unlock semantics:
 473                  */
 474                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 475
 476                 rc = sk_backlog_rcv(sk, skb);
 477
 478                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 479         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 480                 bh_unlock_sock(sk);
 481                 atomic_inc(&sk->sk_drops);
 482                 goto discard_and_relse;
 483         }
 484
 485         bh_unlock_sock(sk);
 486 out:
 487         if (refcounted)
 488                 sock_put(sk);
 489         return rc;
 490 discard_and_relse:
 491         kfree_skb(skb);
 492         goto out;
 493 }
 494 EXPORT_SYMBOL(__sk_receive_skb);
 495
 496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 497 {
 498         struct dst_entry *dst = __sk_dst_get(sk);
 499
 500         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 501                 sk_tx_queue_clear(sk);
 502                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 503                 dst_release(dst);
 504                 return NULL;
 505         }
 506
 507         return dst;
 508 }
 509 EXPORT_SYMBOL(__sk_dst_check);
 510
 511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 512 {
 513         struct dst_entry *dst = sk_dst_get(sk);
 514
 515         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 516                 sk_dst_reset(sk);
 517                 dst_release(dst);
 518                 return NULL;
 519         }
 520
 521         return dst;
 522 }
 523 EXPORT_SYMBOL(sk_dst_check);
 524
 525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 526                                 int optlen)
 527 {
 528         int ret = -ENOPROTOOPT;
 529 #ifdef CONFIG_NETDEVICES
 530         struct net *net = sock_net(sk);
 531         char devname[IFNAMSIZ];
 532         int index;
 533
 534         /* Sorry... */
 535         ret = -EPERM;
 536         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 537                 goto out;
 538
 539         ret = -EINVAL;
 540         if (optlen < 0)
 541                 goto out;
 542
 543         /* Bind this socket to a particular device like "eth0",
 544          * as specified in the passed interface name. If the
 545          * name is "" or the option length is zero the socket
 546          * is not bound.
 547          */
 548         if (optlen > IFNAMSIZ - 1)
 549                 optlen = IFNAMSIZ - 1;
 550         memset(devname, 0, sizeof(devname));
 551
 552         ret = -EFAULT;
 553         if (copy_from_user(devname, optval, optlen))
 554                 goto out;
 555
 556         index = 0;
 557         if (devname[0] != '\0') {
 558                 struct net_device *dev;
 559
 560                 rcu_read_lock();
 561                 dev = dev_get_by_name_rcu(net, devname);
 562                 if (dev)
 563                         index = dev->ifindex;
 564                 rcu_read_unlock();
 565                 ret = -ENODEV;
 566                 if (!dev)
 567                         goto out;
 568         }
 569
 570         lock_sock(sk);
 571         sk->sk_bound_dev_if = index;
 572         sk_dst_reset(sk);
 573         release_sock(sk);
 574
 575         ret = 0;
 576
 577 out:
 578 #endif
 579
 580         return ret;
 581 }
 582
 583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 584                                 int __user *optlen, int len)
 585 {
 586         int ret = -ENOPROTOOPT;
 587 #ifdef CONFIG_NETDEVICES
 588         struct net *net = sock_net(sk);
 589         char devname[IFNAMSIZ];
 590
 591         if (sk->sk_bound_dev_if == 0) {
 592                 len = 0;
 593                 goto zero;
 594         }
 595
 596         ret = -EINVAL;
 597         if (len < IFNAMSIZ)
 598                 goto out;
 599
 600         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 601         if (ret)
 602                 goto out;
 603
 604         len = strlen(devname) + 1;
 605
 606         ret = -EFAULT;
 607         if (copy_to_user(optval, devname, len))
 608                 goto out;
 609
 610 zero:
 611         ret = -EFAULT;
 612         if (put_user(len, optlen))
 613                 goto out;
 614
 615         ret = 0;
 616
 617 out:
 618 #endif
 619
 620         return ret;
 621 }
 622
 623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 624 {
 625         if (valbool)
 626                 sock_set_flag(sk, bit);
 627         else
 628                 sock_reset_flag(sk, bit);
 629 }
 630
 631 bool sk_mc_loop(struct sock *sk)
 632 {
 633         if (dev_recursion_level())
 634                 return false;
 635         if (!sk)
 636                 return true;
 637         switch (sk->sk_family) {
 638         case AF_INET:
 639                 return inet_sk(sk)->mc_loop;
 640 #if IS_ENABLED(CONFIG_IPV6)
 641         case AF_INET6:
 642                 return inet6_sk(sk)->mc_loop;
 643 #endif
 644         }
 645         WARN_ON(1);
 646         return true;
 647 }
 648 EXPORT_SYMBOL(sk_mc_loop);
 649
 650 /*
 651  *      This is meant for all protocols to use and covers goings on
 652  *      at the socket level. Everything here is generic.
 653  */
 654
 655 int sock_setsockopt(struct socket *sock, int level, int optname,
 656                     char __user *optval, unsigned int optlen)
 657 {
 658         struct sock *sk = sock->sk;
 659         int val;
 660         int valbool;
 661         struct linger ling;
 662         int ret = 0;
 663
 664         /*
 665          *      Options without arguments
 666          */
 667
 668         if (optname == SO_BINDTODEVICE)
 669                 return sock_setbindtodevice(sk, optval, optlen);
 670
 671         if (optlen < sizeof(int))
 672                 return -EINVAL;
 673
 674         if (get_user(val, (int __user *)optval))
 675                 return -EFAULT;
 676
 677         valbool = val ? 1 : 0;
 678
 679         lock_sock(sk);
 680
 681         switch (optname) {
 682         case SO_DEBUG:
 683                 if (val && !capable(CAP_NET_ADMIN))
 684                         ret = -EACCES;
 685                 else
 686                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 687                 break;
 688         case SO_REUSEADDR:
 689                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 690                 break;
 691         case SO_REUSEPORT:
 692                 sk->sk_reuseport = valbool;
 693                 break;
 694         case SO_TYPE:
 695         case SO_PROTOCOL:
 696         case SO_DOMAIN:
 697         case SO_ERROR:
 698                 ret = -ENOPROTOOPT;
 699                 break;
 700         case SO_DONTROUTE:
 701                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 702                 sk_dst_reset(sk);
 703                 break;
 704         case SO_BROADCAST:
 705                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 706                 break;
 707         case SO_SNDBUF:
 708                 /* Don't error on this BSD doesn't and if you think
 709                  * about it this is right. Otherwise apps have to
 710                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 711                  * are treated in BSD as hints
 712                  */
 713                 val = min_t(u32, val, sysctl_wmem_max);
 714 set_sndbuf:
 715                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 716                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 717                 /* Wake up sending tasks if we upped the value. */
 718                 sk->sk_write_space(sk);
 719                 break;
 720
 721         case SO_SNDBUFFORCE:
 722                 if (!capable(CAP_NET_ADMIN)) {
 723                         ret = -EPERM;
 724                         break;
 725                 }
 726                 goto set_sndbuf;
 727
 728         case SO_RCVBUF:
 729                 /* Don't error on this BSD doesn't and if you think
 730                  * about it this is right. Otherwise apps have to
 731                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 732                  * are treated in BSD as hints
 733                  */
 734                 val = min_t(u32, val, sysctl_rmem_max);
 735 set_rcvbuf:
 736                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 737                 /*
 738                  * We double it on the way in to account for
 739                  * "struct sk_buff" etc. overhead.   Applications
 740                  * assume that the SO_RCVBUF setting they make will
 741                  * allow that much actual data to be received on that
 742                  * socket.
 743                  *
 744                  * Applications are unaware that "struct sk_buff" and
 745                  * other overheads allocate from the receive buffer
 746                  * during socket buffer allocation.
 747                  *
 748                  * And after considering the possible alternatives,
 749                  * returning the value we actually used in getsockopt
 750                  * is the most desirable behavior.
 751                  */
 752                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 753                 break;
 754
 755         case SO_RCVBUFFORCE:
 756                 if (!capable(CAP_NET_ADMIN)) {
 757                         ret = -EPERM;
 758                         break;
 759                 }
 760                 goto set_rcvbuf;
 761
 762         case SO_KEEPALIVE:
 763 #ifdef CONFIG_INET
 764                 if (sk->sk_protocol == IPPROTO_TCP &&
 765                     sk->sk_type == SOCK_STREAM)
 766                         tcp_set_keepalive(sk, valbool);
 767 #endif
 768                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 769                 break;
 770
 771         case SO_OOBINLINE:
 772                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 773                 break;
 774
 775         case SO_NO_CHECK:
 776                 sk->sk_no_check_tx = valbool;
 777                 break;
 778
 779         case SO_PRIORITY:
 780                 if ((val >= 0 && val <= 6) ||
 781                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 782                         sk->sk_priority = val;
 783                 else
 784                         ret = -EPERM;
 785                 break;
 786
 787         case SO_LINGER:
 788                 if (optlen < sizeof(ling)) {
 789                         ret = -EINVAL;  /* 1003.1g */
 790                         break;
 791                 }
 792                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 793                         ret = -EFAULT;
 794                         break;
 795                 }
 796                 if (!ling.l_onoff)
 797                         sock_reset_flag(sk, SOCK_LINGER);
 798                 else {
 799 #if (BITS_PER_LONG == 32)
 800                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 801                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 802                         else
 803 #endif
 804                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 805                         sock_set_flag(sk, SOCK_LINGER);
 806                 }
 807                 break;
 808
 809         case SO_BSDCOMPAT:
 810                 sock_warn_obsolete_bsdism("setsockopt");
 811                 break;
 812
 813         case SO_PASSCRED:
 814                 if (valbool)
 815                         set_bit(SOCK_PASSCRED, &sock->flags);
 816                 else
 817                         clear_bit(SOCK_PASSCRED, &sock->flags);
 818                 break;
 819
 820         case SO_TIMESTAMP:
 821         case SO_TIMESTAMPNS:
 822                 if (valbool)  {
 823                         if (optname == SO_TIMESTAMP)
 824                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 825                         else
 826                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 827                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 828                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 829                 } else {
 830                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 831                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 832                 }
 833                 break;
 834
 835         case SO_TIMESTAMPING:
 836                 if (val & ~SOF_TIMESTAMPING_MASK) {
 837                         ret = -EINVAL;
 838                         break;
 839                 }
 840
 841                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 842                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 843                         if (sk->sk_protocol == IPPROTO_TCP &&
 844                             sk->sk_type == SOCK_STREAM) {
 845                                 if ((1 << sk->sk_state) &
 846                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 847                                         ret = -EINVAL;
 848                                         break;
 849                                 }
 850                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 851                         } else {
 852                                 sk->sk_tskey = 0;
 853                         }
 854                 }
 855                 sk->sk_tsflags = val;
 856                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 857                         sock_enable_timestamp(sk,
 858                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 859                 else
 860                         sock_disable_timestamp(sk,
 861                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 862                 break;
 863
 864         case SO_RCVLOWAT:
 865                 if (val < 0)
 866                         val = INT_MAX;
 867                 sk->sk_rcvlowat = val ? : 1;
 868                 break;
 869
 870         case SO_RCVTIMEO:
 871                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 872                 break;
 873
 874         case SO_SNDTIMEO:
 875                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 876                 break;
 877
 878         case SO_ATTACH_FILTER:
 879                 ret = -EINVAL;
 880                 if (optlen == sizeof(struct sock_fprog)) {
 881                         struct sock_fprog fprog;
 882
 883                         ret = -EFAULT;
 884                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 885                                 break;
 886
 887                         ret = sk_attach_filter(&fprog, sk);
 888                 }
 889                 break;
 890
 891         case SO_ATTACH_BPF:
 892                 ret = -EINVAL;
 893                 if (optlen == sizeof(u32)) {
 894                         u32 ufd;
 895
 896                         ret = -EFAULT;
 897                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 898                                 break;
 899
 900                         ret = sk_attach_bpf(ufd, sk);
 901                 }
 902                 break;
 903
 904         case SO_ATTACH_REUSEPORT_CBPF:
 905                 ret = -EINVAL;
 906                 if (optlen == sizeof(struct sock_fprog)) {
 907                         struct sock_fprog fprog;
 908
 909                         ret = -EFAULT;
 910                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 911                                 break;
 912
 913                         ret = sk_reuseport_attach_filter(&fprog, sk);
 914                 }
 915                 break;
 916
 917         case SO_ATTACH_REUSEPORT_EBPF:
 918                 ret = -EINVAL;
 919                 if (optlen == sizeof(u32)) {
 920                         u32 ufd;
 921
 922                         ret = -EFAULT;
 923                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 924                                 break;
 925
 926                         ret = sk_reuseport_attach_bpf(ufd, sk);
 927                 }
 928                 break;
 929
 930         case SO_DETACH_FILTER:
 931                 ret = sk_detach_filter(sk);
 932                 break;
 933
 934         case SO_LOCK_FILTER:
 935                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 936                         ret = -EPERM;
 937                 else
 938                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 939                 break;
 940
 941         case SO_PASSSEC:
 942                 if (valbool)
 943                         set_bit(SOCK_PASSSEC, &sock->flags);
 944                 else
 945                         clear_bit(SOCK_PASSSEC, &sock->flags);
 946                 break;
 947         case SO_MARK:
 948                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 949                         ret = -EPERM;
 950                 else
 951                         sk->sk_mark = val;
 952                 break;
 953
 954         case SO_RXQ_OVFL:
 955                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 956                 break;
 957
 958         case SO_WIFI_STATUS:
 959                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 960                 break;
 961
 962         case SO_PEEK_OFF:
 963                 if (sock->ops->set_peek_off)
 964                         ret = sock->ops->set_peek_off(sk, val);
 965                 else
 966                         ret = -EOPNOTSUPP;
 967                 break;
 968
 969         case SO_NOFCS:
 970                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 971                 break;
 972
 973         case SO_SELECT_ERR_QUEUE:
 974                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 975                 break;
 976
 977 #ifdef CONFIG_NET_RX_BUSY_POLL
 978         case SO_BUSY_POLL:
 979                 /* allow unprivileged users to decrease the value */
 980                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 981                         ret = -EPERM;
 982                 else {
 983                         if (val < 0)
 984                                 ret = -EINVAL;
 985                         else
 986                                 sk->sk_ll_usec = val;
 987                 }
 988                 break;
 989 #endif
 990
 991         case SO_MAX_PACING_RATE:
 992                 sk->sk_max_pacing_rate = val;
 993                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 994                                          sk->sk_max_pacing_rate);
 995                 break;
 996
 997         case SO_INCOMING_CPU:
 998                 sk->sk_incoming_cpu = val;
 999                 break;
1000
1001         case SO_CNX_ADVICE:
1002                 if (val == 1)
1003                         dst_negative_advice(sk);
1004                 break;
1005         default:
1006                 ret = -ENOPROTOOPT;
1007                 break;
1008         }
1009         release_sock(sk);
1010         return ret;
1011 }
1012 EXPORT_SYMBOL(sock_setsockopt);
1013
1014
1015 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1016                           struct ucred *ucred)
1017 {
1018         ucred->pid = pid_vnr(pid);
1019         ucred->uid = ucred->gid = -1;
1020         if (cred) {
1021                 struct user_namespace *current_ns = current_user_ns();
1022
1023                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1024                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1025         }
1026 }
1027
1028 int sock_getsockopt(struct socket *sock, int level, int optname,
1029                     char __user *optval, int __user *optlen)
1030 {
1031         struct sock *sk = sock->sk;
1032
1033         union {
1034                 int val;
1035                 struct linger ling;
1036                 struct timeval tm;
1037         } v;
1038
1039         int lv = sizeof(int);
1040         int len;
1041
1042         if (get_user(len, optlen))
1043                 return -EFAULT;
1044         if (len < 0)
1045                 return -EINVAL;
1046
1047         memset(&v, 0, sizeof(v));
1048
1049         switch (optname) {
1050         case SO_DEBUG:
1051                 v.val = sock_flag(sk, SOCK_DBG);
1052                 break;
1053
1054         case SO_DONTROUTE:
1055                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1056                 break;
1057
1058         case SO_BROADCAST:
1059                 v.val = sock_flag(sk, SOCK_BROADCAST);
1060                 break;
1061
1062         case SO_SNDBUF:
1063                 v.val = sk->sk_sndbuf;
1064                 break;
1065
1066         case SO_RCVBUF:
1067                 v.val = sk->sk_rcvbuf;
1068                 break;
1069
1070         case SO_REUSEADDR:
1071                 v.val = sk->sk_reuse;
1072                 break;
1073
1074         case SO_REUSEPORT:
1075                 v.val = sk->sk_reuseport;
1076                 break;
1077
1078         case SO_KEEPALIVE:
1079                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1080                 break;
1081
1082         case SO_TYPE:
1083                 v.val = sk->sk_type;
1084                 break;
1085
1086         case SO_PROTOCOL:
1087                 v.val = sk->sk_protocol;
1088                 break;
1089
1090         case SO_DOMAIN:
1091                 v.val = sk->sk_family;
1092                 break;
1093
1094         case SO_ERROR:
1095                 v.val = -sock_error(sk);
1096                 if (v.val == 0)
1097                         v.val = xchg(&sk->sk_err_soft, 0);
1098                 break;
1099
1100         case SO_OOBINLINE:
1101                 v.val = sock_flag(sk, SOCK_URGINLINE);
1102                 break;
1103
1104         case SO_NO_CHECK:
1105                 v.val = sk->sk_no_check_tx;
1106                 break;
1107
1108         case SO_PRIORITY:
1109                 v.val = sk->sk_priority;
1110                 break;
1111
1112         case SO_LINGER:
1113                 lv              = sizeof(v.ling);
1114                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1115                 v.ling.l_linger = sk->sk_lingertime / HZ;
1116                 break;
1117
1118         case SO_BSDCOMPAT:
1119                 sock_warn_obsolete_bsdism("getsockopt");
1120                 break;
1121
1122         case SO_TIMESTAMP:
1123                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1124                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1125                 break;
1126
1127         case SO_TIMESTAMPNS:
1128                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1129                 break;
1130
1131         case SO_TIMESTAMPING:
1132                 v.val = sk->sk_tsflags;
1133                 break;
1134
1135         case SO_RCVTIMEO:
1136                 lv = sizeof(struct timeval);
1137                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1138                         v.tm.tv_sec = 0;
1139                         v.tm.tv_usec = 0;
1140                 } else {
1141                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1142                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1143                 }
1144                 break;
1145
1146         case SO_SNDTIMEO:
1147                 lv = sizeof(struct timeval);
1148                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1149                         v.tm.tv_sec = 0;
1150                         v.tm.tv_usec = 0;
1151                 } else {
1152                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1153                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1154                 }
1155                 break;
1156
1157         case SO_RCVLOWAT:
1158                 v.val = sk->sk_rcvlowat;
1159                 break;
1160
1161         case SO_SNDLOWAT:
1162                 v.val = 1;
1163                 break;
1164
1165         case SO_PASSCRED:
1166                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1167                 break;
1168
1169         case SO_PEERCRED:
1170         {
1171                 struct ucred peercred;
1172                 if (len > sizeof(peercred))
1173                         len = sizeof(peercred);
1174                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1175                 if (copy_to_user(optval, &peercred, len))
1176                         return -EFAULT;
1177                 goto lenout;
1178         }
1179
1180         case SO_PEERNAME:
1181         {
1182                 char address[128];
1183
1184                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1185                         return -ENOTCONN;
1186                 if (lv < len)
1187                         return -EINVAL;
1188                 if (copy_to_user(optval, address, len))
1189                         return -EFAULT;
1190                 goto lenout;
1191         }
1192
1193         /* Dubious BSD thing... Probably nobody even uses it, but
1194          * the UNIX standard wants it for whatever reason... -DaveM
1195          */
1196         case SO_ACCEPTCONN:
1197                 v.val = sk->sk_state == TCP_LISTEN;
1198                 break;
1199
1200         case SO_PASSSEC:
1201                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1202                 break;
1203
1204         case SO_PEERSEC:
1205                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1206
1207         case SO_MARK:
1208                 v.val = sk->sk_mark;
1209                 break;
1210
1211         case SO_RXQ_OVFL:
1212                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1213                 break;
1214
1215         case SO_WIFI_STATUS:
1216                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1217                 break;
1218
1219         case SO_PEEK_OFF:
1220                 if (!sock->ops->set_peek_off)
1221                         return -EOPNOTSUPP;
1222
1223                 v.val = sk->sk_peek_off;
1224                 break;
1225         case SO_NOFCS:
1226                 v.val = sock_flag(sk, SOCK_NOFCS);
1227                 break;
1228
1229         case SO_BINDTODEVICE:
1230                 return sock_getbindtodevice(sk, optval, optlen, len);
1231
1232         case SO_GET_FILTER:
1233                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1234                 if (len < 0)
1235                         return len;
1236
1237                 goto lenout;
1238
1239         case SO_LOCK_FILTER:
1240                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1241                 break;
1242
1243         case SO_BPF_EXTENSIONS:
1244                 v.val = bpf_tell_extensions();
1245                 break;
1246
1247         case SO_SELECT_ERR_QUEUE:
1248                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1249                 break;
1250
1251 #ifdef CONFIG_NET_RX_BUSY_POLL
1252         case SO_BUSY_POLL:
1253                 v.val = sk->sk_ll_usec;
1254                 break;
1255 #endif
1256
1257         case SO_MAX_PACING_RATE:
1258                 v.val = sk->sk_max_pacing_rate;
1259                 break;
1260
1261         case SO_INCOMING_CPU:
1262                 v.val = sk->sk_incoming_cpu;
1263                 break;
1264
1265         default:
1266                 /* We implement the SO_SNDLOWAT etc to not be settable
1267                  * (1003.1g 7).
1268                  */
1269                 return -ENOPROTOOPT;
1270         }
1271
1272         if (len > lv)
1273                 len = lv;
1274         if (copy_to_user(optval, &v, len))
1275                 return -EFAULT;
1276 lenout:
1277         if (put_user(len, optlen))
1278                 return -EFAULT;
1279         return 0;
1280 }
1281
1282 /*
1283  * Initialize an sk_lock.
1284  *
1285  * (We also register the sk_lock with the lock validator.)
1286  */
1287 static inline void sock_lock_init(struct sock *sk)
1288 {
1289         sock_lock_init_class_and_name(sk,
1290                         af_family_slock_key_strings[sk->sk_family],
1291                         af_family_slock_keys + sk->sk_family,
1292                         af_family_key_strings[sk->sk_family],
1293                         af_family_keys + sk->sk_family);
1294 }
1295
1296 /*
1297  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1298  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1299  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1300  */
1301 static void sock_copy(struct sock *nsk, const struct sock *osk)
1302 {
1303 #ifdef CONFIG_SECURITY_NETWORK
1304         void *sptr = nsk->sk_security;
1305 #endif
1306         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1307
1308         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1309                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1310
1311 #ifdef CONFIG_SECURITY_NETWORK
1312         nsk->sk_security = sptr;
1313         security_sk_clone(osk, nsk);
1314 #endif
1315 }
1316
1317 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1318                 int family)
1319 {
1320         struct sock *sk;
1321         struct kmem_cache *slab;
1322
1323         slab = prot->slab;
1324         if (slab != NULL) {
1325                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1326                 if (!sk)
1327                         return sk;
1328                 if (priority & __GFP_ZERO)
1329                         sk_prot_clear_nulls(sk, prot->obj_size);
1330         } else
1331                 sk = kmalloc(prot->obj_size, priority);
1332
1333         if (sk != NULL) {
1334                 kmemcheck_annotate_bitfield(sk, flags);
1335
1336                 if (security_sk_alloc(sk, family, priority))
1337                         goto out_free;
1338
1339                 if (!try_module_get(prot->owner))
1340                         goto out_free_sec;
1341                 sk_tx_queue_clear(sk);
1342         }
1343
1344         return sk;
1345
1346 out_free_sec:
1347         security_sk_free(sk);
1348 out_free:
1349         if (slab != NULL)
1350                 kmem_cache_free(slab, sk);
1351         else
1352                 kfree(sk);
1353         return NULL;
1354 }
1355
1356 static void sk_prot_free(struct proto *prot, struct sock *sk)
1357 {
1358         struct kmem_cache *slab;
1359         struct module *owner;
1360
1361         owner = prot->owner;
1362         slab = prot->slab;
1363
1364         cgroup_sk_free(&sk->sk_cgrp_data);
1365         mem_cgroup_sk_free(sk);
1366         security_sk_free(sk);
1367         if (slab != NULL)
1368                 kmem_cache_free(slab, sk);
1369         else
1370                 kfree(sk);
1371         module_put(owner);
1372 }
1373
1374 /**
1375  *      sk_alloc - All socket objects are allocated here
1376  *      @net: the applicable net namespace
1377  *      @family: protocol family
1378  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1379  *      @prot: struct proto associated with this new sock instance
1380  *      @kern: is this to be a kernel socket?
1381  */
1382 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1383                       struct proto *prot, int kern)
1384 {
1385         struct sock *sk;
1386
1387         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1388         if (sk) {
1389                 sk->sk_family = family;
1390                 /*
1391                  * See comment in struct sock definition to understand
1392                  * why we need sk_prot_creator -acme
1393                  */
1394                 sk->sk_prot = sk->sk_prot_creator = prot;
1395                 sock_lock_init(sk);
1396                 sk->sk_net_refcnt = kern ? 0 : 1;
1397                 if (likely(sk->sk_net_refcnt))
1398                         get_net(net);
1399                 sock_net_set(sk, net);
1400                 atomic_set(&sk->sk_wmem_alloc, 1);
1401
1402                 mem_cgroup_sk_alloc(sk);
1403                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1404                 sock_update_classid(&sk->sk_cgrp_data);
1405                 sock_update_netprioidx(&sk->sk_cgrp_data);
1406                 sk_tx_queue_clear(sk);
1407         }
1408
1409         return sk;
1410 }
1411 EXPORT_SYMBOL(sk_alloc);
1412
1413 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1414  * grace period. This is the case for UDP sockets and TCP listeners.
1415  */
1416 static void __sk_destruct(struct rcu_head *head)
1417 {
1418         struct sock *sk = container_of(head, struct sock, sk_rcu);
1419         struct sk_filter *filter;
1420
1421         if (sk->sk_destruct)
1422                 sk->sk_destruct(sk);
1423
1424         filter = rcu_dereference_check(sk->sk_filter,
1425                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1426         if (filter) {
1427                 sk_filter_uncharge(sk, filter);
1428                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1429         }
1430
1431         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1432
1433         if (atomic_read(&sk->sk_omem_alloc))
1434                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1435                          __func__, atomic_read(&sk->sk_omem_alloc));
1436
1437         if (sk->sk_frag.page) {
1438                 put_page(sk->sk_frag.page);
1439                 sk->sk_frag.page = NULL;
1440         }
1441
1442         if (sk->sk_peer_cred)
1443                 put_cred(sk->sk_peer_cred);
1444         put_pid(sk->sk_peer_pid);
1445         if (likely(sk->sk_net_refcnt))
1446                 put_net(sock_net(sk));
1447         sk_prot_free(sk->sk_prot_creator, sk);
1448 }
1449
1450 void sk_destruct(struct sock *sk)
1451 {
1452         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1453
1454         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1455                 reuseport_detach_sock(sk);
1456                 use_call_rcu = true;
1457         }
1458
1459         if (use_call_rcu)
1460                 call_rcu(&sk->sk_rcu, __sk_destruct);
1461         else
1462                 __sk_destruct(&sk->sk_rcu);
1463 }
1464
1465 static void __sk_free(struct sock *sk)
1466 {
1467         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1468                 sock_diag_broadcast_destroy(sk);
1469         else
1470                 sk_destruct(sk);
1471 }
1472
1473 void sk_free(struct sock *sk)
1474 {
1475         /*
1476          * We subtract one from sk_wmem_alloc and can know if
1477          * some packets are still in some tx queue.
1478          * If not null, sock_wfree() will call __sk_free(sk) later
1479          */
1480         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1481                 __sk_free(sk);
1482 }
1483 EXPORT_SYMBOL(sk_free);
1484
1485 /**
1486  *      sk_clone_lock - clone a socket, and lock its clone
1487  *      @sk: the socket to clone
1488  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1489  *
1490  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1491  */
1492 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1493 {
1494         struct sock *newsk;
1495         bool is_charged = true;
1496
1497         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1498         if (newsk != NULL) {
1499                 struct sk_filter *filter;
1500
1501                 sock_copy(newsk, sk);
1502
1503                 newsk->sk_prot_creator = sk->sk_prot;
1504
1505                 /* SANITY */
1506                 if (likely(newsk->sk_net_refcnt))
1507                         get_net(sock_net(newsk));
1508                 sk_node_init(&newsk->sk_node);
1509                 sock_lock_init(newsk);
1510                 bh_lock_sock(newsk);
1511                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1512                 newsk->sk_backlog.len = 0;
1513
1514                 atomic_set(&newsk->sk_rmem_alloc, 0);
1515                 /*
1516                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1517                  */
1518                 atomic_set(&newsk->sk_wmem_alloc, 1);
1519                 atomic_set(&newsk->sk_omem_alloc, 0);
1520                 skb_queue_head_init(&newsk->sk_receive_queue);
1521                 skb_queue_head_init(&newsk->sk_write_queue);
1522
1523                 rwlock_init(&newsk->sk_callback_lock);
1524                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1525                                 af_callback_keys + newsk->sk_family,
1526                                 af_family_clock_key_strings[newsk->sk_family]);
1527
1528                 newsk->sk_dst_cache     = NULL;
1529                 newsk->sk_wmem_queued   = 0;
1530                 newsk->sk_forward_alloc = 0;
1531                 atomic_set(&newsk->sk_drops, 0);
1532                 newsk->sk_send_head     = NULL;
1533                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1534
1535                 sock_reset_flag(newsk, SOCK_DONE);
1536                 cgroup_sk_clone(&newsk->sk_cgrp_data);
1537                 skb_queue_head_init(&newsk->sk_error_queue);
1538
1539                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1540                 if (filter != NULL)
1541                         /* though it's an empty new sock, the charging may fail
1542                          * if sysctl_optmem_max was changed between creation of
1543                          * original socket and cloning
1544                          */
1545                         is_charged = sk_filter_charge(newsk, filter);
1546
1547                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1548                         /* We need to make sure that we don't uncharge the new
1549                          * socket if we couldn't charge it in the first place
1550                          * as otherwise we uncharge the parent's filter.
1551                          */
1552                         if (!is_charged)
1553                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1554                         /* It is still raw copy of parent, so invalidate
1555                          * destructor and make plain sk_free() */
1556                         newsk->sk_destruct = NULL;
1557                         bh_unlock_sock(newsk);
1558                         sk_free(newsk);
1559                         newsk = NULL;
1560                         goto out;
1561                 }
1562                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1563
1564                 newsk->sk_err      = 0;
1565                 newsk->sk_err_soft = 0;
1566                 newsk->sk_priority = 0;
1567                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1568                 atomic64_set(&newsk->sk_cookie, 0);
1569
1570                 mem_cgroup_sk_alloc(newsk);
1571                 /*
1572                  * Before updating sk_refcnt, we must commit prior changes to memory
1573                  * (Documentation/RCU/rculist_nulls.txt for details)
1574                  */
1575                 smp_wmb();
1576                 atomic_set(&newsk->sk_refcnt, 2);
1577
1578                 /*
1579                  * Increment the counter in the same struct proto as the master
1580                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1581                  * is the same as sk->sk_prot->socks, as this field was copied
1582                  * with memcpy).
1583                  *
1584                  * This _changes_ the previous behaviour, where
1585                  * tcp_create_openreq_child always was incrementing the
1586                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1587                  * to be taken into account in all callers. -acme
1588                  */
1589                 sk_refcnt_debug_inc(newsk);
1590                 sk_set_socket(newsk, NULL);
1591                 sk_tx_queue_clear(newsk);
1592                 newsk->sk_wq = NULL;
1593
1594                 if (newsk->sk_prot->sockets_allocated)
1595                         sk_sockets_allocated_inc(newsk);
1596
1597                 if (sock_needs_netstamp(sk) &&
1598                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1599                         net_enable_timestamp();
1600         }
1601 out:
1602         return newsk;
1603 }
1604 EXPORT_SYMBOL_GPL(sk_clone_lock);
1605
1606 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1607 {
1608         u32 max_segs = 1;
1609
1610         sk_dst_set(sk, dst);
1611         sk->sk_route_caps = dst->dev->features;
1612         if (sk->sk_route_caps & NETIF_F_GSO)
1613                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1614         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1615         if (sk_can_gso(sk)) {
1616                 if (dst->header_len) {
1617                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1618                 } else {
1619                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1620                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1621                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1622                 }
1623         }
1624         sk->sk_gso_max_segs = max_segs;
1625 }
1626 EXPORT_SYMBOL_GPL(sk_setup_caps);
1627
1628 /*
1629  *      Simple resource managers for sockets.
1630  */
1631
1632
1633 /*
1634  * Write buffer destructor automatically called from kfree_skb.
1635  */
1636 void sock_wfree(struct sk_buff *skb)
1637 {
1638         struct sock *sk = skb->sk;
1639         unsigned int len = skb->truesize;
1640
1641         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1642                 /*
1643                  * Keep a reference on sk_wmem_alloc, this will be released
1644                  * after sk_write_space() call
1645                  */
1646                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1647                 sk->sk_write_space(sk);
1648                 len = 1;
1649         }
1650         /*
1651          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1652          * could not do because of in-flight packets
1653          */
1654         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1655                 __sk_free(sk);
1656 }
1657 EXPORT_SYMBOL(sock_wfree);
1658
1659 /* This variant of sock_wfree() is used by TCP,
1660  * since it sets SOCK_USE_WRITE_QUEUE.
1661  */
1662 void __sock_wfree(struct sk_buff *skb)
1663 {
1664         struct sock *sk = skb->sk;
1665
1666         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1667                 __sk_free(sk);
1668 }
1669
1670 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1671 {
1672         skb_orphan(skb);
1673         skb->sk = sk;
1674 #ifdef CONFIG_INET
1675         if (unlikely(!sk_fullsock(sk))) {
1676                 skb->destructor = sock_edemux;
1677                 sock_hold(sk);
1678                 return;
1679         }
1680 #endif
1681         skb->destructor = sock_wfree;
1682         skb_set_hash_from_sk(skb, sk);
1683         /*
1684          * We used to take a refcount on sk, but following operation
1685          * is enough to guarantee sk_free() wont free this sock until
1686          * all in-flight packets are completed
1687          */
1688         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1689 }
1690 EXPORT_SYMBOL(skb_set_owner_w);
1691
1692 /* This helper is used by netem, as it can hold packets in its
1693  * delay queue. We want to allow the owner socket to send more
1694  * packets, as if they were already TX completed by a typical driver.
1695  * But we also want to keep skb->sk set because some packet schedulers
1696  * rely on it (sch_fq for example).
1697  */
1698 void skb_orphan_partial(struct sk_buff *skb)
1699 {
1700         if (skb_is_tcp_pure_ack(skb))
1701                 return;
1702
1703         if (skb->destructor == sock_wfree
1704 #ifdef CONFIG_INET
1705             || skb->destructor == tcp_wfree
1706 #endif
1707                 ) {
1708                 struct sock *sk = skb->sk;
1709
1710                 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1711                         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1712                         skb->destructor = sock_efree;
1713                 }
1714         } else {
1715                 skb_orphan(skb);
1716         }
1717 }
1718 EXPORT_SYMBOL(skb_orphan_partial);
1719
1720 /*
1721  * Read buffer destructor automatically called from kfree_skb.
1722  */
1723 void sock_rfree(struct sk_buff *skb)
1724 {
1725         struct sock *sk = skb->sk;
1726         unsigned int len = skb->truesize;
1727
1728         atomic_sub(len, &sk->sk_rmem_alloc);
1729         sk_mem_uncharge(sk, len);
1730 }
1731 EXPORT_SYMBOL(sock_rfree);
1732
1733 /*
1734  * Buffer destructor for skbs that are not used directly in read or write
1735  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1736  */
1737 void sock_efree(struct sk_buff *skb)
1738 {
1739         sock_put(skb->sk);
1740 }
1741 EXPORT_SYMBOL(sock_efree);
1742
1743 kuid_t sock_i_uid(struct sock *sk)
1744 {
1745         kuid_t uid;
1746
1747         read_lock_bh(&sk->sk_callback_lock);
1748         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1749         read_unlock_bh(&sk->sk_callback_lock);
1750         return uid;
1751 }
1752 EXPORT_SYMBOL(sock_i_uid);
1753
1754 unsigned long sock_i_ino(struct sock *sk)
1755 {
1756         unsigned long ino;
1757
1758         read_lock_bh(&sk->sk_callback_lock);
1759         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1760         read_unlock_bh(&sk->sk_callback_lock);
1761         return ino;
1762 }
1763 EXPORT_SYMBOL(sock_i_ino);
1764
1765 /*
1766  * Allocate a skb from the socket's send buffer.
1767  */
1768 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1769                              gfp_t priority)
1770 {
1771         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1772                 struct sk_buff *skb = alloc_skb(size, priority);
1773                 if (skb) {
1774                         skb_set_owner_w(skb, sk);
1775                         return skb;
1776                 }
1777         }
1778         return NULL;
1779 }
1780 EXPORT_SYMBOL(sock_wmalloc);
1781
1782 /*
1783  * Allocate a memory block from the socket's option memory buffer.
1784  */
1785 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1786 {
1787         if ((unsigned int)size <= sysctl_optmem_max &&
1788             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1789                 void *mem;
1790                 /* First do the add, to avoid the race if kmalloc
1791                  * might sleep.
1792                  */
1793                 atomic_add(size, &sk->sk_omem_alloc);
1794                 mem = kmalloc(size, priority);
1795                 if (mem)
1796                         return mem;
1797                 atomic_sub(size, &sk->sk_omem_alloc);
1798         }
1799         return NULL;
1800 }
1801 EXPORT_SYMBOL(sock_kmalloc);
1802
1803 /* Free an option memory block. Note, we actually want the inline
1804  * here as this allows gcc to detect the nullify and fold away the
1805  * condition entirely.
1806  */
1807 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1808                                   const bool nullify)
1809 {
1810         if (WARN_ON_ONCE(!mem))
1811                 return;
1812         if (nullify)
1813                 kzfree(mem);
1814         else
1815                 kfree(mem);
1816         atomic_sub(size, &sk->sk_omem_alloc);
1817 }
1818
1819 void sock_kfree_s(struct sock *sk, void *mem, int size)
1820 {
1821         __sock_kfree_s(sk, mem, size, false);
1822 }
1823 EXPORT_SYMBOL(sock_kfree_s);
1824
1825 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1826 {
1827         __sock_kfree_s(sk, mem, size, true);
1828 }
1829 EXPORT_SYMBOL(sock_kzfree_s);
1830
1831 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1832    I think, these locks should be removed for datagram sockets.
1833  */
1834 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1835 {
1836         DEFINE_WAIT(wait);
1837
1838         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1839         for (;;) {
1840                 if (!timeo)
1841                         break;
1842                 if (signal_pending(current))
1843                         break;
1844                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1845                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1846                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1847                         break;
1848                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1849                         break;
1850                 if (sk->sk_err)
1851                         break;
1852                 timeo = schedule_timeout(timeo);
1853         }
1854         finish_wait(sk_sleep(sk), &wait);
1855         return timeo;
1856 }
1857
1858
1859 /*
1860  *      Generic send/receive buffer handlers
1861  */
1862
1863 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1864                                      unsigned long data_len, int noblock,
1865                                      int *errcode, int max_page_order)
1866 {
1867         struct sk_buff *skb;
1868         long timeo;
1869         int err;
1870
1871         timeo = sock_sndtimeo(sk, noblock);
1872         for (;;) {
1873                 err = sock_error(sk);
1874                 if (err != 0)
1875                         goto failure;
1876
1877                 err = -EPIPE;
1878                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1879                         goto failure;
1880
1881                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1882                         break;
1883
1884                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1885                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1886                 err = -EAGAIN;
1887                 if (!timeo)
1888                         goto failure;
1889                 if (signal_pending(current))
1890                         goto interrupted;
1891                 timeo = sock_wait_for_wmem(sk, timeo);
1892         }
1893         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1894                                    errcode, sk->sk_allocation);
1895         if (skb)
1896                 skb_set_owner_w(skb, sk);
1897         return skb;
1898
1899 interrupted:
1900         err = sock_intr_errno(timeo);
1901 failure:
1902         *errcode = err;
1903         return NULL;
1904 }
1905 EXPORT_SYMBOL(sock_alloc_send_pskb);
1906
1907 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1908                                     int noblock, int *errcode)
1909 {
1910         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1911 }
1912 EXPORT_SYMBOL(sock_alloc_send_skb);
1913
1914 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1915                      struct sockcm_cookie *sockc)
1916 {
1917         u32 tsflags;
1918
1919         switch (cmsg->cmsg_type) {
1920         case SO_MARK:
1921                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1922                         return -EPERM;
1923                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1924                         return -EINVAL;
1925                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1926                 break;
1927         case SO_TIMESTAMPING:
1928                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1929                         return -EINVAL;
1930
1931                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1932                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1933                         return -EINVAL;
1934
1935                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1936                 sockc->tsflags |= tsflags;
1937                 break;
1938         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1939         case SCM_RIGHTS:
1940         case SCM_CREDENTIALS:
1941                 break;
1942         default:
1943                 return -EINVAL;
1944         }
1945         return 0;
1946 }
1947 EXPORT_SYMBOL(__sock_cmsg_send);
1948
1949 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1950                    struct sockcm_cookie *sockc)
1951 {
1952         struct cmsghdr *cmsg;
1953         int ret;
1954
1955         for_each_cmsghdr(cmsg, msg) {
1956                 if (!CMSG_OK(msg, cmsg))
1957                         return -EINVAL;
1958                 if (cmsg->cmsg_level != SOL_SOCKET)
1959                         continue;
1960                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1961                 if (ret)
1962                         return ret;
1963         }
1964         return 0;
1965 }
1966 EXPORT_SYMBOL(sock_cmsg_send);
1967
1968 /* On 32bit arches, an skb frag is limited to 2^15 */
1969 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1970
1971 /**
1972  * skb_page_frag_refill - check that a page_frag contains enough room
1973  * @sz: minimum size of the fragment we want to get
1974  * @pfrag: pointer to page_frag
1975  * @gfp: priority for memory allocation
1976  *
1977  * Note: While this allocator tries to use high order pages, there is
1978  * no guarantee that allocations succeed. Therefore, @sz MUST be
1979  * less or equal than PAGE_SIZE.
1980  */
1981 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1982 {
1983         if (pfrag->page) {
1984                 if (page_ref_count(pfrag->page) == 1) {
1985                         pfrag->offset = 0;
1986                         return true;
1987                 }
1988                 if (pfrag->offset + sz <= pfrag->size)
1989                         return true;
1990                 put_page(pfrag->page);
1991         }
1992
1993         pfrag->offset = 0;
1994         if (SKB_FRAG_PAGE_ORDER) {
1995                 /* Avoid direct reclaim but allow kswapd to wake */
1996                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1997                                           __GFP_COMP | __GFP_NOWARN |
1998                                           __GFP_NORETRY,
1999                                           SKB_FRAG_PAGE_ORDER);
2000                 if (likely(pfrag->page)) {
2001                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2002                         return true;
2003                 }
2004         }
2005         pfrag->page = alloc_page(gfp);
2006         if (likely(pfrag->page)) {
2007                 pfrag->size = PAGE_SIZE;
2008                 return true;
2009         }
2010         return false;
2011 }
2012 EXPORT_SYMBOL(skb_page_frag_refill);
2013
2014 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2015 {
2016         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2017                 return true;
2018
2019         sk_enter_memory_pressure(sk);
2020         sk_stream_moderate_sndbuf(sk);
2021         return false;
2022 }
2023 EXPORT_SYMBOL(sk_page_frag_refill);
2024
2025 static void __lock_sock(struct sock *sk)
2026         __releases(&sk->sk_lock.slock)
2027         __acquires(&sk->sk_lock.slock)
2028 {
2029         DEFINE_WAIT(wait);
2030
2031         for (;;) {
2032                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2033                                         TASK_UNINTERRUPTIBLE);
2034                 spin_unlock_bh(&sk->sk_lock.slock);
2035                 schedule();
2036                 spin_lock_bh(&sk->sk_lock.slock);
2037                 if (!sock_owned_by_user(sk))
2038                         break;
2039         }
2040         finish_wait(&sk->sk_lock.wq, &wait);
2041 }
2042
2043 static void __release_sock(struct sock *sk)
2044         __releases(&sk->sk_lock.slock)
2045         __acquires(&sk->sk_lock.slock)
2046 {
2047         struct sk_buff *skb, *next;
2048
2049         while ((skb = sk->sk_backlog.head) != NULL) {
2050                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2051
2052                 spin_unlock_bh(&sk->sk_lock.slock);
2053
2054                 do {
2055                         next = skb->next;
2056                         prefetch(next);
2057                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2058                         skb->next = NULL;
2059                         sk_backlog_rcv(sk, skb);
2060
2061                         cond_resched();
2062
2063                         skb = next;
2064                 } while (skb != NULL);
2065
2066                 spin_lock_bh(&sk->sk_lock.slock);
2067         }
2068
2069         /*
2070          * Doing the zeroing here guarantee we can not loop forever
2071          * while a wild producer attempts to flood us.
2072          */
2073         sk->sk_backlog.len = 0;
2074 }
2075
2076 void __sk_flush_backlog(struct sock *sk)
2077 {
2078         spin_lock_bh(&sk->sk_lock.slock);
2079         __release_sock(sk);
2080         spin_unlock_bh(&sk->sk_lock.slock);
2081 }
2082
2083 /**
2084  * sk_wait_data - wait for data to arrive at sk_receive_queue
2085  * @sk:    sock to wait on
2086  * @timeo: for how long
2087  * @skb:   last skb seen on sk_receive_queue
2088  *
2089  * Now socket state including sk->sk_err is changed only under lock,
2090  * hence we may omit checks after joining wait queue.
2091  * We check receive queue before schedule() only as optimization;
2092  * it is very likely that release_sock() added new data.
2093  */
2094 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2095 {
2096         int rc;
2097         DEFINE_WAIT(wait);
2098
2099         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2100         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2101         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2102         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2103         finish_wait(sk_sleep(sk), &wait);
2104         return rc;
2105 }
2106 EXPORT_SYMBOL(sk_wait_data);
2107
2108 /**
2109  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2110  *      @sk: socket
2111  *      @size: memory size to allocate
2112  *      @kind: allocation type
2113  *
2114  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2115  *      rmem allocation. This function assumes that protocols which have
2116  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2117  */
2118 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2119 {
2120         struct proto *prot = sk->sk_prot;
2121         int amt = sk_mem_pages(size);
2122         long allocated;
2123
2124         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2125
2126         allocated = sk_memory_allocated_add(sk, amt);
2127
2128         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2129             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2130                 goto suppress_allocation;
2131
2132         /* Under limit. */
2133         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2134                 sk_leave_memory_pressure(sk);
2135                 return 1;
2136         }
2137
2138         /* Under pressure. */
2139         if (allocated > sk_prot_mem_limits(sk, 1))
2140                 sk_enter_memory_pressure(sk);
2141
2142         /* Over hard limit. */
2143         if (allocated > sk_prot_mem_limits(sk, 2))
2144                 goto suppress_allocation;
2145
2146         /* guarantee minimum buffer size under pressure */
2147         if (kind == SK_MEM_RECV) {
2148                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2149                         return 1;
2150
2151         } else { /* SK_MEM_SEND */
2152                 if (sk->sk_type == SOCK_STREAM) {
2153                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2154                                 return 1;
2155                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2156                            prot->sysctl_wmem[0])
2157                                 return 1;
2158         }
2159
2160         if (sk_has_memory_pressure(sk)) {
2161                 u64 alloc;
2162
2163                 if (!sk_under_memory_pressure(sk))
2164                         return 1;
2165                 alloc = sk_sockets_allocated_read_positive(sk);
2166                 if (sk_prot_mem_limits(sk, 2) > alloc *
2167                     sk_mem_pages(sk->sk_wmem_queued +
2168                                  atomic_read(&sk->sk_rmem_alloc) +
2169                                  sk->sk_forward_alloc))
2170                         return 1;
2171         }
2172
2173 suppress_allocation:
2174
2175         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2176                 sk_stream_moderate_sndbuf(sk);
2177
2178                 /* Fail only if socket is _under_ its sndbuf.
2179                  * In this case we cannot block, so that we have to fail.
2180                  */
2181                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2182                         return 1;
2183         }
2184
2185         trace_sock_exceed_buf_limit(sk, prot, allocated);
2186
2187         /* Alas. Undo changes. */
2188         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2189
2190         sk_memory_allocated_sub(sk, amt);
2191
2192         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2193                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2194
2195         return 0;
2196 }
2197 EXPORT_SYMBOL(__sk_mem_schedule);
2198
2199 /**
2200  *      __sk_mem_reclaim - reclaim memory_allocated
2201  *      @sk: socket
2202  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2203  */
2204 void __sk_mem_reclaim(struct sock *sk, int amount)
2205 {
2206         amount >>= SK_MEM_QUANTUM_SHIFT;
2207         sk_memory_allocated_sub(sk, amount);
2208         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2209
2210         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2211                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2212
2213         if (sk_under_memory_pressure(sk) &&
2214             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2215                 sk_leave_memory_pressure(sk);
2216 }
2217 EXPORT_SYMBOL(__sk_mem_reclaim);
2218
2219 int sk_set_peek_off(struct sock *sk, int val)
2220 {
2221         if (val < 0)
2222                 return -EINVAL;
2223
2224         sk->sk_peek_off = val;
2225         return 0;
2226 }
2227 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2228
2229 /*
2230  * Set of default routines for initialising struct proto_ops when
2231  * the protocol does not support a particular function. In certain
2232  * cases where it makes no sense for a protocol to have a "do nothing"
2233  * function, some default processing is provided.
2234  */
2235
2236 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2237 {
2238         return -EOPNOTSUPP;
2239 }
2240 EXPORT_SYMBOL(sock_no_bind);
2241
2242 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2243                     int len, int flags)
2244 {
2245         return -EOPNOTSUPP;
2246 }
2247 EXPORT_SYMBOL(sock_no_connect);
2248
2249 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2250 {
2251         return -EOPNOTSUPP;
2252 }
2253 EXPORT_SYMBOL(sock_no_socketpair);
2254
2255 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2256 {
2257         return -EOPNOTSUPP;
2258 }
2259 EXPORT_SYMBOL(sock_no_accept);
2260
2261 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2262                     int *len, int peer)
2263 {
2264         return -EOPNOTSUPP;
2265 }
2266 EXPORT_SYMBOL(sock_no_getname);
2267
2268 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2269 {
2270         return 0;
2271 }
2272 EXPORT_SYMBOL(sock_no_poll);
2273
2274 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2275 {
2276         return -EOPNOTSUPP;
2277 }
2278 EXPORT_SYMBOL(sock_no_ioctl);
2279
2280 int sock_no_listen(struct socket *sock, int backlog)
2281 {
2282         return -EOPNOTSUPP;
2283 }
2284 EXPORT_SYMBOL(sock_no_listen);
2285
2286 int sock_no_shutdown(struct socket *sock, int how)
2287 {
2288         return -EOPNOTSUPP;
2289 }
2290 EXPORT_SYMBOL(sock_no_shutdown);
2291
2292 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2293                     char __user *optval, unsigned int optlen)
2294 {
2295         return -EOPNOTSUPP;
2296 }
2297 EXPORT_SYMBOL(sock_no_setsockopt);
2298
2299 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2300                     char __user *optval, int __user *optlen)
2301 {
2302         return -EOPNOTSUPP;
2303 }
2304 EXPORT_SYMBOL(sock_no_getsockopt);
2305
2306 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2307 {
2308         return -EOPNOTSUPP;
2309 }
2310 EXPORT_SYMBOL(sock_no_sendmsg);
2311
2312 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2313                     int flags)
2314 {
2315         return -EOPNOTSUPP;
2316 }
2317 EXPORT_SYMBOL(sock_no_recvmsg);
2318
2319 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2320 {
2321         /* Mirror missing mmap method error code */
2322         return -ENODEV;
2323 }
2324 EXPORT_SYMBOL(sock_no_mmap);
2325
2326 /*
2327  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2328  * various sock-based usage counts.
2329  */
2330 void __receive_sock(struct file *file)
2331 {
2332         struct socket *sock;
2333         int error;
2334
2335         /*
2336          * The resulting value of "error" is ignored here since we only
2337          * need to take action when the file is a socket and testing
2338          * "sock" for NULL is sufficient.
2339          */
2340         sock = sock_from_file(file, &error);
2341         if (sock) {
2342                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2343                 sock_update_classid(&sock->sk->sk_cgrp_data);
2344         }
2345 }
2346
2347 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2348 {
2349         ssize_t res;
2350         struct msghdr msg = {.msg_flags = flags};
2351         struct kvec iov;
2352         char *kaddr = kmap(page);
2353         iov.iov_base = kaddr + offset;
2354         iov.iov_len = size;
2355         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2356         kunmap(page);
2357         return res;
2358 }
2359 EXPORT_SYMBOL(sock_no_sendpage);
2360
2361 /*
2362  *      Default Socket Callbacks
2363  */
2364
2365 static void sock_def_wakeup(struct sock *sk)
2366 {
2367         struct socket_wq *wq;
2368
2369         rcu_read_lock();
2370         wq = rcu_dereference(sk->sk_wq);
2371         if (skwq_has_sleeper(wq))
2372                 wake_up_interruptible_all(&wq->wait);
2373         rcu_read_unlock();
2374 }
2375
2376 static void sock_def_error_report(struct sock *sk)
2377 {
2378         struct socket_wq *wq;
2379
2380         rcu_read_lock();
2381         wq = rcu_dereference(sk->sk_wq);
2382         if (skwq_has_sleeper(wq))
2383                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2384         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2385         rcu_read_unlock();
2386 }
2387
2388 static void sock_def_readable(struct sock *sk)
2389 {
2390         struct socket_wq *wq;
2391
2392         rcu_read_lock();
2393         wq = rcu_dereference(sk->sk_wq);
2394         if (skwq_has_sleeper(wq))
2395                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2396                                                 POLLRDNORM | POLLRDBAND);
2397         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2398         rcu_read_unlock();
2399 }
2400
2401 static void sock_def_write_space(struct sock *sk)
2402 {
2403         struct socket_wq *wq;
2404
2405         rcu_read_lock();
2406
2407         /* Do not wake up a writer until he can make "significant"
2408          * progress.  --DaveM
2409          */
2410         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2411                 wq = rcu_dereference(sk->sk_wq);
2412                 if (skwq_has_sleeper(wq))
2413                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2414                                                 POLLWRNORM | POLLWRBAND);
2415
2416                 /* Should agree with poll, otherwise some programs break */
2417                 if (sock_writeable(sk))
2418                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2419         }
2420
2421         rcu_read_unlock();
2422 }
2423
2424 static void sock_def_destruct(struct sock *sk)
2425 {
2426 }
2427
2428 void sk_send_sigurg(struct sock *sk)
2429 {
2430         if (sk->sk_socket && sk->sk_socket->file)
2431                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2432                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2433 }
2434 EXPORT_SYMBOL(sk_send_sigurg);
2435
2436 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2437                     unsigned long expires)
2438 {
2439         if (!mod_timer(timer, expires))
2440                 sock_hold(sk);
2441 }
2442 EXPORT_SYMBOL(sk_reset_timer);
2443
2444 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2445 {
2446         if (del_timer(timer))
2447                 __sock_put(sk);
2448 }
2449 EXPORT_SYMBOL(sk_stop_timer);
2450
2451 void sock_init_data(struct socket *sock, struct sock *sk)
2452 {
2453         skb_queue_head_init(&sk->sk_receive_queue);
2454         skb_queue_head_init(&sk->sk_write_queue);
2455         skb_queue_head_init(&sk->sk_error_queue);
2456
2457         sk->sk_send_head        =       NULL;
2458
2459         init_timer(&sk->sk_timer);
2460
2461         sk->sk_allocation       =       GFP_KERNEL;
2462         sk->sk_rcvbuf           =       sysctl_rmem_default;
2463         sk->sk_sndbuf           =       sysctl_wmem_default;
2464         sk->sk_state            =       TCP_CLOSE;
2465         sk_set_socket(sk, sock);
2466
2467         sock_set_flag(sk, SOCK_ZAPPED);
2468
2469         if (sock) {
2470                 sk->sk_type     =       sock->type;
2471                 sk->sk_wq       =       sock->wq;
2472                 sock->sk        =       sk;
2473         } else
2474                 sk->sk_wq       =       NULL;
2475
2476         rwlock_init(&sk->sk_callback_lock);
2477         lockdep_set_class_and_name(&sk->sk_callback_lock,
2478                         af_callback_keys + sk->sk_family,
2479                         af_family_clock_key_strings[sk->sk_family]);
2480
2481         sk->sk_state_change     =       sock_def_wakeup;
2482         sk->sk_data_ready       =       sock_def_readable;
2483         sk->sk_write_space      =       sock_def_write_space;
2484         sk->sk_error_report     =       sock_def_error_report;
2485         sk->sk_destruct         =       sock_def_destruct;
2486
2487         sk->sk_frag.page        =       NULL;
2488         sk->sk_frag.offset      =       0;
2489         sk->sk_peek_off         =       -1;
2490
2491         sk->sk_peer_pid         =       NULL;
2492         sk->sk_peer_cred        =       NULL;
2493         sk->sk_write_pending    =       0;
2494         sk->sk_rcvlowat         =       1;
2495         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2496         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2497
2498         sk->sk_stamp = ktime_set(-1L, 0);
2499 #if BITS_PER_LONG==32
2500         seqlock_init(&sk->sk_stamp_seq);
2501 #endif
2502
2503 #ifdef CONFIG_NET_RX_BUSY_POLL
2504         sk->sk_napi_id          =       0;
2505         sk->sk_ll_usec          =       sysctl_net_busy_read;
2506 #endif
2507
2508         sk->sk_max_pacing_rate = ~0U;
2509         sk->sk_pacing_rate = ~0U;
2510         sk->sk_incoming_cpu = -1;
2511         /*
2512          * Before updating sk_refcnt, we must commit prior changes to memory
2513          * (Documentation/RCU/rculist_nulls.txt for details)
2514          */
2515         smp_wmb();
2516         atomic_set(&sk->sk_refcnt, 1);
2517         atomic_set(&sk->sk_drops, 0);
2518 }
2519 EXPORT_SYMBOL(sock_init_data);
2520
2521 void lock_sock_nested(struct sock *sk, int subclass)
2522 {
2523         might_sleep();
2524         spin_lock_bh(&sk->sk_lock.slock);
2525         if (sk->sk_lock.owned)
2526                 __lock_sock(sk);
2527         sk->sk_lock.owned = 1;
2528         spin_unlock(&sk->sk_lock.slock);
2529         /*
2530          * The sk_lock has mutex_lock() semantics here:
2531          */
2532         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2533         local_bh_enable();
2534 }
2535 EXPORT_SYMBOL(lock_sock_nested);
2536
2537 void release_sock(struct sock *sk)
2538 {
2539         spin_lock_bh(&sk->sk_lock.slock);
2540         if (sk->sk_backlog.tail)
2541                 __release_sock(sk);
2542
2543         /* Warning : release_cb() might need to release sk ownership,
2544          * ie call sock_release_ownership(sk) before us.
2545          */
2546         if (sk->sk_prot->release_cb)
2547                 sk->sk_prot->release_cb(sk);
2548
2549         sock_release_ownership(sk);
2550         if (waitqueue_active(&sk->sk_lock.wq))
2551                 wake_up(&sk->sk_lock.wq);
2552         spin_unlock_bh(&sk->sk_lock.slock);
2553 }
2554 EXPORT_SYMBOL(release_sock);
2555
2556 /**
2557  * lock_sock_fast - fast version of lock_sock
2558  * @sk: socket
2559  *
2560  * This version should be used for very small section, where process wont block
2561  * return false if fast path is taken
2562  *   sk_lock.slock locked, owned = 0, BH disabled
2563  * return true if slow path is taken
2564  *   sk_lock.slock unlocked, owned = 1, BH enabled
2565  */
2566 bool lock_sock_fast(struct sock *sk)
2567 {
2568         might_sleep();
2569         spin_lock_bh(&sk->sk_lock.slock);
2570
2571         if (!sk->sk_lock.owned)
2572                 /*
2573                  * Note : We must disable BH
2574                  */
2575                 return false;
2576
2577         __lock_sock(sk);
2578         sk->sk_lock.owned = 1;
2579         spin_unlock(&sk->sk_lock.slock);
2580         /*
2581          * The sk_lock has mutex_lock() semantics here:
2582          */
2583         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2584         local_bh_enable();
2585         return true;
2586 }
2587 EXPORT_SYMBOL(lock_sock_fast);
2588
2589 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2590 {
2591         struct timeval tv;
2592         if (!sock_flag(sk, SOCK_TIMESTAMP))
2593                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2594         tv = ktime_to_timeval(sk->sk_stamp);
2595         if (tv.tv_sec == -1)
2596                 return -ENOENT;
2597         if (tv.tv_sec == 0) {
2598                 sk->sk_stamp = ktime_get_real();
2599                 tv = ktime_to_timeval(sk->sk_stamp);
2600         }
2601         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2602 }
2603 EXPORT_SYMBOL(sock_get_timestamp);
2604
2605 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2606 {
2607         struct timespec ts;
2608         if (!sock_flag(sk, SOCK_TIMESTAMP))
2609                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2610         ts = ktime_to_timespec(sk->sk_stamp);
2611         if (ts.tv_sec == -1)
2612                 return -ENOENT;
2613         if (ts.tv_sec == 0) {
2614                 sk->sk_stamp = ktime_get_real();
2615                 ts = ktime_to_timespec(sk->sk_stamp);
2616         }
2617         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2618 }
2619 EXPORT_SYMBOL(sock_get_timestampns);
2620
2621 void sock_enable_timestamp(struct sock *sk, int flag)
2622 {
2623         if (!sock_flag(sk, flag)) {
2624                 unsigned long previous_flags = sk->sk_flags;
2625
2626                 sock_set_flag(sk, flag);
2627                 /*
2628                  * we just set one of the two flags which require net
2629                  * time stamping, but time stamping might have been on
2630                  * already because of the other one
2631                  */
2632                 if (sock_needs_netstamp(sk) &&
2633                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2634                         net_enable_timestamp();
2635         }
2636 }
2637
2638 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2639                        int level, int type)
2640 {
2641         struct sock_exterr_skb *serr;
2642         struct sk_buff *skb;
2643         int copied, err;
2644
2645         err = -EAGAIN;
2646         skb = sock_dequeue_err_skb(sk);
2647         if (skb == NULL)
2648                 goto out;
2649
2650         copied = skb->len;
2651         if (copied > len) {
2652                 msg->msg_flags |= MSG_TRUNC;
2653                 copied = len;
2654         }
2655         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2656         if (err)
2657                 goto out_free_skb;
2658
2659         sock_recv_timestamp(msg, sk, skb);
2660
2661         serr = SKB_EXT_ERR(skb);
2662         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2663
2664         msg->msg_flags |= MSG_ERRQUEUE;
2665         err = copied;
2666
2667 out_free_skb:
2668         kfree_skb(skb);
2669 out:
2670         return err;
2671 }
2672 EXPORT_SYMBOL(sock_recv_errqueue);
2673
2674 /*
2675  *      Get a socket option on an socket.
2676  *
2677  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2678  *      asynchronous errors should be reported by getsockopt. We assume
2679  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2680  */
2681 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2682                            char __user *optval, int __user *optlen)
2683 {
2684         struct sock *sk = sock->sk;
2685
2686         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2687 }
2688 EXPORT_SYMBOL(sock_common_getsockopt);
2689
2690 #ifdef CONFIG_COMPAT
2691 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2692                                   char __user *optval, int __user *optlen)
2693 {
2694         struct sock *sk = sock->sk;
2695
2696         if (sk->sk_prot->compat_getsockopt != NULL)
2697                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2698                                                       optval, optlen);
2699         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2700 }
2701 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2702 #endif
2703
2704 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2705                         int flags)
2706 {
2707         struct sock *sk = sock->sk;
2708         int addr_len = 0;
2709         int err;
2710
2711         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2712                                    flags & ~MSG_DONTWAIT, &addr_len);
2713         if (err >= 0)
2714                 msg->msg_namelen = addr_len;
2715         return err;
2716 }
2717 EXPORT_SYMBOL(sock_common_recvmsg);
2718
2719 /*
2720  *      Set socket options on an inet socket.
2721  */
2722 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2723                            char __user *optval, unsigned int optlen)
2724 {
2725         struct sock *sk = sock->sk;
2726
2727         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2728 }
2729 EXPORT_SYMBOL(sock_common_setsockopt);
2730
2731 #ifdef CONFIG_COMPAT
2732 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2733                                   char __user *optval, unsigned int optlen)
2734 {
2735         struct sock *sk = sock->sk;
2736
2737         if (sk->sk_prot->compat_setsockopt != NULL)
2738                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2739                                                       optval, optlen);
2740         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2741 }
2742 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2743 #endif
2744
2745 void sk_common_release(struct sock *sk)
2746 {
2747         if (sk->sk_prot->destroy)
2748                 sk->sk_prot->destroy(sk);
2749
2750         /*
2751          * Observation: when sock_common_release is called, processes have
2752          * no access to socket. But net still has.
2753          * Step one, detach it from networking:
2754          *
2755          * A. Remove from hash tables.
2756          */
2757
2758         sk->sk_prot->unhash(sk);
2759
2760         /*
2761          * In this point socket cannot receive new packets, but it is possible
2762          * that some packets are in flight because some CPU runs receiver and
2763          * did hash table lookup before we unhashed socket. They will achieve
2764          * receive queue and will be purged by socket destructor.
2765          *
2766          * Also we still have packets pending on receive queue and probably,
2767          * our own packets waiting in device queues. sock_destroy will drain
2768          * receive queue, but transmitted packets will delay socket destruction
2769          * until the last reference will be released.
2770          */
2771
2772         sock_orphan(sk);
2773
2774         xfrm_sk_free_policy(sk);
2775
2776         sk_refcnt_debug_release(sk);
2777
2778         sock_put(sk);
2779 }
2780 EXPORT_SYMBOL(sk_common_release);
2781
2782 #ifdef CONFIG_PROC_FS
2783 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2784 struct prot_inuse {
2785         int val[PROTO_INUSE_NR];
2786 };
2787
2788 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2789
2790 #ifdef CONFIG_NET_NS
2791 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2792 {
2793         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2794 }
2795 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2796
2797 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2798 {
2799         int cpu, idx = prot->inuse_idx;
2800         int res = 0;
2801
2802         for_each_possible_cpu(cpu)
2803                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2804
2805         return res >= 0 ? res : 0;
2806 }
2807 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2808
2809 static int __net_init sock_inuse_init_net(struct net *net)
2810 {
2811         net->core.inuse = alloc_percpu(struct prot_inuse);
2812         return net->core.inuse ? 0 : -ENOMEM;
2813 }
2814
2815 static void __net_exit sock_inuse_exit_net(struct net *net)
2816 {
2817         free_percpu(net->core.inuse);
2818 }
2819
2820 static struct pernet_operations net_inuse_ops = {
2821         .init = sock_inuse_init_net,
2822         .exit = sock_inuse_exit_net,
2823 };
2824
2825 static __init int net_inuse_init(void)
2826 {
2827         if (register_pernet_subsys(&net_inuse_ops))
2828                 panic("Cannot initialize net inuse counters");
2829
2830         return 0;
2831 }
2832
2833 core_initcall(net_inuse_init);
2834 #else
2835 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2836
2837 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2838 {
2839         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2840 }
2841 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2842
2843 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2844 {
2845         int cpu, idx = prot->inuse_idx;
2846         int res = 0;
2847
2848         for_each_possible_cpu(cpu)
2849                 res += per_cpu(prot_inuse, cpu).val[idx];
2850
2851         return res >= 0 ? res : 0;
2852 }
2853 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2854 #endif
2855
2856 static void assign_proto_idx(struct proto *prot)
2857 {
2858         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2859
2860         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2861                 pr_err("PROTO_INUSE_NR exhausted\n");
2862                 return;
2863         }
2864
2865         set_bit(prot->inuse_idx, proto_inuse_idx);
2866 }
2867
2868 static void release_proto_idx(struct proto *prot)
2869 {
2870         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2871                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2872 }
2873 #else
2874 static inline void assign_proto_idx(struct proto *prot)
2875 {
2876 }
2877
2878 static inline void release_proto_idx(struct proto *prot)
2879 {
2880 }
2881 #endif
2882
2883 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2884 {
2885         if (!rsk_prot)
2886                 return;
2887         kfree(rsk_prot->slab_name);
2888         rsk_prot->slab_name = NULL;
2889         kmem_cache_destroy(rsk_prot->slab);
2890         rsk_prot->slab = NULL;
2891 }
2892
2893 static int req_prot_init(const struct proto *prot)
2894 {
2895         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2896
2897         if (!rsk_prot)
2898                 return 0;
2899
2900         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2901                                         prot->name);
2902         if (!rsk_prot->slab_name)
2903                 return -ENOMEM;
2904
2905         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2906                                            rsk_prot->obj_size, 0,
2907                                            prot->slab_flags, NULL);
2908
2909         if (!rsk_prot->slab) {
2910                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2911                         prot->name);
2912                 return -ENOMEM;
2913         }
2914         return 0;
2915 }
2916
2917 int proto_register(struct proto *prot, int alloc_slab)
2918 {
2919         if (alloc_slab) {
2920                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2921                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2922                                         NULL);
2923
2924                 if (prot->slab == NULL) {
2925                         pr_crit("%s: Can't create sock SLAB cache!\n",
2926                                 prot->name);
2927                         goto out;
2928                 }
2929
2930                 if (req_prot_init(prot))
2931                         goto out_free_request_sock_slab;
2932
2933                 if (prot->twsk_prot != NULL) {
2934                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2935
2936                         if (prot->twsk_prot->twsk_slab_name == NULL)
2937                                 goto out_free_request_sock_slab;
2938
2939                         prot->twsk_prot->twsk_slab =
2940                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2941                                                   prot->twsk_prot->twsk_obj_size,
2942                                                   0,
2943                                                   prot->slab_flags,
2944                                                   NULL);
2945                         if (prot->twsk_prot->twsk_slab == NULL)
2946                                 goto out_free_timewait_sock_slab_name;
2947                 }
2948         }
2949
2950         mutex_lock(&proto_list_mutex);
2951         list_add(&prot->node, &proto_list);
2952         assign_proto_idx(prot);
2953         mutex_unlock(&proto_list_mutex);
2954         return 0;
2955
2956 out_free_timewait_sock_slab_name:
2957         kfree(prot->twsk_prot->twsk_slab_name);
2958 out_free_request_sock_slab:
2959         req_prot_cleanup(prot->rsk_prot);
2960
2961         kmem_cache_destroy(prot->slab);
2962         prot->slab = NULL;
2963 out:
2964         return -ENOBUFS;
2965 }
2966 EXPORT_SYMBOL(proto_register);
2967
2968 void proto_unregister(struct proto *prot)
2969 {
2970         mutex_lock(&proto_list_mutex);
2971         release_proto_idx(prot);
2972         list_del(&prot->node);
2973         mutex_unlock(&proto_list_mutex);
2974
2975         kmem_cache_destroy(prot->slab);
2976         prot->slab = NULL;
2977
2978         req_prot_cleanup(prot->rsk_prot);
2979
2980         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2981                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2982                 kfree(prot->twsk_prot->twsk_slab_name);
2983                 prot->twsk_prot->twsk_slab = NULL;
2984         }
2985 }
2986 EXPORT_SYMBOL(proto_unregister);
2987
2988 #ifdef CONFIG_PROC_FS
2989 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2990         __acquires(proto_list_mutex)
2991 {
2992         mutex_lock(&proto_list_mutex);
2993         return seq_list_start_head(&proto_list, *pos);
2994 }
2995
2996 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2997 {
2998         return seq_list_next(v, &proto_list, pos);
2999 }
3000
3001 static void proto_seq_stop(struct seq_file *seq, void *v)
3002         __releases(proto_list_mutex)
3003 {
3004         mutex_unlock(&proto_list_mutex);
3005 }
3006
3007 static char proto_method_implemented(const void *method)
3008 {
3009         return method == NULL ? 'n' : 'y';
3010 }
3011 static long sock_prot_memory_allocated(struct proto *proto)
3012 {
3013         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3014 }
3015
3016 static char *sock_prot_memory_pressure(struct proto *proto)
3017 {
3018         return proto->memory_pressure != NULL ?
3019         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3020 }
3021
3022 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3023 {
3024
3025         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3026                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3027                    proto->name,
3028                    proto->obj_size,
3029                    sock_prot_inuse_get(seq_file_net(seq), proto),
3030                    sock_prot_memory_allocated(proto),
3031                    sock_prot_memory_pressure(proto),
3032                    proto->max_header,
3033                    proto->slab == NULL ? "no" : "yes",
3034                    module_name(proto->owner),
3035                    proto_method_implemented(proto->close),
3036                    proto_method_implemented(proto->connect),
3037                    proto_method_implemented(proto->disconnect),
3038                    proto_method_implemented(proto->accept),
3039                    proto_method_implemented(proto->ioctl),
3040                    proto_method_implemented(proto->init),
3041                    proto_method_implemented(proto->destroy),
3042                    proto_method_implemented(proto->shutdown),
3043                    proto_method_implemented(proto->setsockopt),
3044                    proto_method_implemented(proto->getsockopt),
3045                    proto_method_implemented(proto->sendmsg),
3046                    proto_method_implemented(proto->recvmsg),
3047                    proto_method_implemented(proto->sendpage),
3048                    proto_method_implemented(proto->bind),
3049                    proto_method_implemented(proto->backlog_rcv),
3050                    proto_method_implemented(proto->hash),
3051                    proto_method_implemented(proto->unhash),
3052                    proto_method_implemented(proto->get_port),
3053                    proto_method_implemented(proto->enter_memory_pressure));
3054 }
3055
3056 static int proto_seq_show(struct seq_file *seq, void *v)
3057 {
3058         if (v == &proto_list)
3059                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3060                            "protocol",
3061                            "size",
3062                            "sockets",
3063                            "memory",
3064                            "press",
3065                            "maxhdr",
3066                            "slab",
3067                            "module",
3068                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3069         else
3070                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3071         return 0;
3072 }
3073
3074 static const struct seq_operations proto_seq_ops = {
3075         .start  = proto_seq_start,
3076         .next   = proto_seq_next,
3077         .stop   = proto_seq_stop,
3078         .show   = proto_seq_show,
3079 };
3080
3081 static int proto_seq_open(struct inode *inode, struct file *file)
3082 {
3083         return seq_open_net(inode, file, &proto_seq_ops,
3084                             sizeof(struct seq_net_private));
3085 }
3086
3087 static const struct file_operations proto_seq_fops = {
3088         .owner          = THIS_MODULE,
3089         .open           = proto_seq_open,
3090         .read           = seq_read,
3091         .llseek         = seq_lseek,
3092         .release        = seq_release_net,
3093 };
3094
3095 static __net_init int proto_init_net(struct net *net)
3096 {
3097         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3098                 return -ENOMEM;
3099
3100         return 0;
3101 }
3102
3103 static __net_exit void proto_exit_net(struct net *net)
3104 {
3105         remove_proc_entry("protocols", net->proc_net);
3106 }
3107
3108
3109 static __net_initdata struct pernet_operations proto_net_ops = {
3110         .init = proto_init_net,
3111         .exit = proto_exit_net,
3112 };
3113
3114 static int __init proto_init(void)
3115 {
3116         return register_pernet_subsys(&proto_net_ops);
3117 }
3118
3119 subsys_initcall(proto_init);
3120
3121 #endif /* PROC_FS */