net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <asm/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #include <net/tcp.h>
 142 #include <net/busy_poll.h>
 143
 144 static DEFINE_MUTEX(proto_list_mutex);
 145 static LIST_HEAD(proto_list);
 146
 147 /**
 148  * sk_ns_capable - General socket capability test
 149  * @sk: Socket to use a capability on or through
 150  * @user_ns: The user namespace of the capability to use
 151  * @cap: The capability to use
 152  *
 153  * Test to see if the opener of the socket had when the socket was
 154  * created and the current process has the capability @cap in the user
 155  * namespace @user_ns.
 156  */
 157 bool sk_ns_capable(const struct sock *sk,
 158                    struct user_namespace *user_ns, int cap)
 159 {
 160         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161                 ns_capable(user_ns, cap);
 162 }
 163 EXPORT_SYMBOL(sk_ns_capable);
 164
 165 /**
 166  * sk_capable - Socket global capability test
 167  * @sk: Socket to use a capability on or through
 168  * @cap: The global capability to use
 169  *
 170  * Test to see if the opener of the socket had when the socket was
 171  * created and the current process has the capability @cap in all user
 172  * namespaces.
 173  */
 174 bool sk_capable(const struct sock *sk, int cap)
 175 {
 176         return sk_ns_capable(sk, &init_user_ns, cap);
 177 }
 178 EXPORT_SYMBOL(sk_capable);
 179
 180 /**
 181  * sk_net_capable - Network namespace socket capability test
 182  * @sk: Socket to use a capability on or through
 183  * @cap: The capability to use
 184  *
 185  * Test to see if the opener of the socket had when the socket was created
 186  * and the current process has the capability @cap over the network namespace
 187  * the socket is a member of.
 188  */
 189 bool sk_net_capable(const struct sock *sk, int cap)
 190 {
 191         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192 }
 193 EXPORT_SYMBOL(sk_net_capable);
 194
 195 /*
 196  * Each address family might have different locking rules, so we have
 197  * one slock key per address family:
 198  */
 199 static struct lock_class_key af_family_keys[AF_MAX];
 200 static struct lock_class_key af_family_slock_keys[AF_MAX];
 201
 202 /*
 203  * Make lock validator output more readable. (we pre-construct these
 204  * strings build-time, so that runtime initialization of socket
 205  * locks is fast):
 206  */
 207 static const char *const af_family_key_strings[AF_MAX+1] = {
 208   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 209   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 210   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 211   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 212   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 213   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 214   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 215   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 216   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 217   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 218   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 219   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 220   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 221   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 222   "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
 223 };
 224 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 225   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 226   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 227   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 228   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 229   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 230   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 231   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 232   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 233   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 234   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 235   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 236   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 237   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 238   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 239   "slock-AF_QIPCRTR", "slock-AF_MAX"
 240 };
 241 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 242   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 243   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 244   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 245   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 246   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 247   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 248   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 249   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 250   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 251   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 252   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 253   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 254   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 255   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 256   "clock-AF_QIPCRTR", "clock-AF_MAX"
 257 };
 258
 259 /*
 260  * sk_callback_lock locking rules are per-address-family,
 261  * so split the lock classes by using a per-AF key:
 262  */
 263 static struct lock_class_key af_callback_keys[AF_MAX];
 264
 265 /* Take into consideration the size of the struct sk_buff overhead in the
 266  * determination of these values, since that is non-constant across
 267  * platforms.  This makes socket queueing behavior and performance
 268  * not depend upon such differences.
 269  */
 270 #define _SK_MEM_PACKETS         256
 271 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 272 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 273 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 274
 275 /* Run time adjustable parameters. */
 276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 277 EXPORT_SYMBOL(sysctl_wmem_max);
 278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 279 EXPORT_SYMBOL(sysctl_rmem_max);
 280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 282
 283 /* Maximal space eaten by iovec or ancillary data plus some space */
 284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 285 EXPORT_SYMBOL(sysctl_optmem_max);
 286
 287 int sysctl_tstamp_allow_data __read_mostly = 1;
 288
 289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 290 EXPORT_SYMBOL_GPL(memalloc_socks);
 291
 292 /**
 293  * sk_set_memalloc - sets %SOCK_MEMALLOC
 294  * @sk: socket to set it on
 295  *
 296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 297  * It's the responsibility of the admin to adjust min_free_kbytes
 298  * to meet the requirements
 299  */
 300 void sk_set_memalloc(struct sock *sk)
 301 {
 302         sock_set_flag(sk, SOCK_MEMALLOC);
 303         sk->sk_allocation |= __GFP_MEMALLOC;
 304         static_key_slow_inc(&memalloc_socks);
 305 }
 306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 307
 308 void sk_clear_memalloc(struct sock *sk)
 309 {
 310         sock_reset_flag(sk, SOCK_MEMALLOC);
 311         sk->sk_allocation &= ~__GFP_MEMALLOC;
 312         static_key_slow_dec(&memalloc_socks);
 313
 314         /*
 315          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 316          * progress of swapping. SOCK_MEMALLOC may be cleared while
 317          * it has rmem allocations due to the last swapfile being deactivated
 318          * but there is a risk that the socket is unusable due to exceeding
 319          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 320          */
 321         sk_mem_reclaim(sk);
 322 }
 323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 324
 325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 326 {
 327         int ret;
 328         unsigned long pflags = current->flags;
 329
 330         /* these should have been dropped before queueing */
 331         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 332
 333         current->flags |= PF_MEMALLOC;
 334         ret = sk->sk_backlog_rcv(sk, skb);
 335         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 336
 337         return ret;
 338 }
 339 EXPORT_SYMBOL(__sk_backlog_rcv);
 340
 341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 342 {
 343         struct timeval tv;
 344
 345         if (optlen < sizeof(tv))
 346                 return -EINVAL;
 347         if (copy_from_user(&tv, optval, sizeof(tv)))
 348                 return -EFAULT;
 349         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 350                 return -EDOM;
 351
 352         if (tv.tv_sec < 0) {
 353                 static int warned __read_mostly;
 354
 355                 *timeo_p = 0;
 356                 if (warned < 10 && net_ratelimit()) {
 357                         warned++;
 358                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 359                                 __func__, current->comm, task_pid_nr(current));
 360                 }
 361                 return 0;
 362         }
 363         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 364         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 365                 return 0;
 366         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 367                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 368         return 0;
 369 }
 370
 371 static void sock_warn_obsolete_bsdism(const char *name)
 372 {
 373         static int warned;
 374         static char warncomm[TASK_COMM_LEN];
 375         if (strcmp(warncomm, current->comm) && warned < 5) {
 376                 strcpy(warncomm,  current->comm);
 377                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 378                         warncomm, name);
 379                 warned++;
 380         }
 381 }
 382
 383 static bool sock_needs_netstamp(const struct sock *sk)
 384 {
 385         switch (sk->sk_family) {
 386         case AF_UNSPEC:
 387         case AF_UNIX:
 388                 return false;
 389         default:
 390                 return true;
 391         }
 392 }
 393
 394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 395 {
 396         if (sk->sk_flags & flags) {
 397                 sk->sk_flags &= ~flags;
 398                 if (sock_needs_netstamp(sk) &&
 399                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 400                         net_disable_timestamp();
 401         }
 402 }
 403
 404
 405 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 406 {
 407         unsigned long flags;
 408         struct sk_buff_head *list = &sk->sk_receive_queue;
 409
 410         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 411                 atomic_inc(&sk->sk_drops);
 412                 trace_sock_rcvqueue_full(sk, skb);
 413                 return -ENOMEM;
 414         }
 415
 416         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 417                 atomic_inc(&sk->sk_drops);
 418                 return -ENOBUFS;
 419         }
 420
 421         skb->dev = NULL;
 422         skb_set_owner_r(skb, sk);
 423
 424         /* we escape from rcu protected region, make sure we dont leak
 425          * a norefcounted dst
 426          */
 427         skb_dst_force(skb);
 428
 429         spin_lock_irqsave(&list->lock, flags);
 430         sock_skb_set_dropcount(sk, skb);
 431         __skb_queue_tail(list, skb);
 432         spin_unlock_irqrestore(&list->lock, flags);
 433
 434         if (!sock_flag(sk, SOCK_DEAD))
 435                 sk->sk_data_ready(sk);
 436         return 0;
 437 }
 438 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 439
 440 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441 {
 442         int err;
 443
 444         err = sk_filter(sk, skb);
 445         if (err)
 446                 return err;
 447
 448         return __sock_queue_rcv_skb(sk, skb);
 449 }
 450 EXPORT_SYMBOL(sock_queue_rcv_skb);
 451
 452 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 453                      const int nested, unsigned int trim_cap, bool refcounted)
 454 {
 455         int rc = NET_RX_SUCCESS;
 456
 457         if (sk_filter_trim_cap(sk, skb, trim_cap))
 458                 goto discard_and_relse;
 459
 460         skb->dev = NULL;
 461
 462         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 463                 atomic_inc(&sk->sk_drops);
 464                 goto discard_and_relse;
 465         }
 466         if (nested)
 467                 bh_lock_sock_nested(sk);
 468         else
 469                 bh_lock_sock(sk);
 470         if (!sock_owned_by_user(sk)) {
 471                 /*
 472                  * trylock + unlock semantics:
 473                  */
 474                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 475
 476                 rc = sk_backlog_rcv(sk, skb);
 477
 478                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 479         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 480                 bh_unlock_sock(sk);
 481                 atomic_inc(&sk->sk_drops);
 482                 goto discard_and_relse;
 483         }
 484
 485         bh_unlock_sock(sk);
 486 out:
 487         if (refcounted)
 488                 sock_put(sk);
 489         return rc;
 490 discard_and_relse:
 491         kfree_skb(skb);
 492         goto out;
 493 }
 494 EXPORT_SYMBOL(__sk_receive_skb);
 495
 496 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 497 {
 498         struct dst_entry *dst = __sk_dst_get(sk);
 499
 500         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 501                 sk_tx_queue_clear(sk);
 502                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 503                 dst_release(dst);
 504                 return NULL;
 505         }
 506
 507         return dst;
 508 }
 509 EXPORT_SYMBOL(__sk_dst_check);
 510
 511 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 512 {
 513         struct dst_entry *dst = sk_dst_get(sk);
 514
 515         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 516                 sk_dst_reset(sk);
 517                 dst_release(dst);
 518                 return NULL;
 519         }
 520
 521         return dst;
 522 }
 523 EXPORT_SYMBOL(sk_dst_check);
 524
 525 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 526                                 int optlen)
 527 {
 528         int ret = -ENOPROTOOPT;
 529 #ifdef CONFIG_NETDEVICES
 530         struct net *net = sock_net(sk);
 531         char devname[IFNAMSIZ];
 532         int index;
 533
 534         /* Sorry... */
 535         ret = -EPERM;
 536         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 537                 goto out;
 538
 539         ret = -EINVAL;
 540         if (optlen < 0)
 541                 goto out;
 542
 543         /* Bind this socket to a particular device like "eth0",
 544          * as specified in the passed interface name. If the
 545          * name is "" or the option length is zero the socket
 546          * is not bound.
 547          */
 548         if (optlen > IFNAMSIZ - 1)
 549                 optlen = IFNAMSIZ - 1;
 550         memset(devname, 0, sizeof(devname));
 551
 552         ret = -EFAULT;
 553         if (copy_from_user(devname, optval, optlen))
 554                 goto out;
 555
 556         index = 0;
 557         if (devname[0] != '\0') {
 558                 struct net_device *dev;
 559
 560                 rcu_read_lock();
 561                 dev = dev_get_by_name_rcu(net, devname);
 562                 if (dev)
 563                         index = dev->ifindex;
 564                 rcu_read_unlock();
 565                 ret = -ENODEV;
 566                 if (!dev)
 567                         goto out;
 568         }
 569
 570         lock_sock(sk);
 571         sk->sk_bound_dev_if = index;
 572         sk_dst_reset(sk);
 573         release_sock(sk);
 574
 575         ret = 0;
 576
 577 out:
 578 #endif
 579
 580         return ret;
 581 }
 582
 583 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 584                                 int __user *optlen, int len)
 585 {
 586         int ret = -ENOPROTOOPT;
 587 #ifdef CONFIG_NETDEVICES
 588         struct net *net = sock_net(sk);
 589         char devname[IFNAMSIZ];
 590
 591         if (sk->sk_bound_dev_if == 0) {
 592                 len = 0;
 593                 goto zero;
 594         }
 595
 596         ret = -EINVAL;
 597         if (len < IFNAMSIZ)
 598                 goto out;
 599
 600         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 601         if (ret)
 602                 goto out;
 603
 604         len = strlen(devname) + 1;
 605
 606         ret = -EFAULT;
 607         if (copy_to_user(optval, devname, len))
 608                 goto out;
 609
 610 zero:
 611         ret = -EFAULT;
 612         if (put_user(len, optlen))
 613                 goto out;
 614
 615         ret = 0;
 616
 617 out:
 618 #endif
 619
 620         return ret;
 621 }
 622
 623 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 624 {
 625         if (valbool)
 626                 sock_set_flag(sk, bit);
 627         else
 628                 sock_reset_flag(sk, bit);
 629 }
 630
 631 bool sk_mc_loop(struct sock *sk)
 632 {
 633         if (dev_recursion_level())
 634                 return false;
 635         if (!sk)
 636                 return true;
 637         switch (sk->sk_family) {
 638         case AF_INET:
 639                 return inet_sk(sk)->mc_loop;
 640 #if IS_ENABLED(CONFIG_IPV6)
 641         case AF_INET6:
 642                 return inet6_sk(sk)->mc_loop;
 643 #endif
 644         }
 645         WARN_ON(1);
 646         return true;
 647 }
 648 EXPORT_SYMBOL(sk_mc_loop);
 649
 650 /*
 651  *      This is meant for all protocols to use and covers goings on
 652  *      at the socket level. Everything here is generic.
 653  */
 654
 655 int sock_setsockopt(struct socket *sock, int level, int optname,
 656                     char __user *optval, unsigned int optlen)
 657 {
 658         struct sock *sk = sock->sk;
 659         int val;
 660         int valbool;
 661         struct linger ling;
 662         int ret = 0;
 663
 664         /*
 665          *      Options without arguments
 666          */
 667
 668         if (optname == SO_BINDTODEVICE)
 669                 return sock_setbindtodevice(sk, optval, optlen);
 670
 671         if (optlen < sizeof(int))
 672                 return -EINVAL;
 673
 674         if (get_user(val, (int __user *)optval))
 675                 return -EFAULT;
 676
 677         valbool = val ? 1 : 0;
 678
 679         lock_sock(sk);
 680
 681         switch (optname) {
 682         case SO_DEBUG:
 683                 if (val && !capable(CAP_NET_ADMIN))
 684                         ret = -EACCES;
 685                 else
 686                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 687                 break;
 688         case SO_REUSEADDR:
 689                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 690                 break;
 691         case SO_REUSEPORT:
 692                 sk->sk_reuseport = valbool;
 693                 break;
 694         case SO_TYPE:
 695         case SO_PROTOCOL:
 696         case SO_DOMAIN:
 697         case SO_ERROR:
 698                 ret = -ENOPROTOOPT;
 699                 break;
 700         case SO_DONTROUTE:
 701                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 702                 break;
 703         case SO_BROADCAST:
 704                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 705                 break;
 706         case SO_SNDBUF:
 707                 /* Don't error on this BSD doesn't and if you think
 708                  * about it this is right. Otherwise apps have to
 709                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 710                  * are treated in BSD as hints
 711                  */
 712                 val = min_t(u32, val, sysctl_wmem_max);
 713 set_sndbuf:
 714                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 715                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 716                 /* Wake up sending tasks if we upped the value. */
 717                 sk->sk_write_space(sk);
 718                 break;
 719
 720         case SO_SNDBUFFORCE:
 721                 if (!capable(CAP_NET_ADMIN)) {
 722                         ret = -EPERM;
 723                         break;
 724                 }
 725                 goto set_sndbuf;
 726
 727         case SO_RCVBUF:
 728                 /* Don't error on this BSD doesn't and if you think
 729                  * about it this is right. Otherwise apps have to
 730                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 731                  * are treated in BSD as hints
 732                  */
 733                 val = min_t(u32, val, sysctl_rmem_max);
 734 set_rcvbuf:
 735                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 736                 /*
 737                  * We double it on the way in to account for
 738                  * "struct sk_buff" etc. overhead.   Applications
 739                  * assume that the SO_RCVBUF setting they make will
 740                  * allow that much actual data to be received on that
 741                  * socket.
 742                  *
 743                  * Applications are unaware that "struct sk_buff" and
 744                  * other overheads allocate from the receive buffer
 745                  * during socket buffer allocation.
 746                  *
 747                  * And after considering the possible alternatives,
 748                  * returning the value we actually used in getsockopt
 749                  * is the most desirable behavior.
 750                  */
 751                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 752                 break;
 753
 754         case SO_RCVBUFFORCE:
 755                 if (!capable(CAP_NET_ADMIN)) {
 756                         ret = -EPERM;
 757                         break;
 758                 }
 759                 goto set_rcvbuf;
 760
 761         case SO_KEEPALIVE:
 762 #ifdef CONFIG_INET
 763                 if (sk->sk_protocol == IPPROTO_TCP &&
 764                     sk->sk_type == SOCK_STREAM)
 765                         tcp_set_keepalive(sk, valbool);
 766 #endif
 767                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 768                 break;
 769
 770         case SO_OOBINLINE:
 771                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 772                 break;
 773
 774         case SO_NO_CHECK:
 775                 sk->sk_no_check_tx = valbool;
 776                 break;
 777
 778         case SO_PRIORITY:
 779                 if ((val >= 0 && val <= 6) ||
 780                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 781                         sk->sk_priority = val;
 782                 else
 783                         ret = -EPERM;
 784                 break;
 785
 786         case SO_LINGER:
 787                 if (optlen < sizeof(ling)) {
 788                         ret = -EINVAL;  /* 1003.1g */
 789                         break;
 790                 }
 791                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 792                         ret = -EFAULT;
 793                         break;
 794                 }
 795                 if (!ling.l_onoff)
 796                         sock_reset_flag(sk, SOCK_LINGER);
 797                 else {
 798 #if (BITS_PER_LONG == 32)
 799                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 800                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 801                         else
 802 #endif
 803                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 804                         sock_set_flag(sk, SOCK_LINGER);
 805                 }
 806                 break;
 807
 808         case SO_BSDCOMPAT:
 809                 sock_warn_obsolete_bsdism("setsockopt");
 810                 break;
 811
 812         case SO_PASSCRED:
 813                 if (valbool)
 814                         set_bit(SOCK_PASSCRED, &sock->flags);
 815                 else
 816                         clear_bit(SOCK_PASSCRED, &sock->flags);
 817                 break;
 818
 819         case SO_TIMESTAMP:
 820         case SO_TIMESTAMPNS:
 821                 if (valbool)  {
 822                         if (optname == SO_TIMESTAMP)
 823                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 824                         else
 825                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 826                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 827                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 828                 } else {
 829                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 830                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 831                 }
 832                 break;
 833
 834         case SO_TIMESTAMPING:
 835                 if (val & ~SOF_TIMESTAMPING_MASK) {
 836                         ret = -EINVAL;
 837                         break;
 838                 }
 839
 840                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 841                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 842                         if (sk->sk_protocol == IPPROTO_TCP &&
 843                             sk->sk_type == SOCK_STREAM) {
 844                                 if ((1 << sk->sk_state) &
 845                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 846                                         ret = -EINVAL;
 847                                         break;
 848                                 }
 849                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 850                         } else {
 851                                 sk->sk_tskey = 0;
 852                         }
 853                 }
 854                 sk->sk_tsflags = val;
 855                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 856                         sock_enable_timestamp(sk,
 857                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 858                 else
 859                         sock_disable_timestamp(sk,
 860                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 861                 break;
 862
 863         case SO_RCVLOWAT:
 864                 if (val < 0)
 865                         val = INT_MAX;
 866                 sk->sk_rcvlowat = val ? : 1;
 867                 break;
 868
 869         case SO_RCVTIMEO:
 870                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 871                 break;
 872
 873         case SO_SNDTIMEO:
 874                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 875                 break;
 876
 877         case SO_ATTACH_FILTER:
 878                 ret = -EINVAL;
 879                 if (optlen == sizeof(struct sock_fprog)) {
 880                         struct sock_fprog fprog;
 881
 882                         ret = -EFAULT;
 883                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 884                                 break;
 885
 886                         ret = sk_attach_filter(&fprog, sk);
 887                 }
 888                 break;
 889
 890         case SO_ATTACH_BPF:
 891                 ret = -EINVAL;
 892                 if (optlen == sizeof(u32)) {
 893                         u32 ufd;
 894
 895                         ret = -EFAULT;
 896                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 897                                 break;
 898
 899                         ret = sk_attach_bpf(ufd, sk);
 900                 }
 901                 break;
 902
 903         case SO_ATTACH_REUSEPORT_CBPF:
 904                 ret = -EINVAL;
 905                 if (optlen == sizeof(struct sock_fprog)) {
 906                         struct sock_fprog fprog;
 907
 908                         ret = -EFAULT;
 909                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 910                                 break;
 911
 912                         ret = sk_reuseport_attach_filter(&fprog, sk);
 913                 }
 914                 break;
 915
 916         case SO_ATTACH_REUSEPORT_EBPF:
 917                 ret = -EINVAL;
 918                 if (optlen == sizeof(u32)) {
 919                         u32 ufd;
 920
 921                         ret = -EFAULT;
 922                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 923                                 break;
 924
 925                         ret = sk_reuseport_attach_bpf(ufd, sk);
 926                 }
 927                 break;
 928
 929         case SO_DETACH_FILTER:
 930                 ret = sk_detach_filter(sk);
 931                 break;
 932
 933         case SO_LOCK_FILTER:
 934                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 935                         ret = -EPERM;
 936                 else
 937                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 938                 break;
 939
 940         case SO_PASSSEC:
 941                 if (valbool)
 942                         set_bit(SOCK_PASSSEC, &sock->flags);
 943                 else
 944                         clear_bit(SOCK_PASSSEC, &sock->flags);
 945                 break;
 946         case SO_MARK:
 947                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 948                         ret = -EPERM;
 949                 else
 950                         sk->sk_mark = val;
 951                 break;
 952
 953         case SO_RXQ_OVFL:
 954                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 955                 break;
 956
 957         case SO_WIFI_STATUS:
 958                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 959                 break;
 960
 961         case SO_PEEK_OFF:
 962                 if (sock->ops->set_peek_off)
 963                         ret = sock->ops->set_peek_off(sk, val);
 964                 else
 965                         ret = -EOPNOTSUPP;
 966                 break;
 967
 968         case SO_NOFCS:
 969                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 970                 break;
 971
 972         case SO_SELECT_ERR_QUEUE:
 973                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 974                 break;
 975
 976 #ifdef CONFIG_NET_RX_BUSY_POLL
 977         case SO_BUSY_POLL:
 978                 /* allow unprivileged users to decrease the value */
 979                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 980                         ret = -EPERM;
 981                 else {
 982                         if (val < 0)
 983                                 ret = -EINVAL;
 984                         else
 985                                 sk->sk_ll_usec = val;
 986                 }
 987                 break;
 988 #endif
 989
 990         case SO_MAX_PACING_RATE:
 991                 sk->sk_max_pacing_rate = val;
 992                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 993                                          sk->sk_max_pacing_rate);
 994                 break;
 995
 996         case SO_INCOMING_CPU:
 997                 sk->sk_incoming_cpu = val;
 998                 break;
 999
1000         case SO_CNX_ADVICE:
1001                 if (val == 1)
1002                         dst_negative_advice(sk);
1003                 break;
1004         default:
1005                 ret = -ENOPROTOOPT;
1006                 break;
1007         }
1008         release_sock(sk);
1009         return ret;
1010 }
1011 EXPORT_SYMBOL(sock_setsockopt);
1012
1013
1014 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1015                           struct ucred *ucred)
1016 {
1017         ucred->pid = pid_vnr(pid);
1018         ucred->uid = ucred->gid = -1;
1019         if (cred) {
1020                 struct user_namespace *current_ns = current_user_ns();
1021
1022                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1023                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1024         }
1025 }
1026
1027 int sock_getsockopt(struct socket *sock, int level, int optname,
1028                     char __user *optval, int __user *optlen)
1029 {
1030         struct sock *sk = sock->sk;
1031
1032         union {
1033                 int val;
1034                 struct linger ling;
1035                 struct timeval tm;
1036         } v;
1037
1038         int lv = sizeof(int);
1039         int len;
1040
1041         if (get_user(len, optlen))
1042                 return -EFAULT;
1043         if (len < 0)
1044                 return -EINVAL;
1045
1046         memset(&v, 0, sizeof(v));
1047
1048         switch (optname) {
1049         case SO_DEBUG:
1050                 v.val = sock_flag(sk, SOCK_DBG);
1051                 break;
1052
1053         case SO_DONTROUTE:
1054                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1055                 break;
1056
1057         case SO_BROADCAST:
1058                 v.val = sock_flag(sk, SOCK_BROADCAST);
1059                 break;
1060
1061         case SO_SNDBUF:
1062                 v.val = sk->sk_sndbuf;
1063                 break;
1064
1065         case SO_RCVBUF:
1066                 v.val = sk->sk_rcvbuf;
1067                 break;
1068
1069         case SO_REUSEADDR:
1070                 v.val = sk->sk_reuse;
1071                 break;
1072
1073         case SO_REUSEPORT:
1074                 v.val = sk->sk_reuseport;
1075                 break;
1076
1077         case SO_KEEPALIVE:
1078                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1079                 break;
1080
1081         case SO_TYPE:
1082                 v.val = sk->sk_type;
1083                 break;
1084
1085         case SO_PROTOCOL:
1086                 v.val = sk->sk_protocol;
1087                 break;
1088
1089         case SO_DOMAIN:
1090                 v.val = sk->sk_family;
1091                 break;
1092
1093         case SO_ERROR:
1094                 v.val = -sock_error(sk);
1095                 if (v.val == 0)
1096                         v.val = xchg(&sk->sk_err_soft, 0);
1097                 break;
1098
1099         case SO_OOBINLINE:
1100                 v.val = sock_flag(sk, SOCK_URGINLINE);
1101                 break;
1102
1103         case SO_NO_CHECK:
1104                 v.val = sk->sk_no_check_tx;
1105                 break;
1106
1107         case SO_PRIORITY:
1108                 v.val = sk->sk_priority;
1109                 break;
1110
1111         case SO_LINGER:
1112                 lv              = sizeof(v.ling);
1113                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1114                 v.ling.l_linger = sk->sk_lingertime / HZ;
1115                 break;
1116
1117         case SO_BSDCOMPAT:
1118                 sock_warn_obsolete_bsdism("getsockopt");
1119                 break;
1120
1121         case SO_TIMESTAMP:
1122                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1123                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1124                 break;
1125
1126         case SO_TIMESTAMPNS:
1127                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1128                 break;
1129
1130         case SO_TIMESTAMPING:
1131                 v.val = sk->sk_tsflags;
1132                 break;
1133
1134         case SO_RCVTIMEO:
1135                 lv = sizeof(struct timeval);
1136                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1137                         v.tm.tv_sec = 0;
1138                         v.tm.tv_usec = 0;
1139                 } else {
1140                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1141                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1142                 }
1143                 break;
1144
1145         case SO_SNDTIMEO:
1146                 lv = sizeof(struct timeval);
1147                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1148                         v.tm.tv_sec = 0;
1149                         v.tm.tv_usec = 0;
1150                 } else {
1151                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1152                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1153                 }
1154                 break;
1155
1156         case SO_RCVLOWAT:
1157                 v.val = sk->sk_rcvlowat;
1158                 break;
1159
1160         case SO_SNDLOWAT:
1161                 v.val = 1;
1162                 break;
1163
1164         case SO_PASSCRED:
1165                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1166                 break;
1167
1168         case SO_PEERCRED:
1169         {
1170                 struct ucred peercred;
1171                 if (len > sizeof(peercred))
1172                         len = sizeof(peercred);
1173                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1174                 if (copy_to_user(optval, &peercred, len))
1175                         return -EFAULT;
1176                 goto lenout;
1177         }
1178
1179         case SO_PEERNAME:
1180         {
1181                 char address[128];
1182
1183                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1184                         return -ENOTCONN;
1185                 if (lv < len)
1186                         return -EINVAL;
1187                 if (copy_to_user(optval, address, len))
1188                         return -EFAULT;
1189                 goto lenout;
1190         }
1191
1192         /* Dubious BSD thing... Probably nobody even uses it, but
1193          * the UNIX standard wants it for whatever reason... -DaveM
1194          */
1195         case SO_ACCEPTCONN:
1196                 v.val = sk->sk_state == TCP_LISTEN;
1197                 break;
1198
1199         case SO_PASSSEC:
1200                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1201                 break;
1202
1203         case SO_PEERSEC:
1204                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1205
1206         case SO_MARK:
1207                 v.val = sk->sk_mark;
1208                 break;
1209
1210         case SO_RXQ_OVFL:
1211                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1212                 break;
1213
1214         case SO_WIFI_STATUS:
1215                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1216                 break;
1217
1218         case SO_PEEK_OFF:
1219                 if (!sock->ops->set_peek_off)
1220                         return -EOPNOTSUPP;
1221
1222                 v.val = sk->sk_peek_off;
1223                 break;
1224         case SO_NOFCS:
1225                 v.val = sock_flag(sk, SOCK_NOFCS);
1226                 break;
1227
1228         case SO_BINDTODEVICE:
1229                 return sock_getbindtodevice(sk, optval, optlen, len);
1230
1231         case SO_GET_FILTER:
1232                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1233                 if (len < 0)
1234                         return len;
1235
1236                 goto lenout;
1237
1238         case SO_LOCK_FILTER:
1239                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1240                 break;
1241
1242         case SO_BPF_EXTENSIONS:
1243                 v.val = bpf_tell_extensions();
1244                 break;
1245
1246         case SO_SELECT_ERR_QUEUE:
1247                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1248                 break;
1249
1250 #ifdef CONFIG_NET_RX_BUSY_POLL
1251         case SO_BUSY_POLL:
1252                 v.val = sk->sk_ll_usec;
1253                 break;
1254 #endif
1255
1256         case SO_MAX_PACING_RATE:
1257                 v.val = sk->sk_max_pacing_rate;
1258                 break;
1259
1260         case SO_INCOMING_CPU:
1261                 v.val = sk->sk_incoming_cpu;
1262                 break;
1263
1264         default:
1265                 /* We implement the SO_SNDLOWAT etc to not be settable
1266                  * (1003.1g 7).
1267                  */
1268                 return -ENOPROTOOPT;
1269         }
1270
1271         if (len > lv)
1272                 len = lv;
1273         if (copy_to_user(optval, &v, len))
1274                 return -EFAULT;
1275 lenout:
1276         if (put_user(len, optlen))
1277                 return -EFAULT;
1278         return 0;
1279 }
1280
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 *
 * The name strings and key objects handed to
 * sock_lock_init_class_and_name() are selected per address family: each
 * table is indexed by sk->sk_family, so sk_family must already be set
 * when this is called.
 */
static inline void sock_lock_init(struct sock *sk)
{
        sock_lock_init_class_and_name(sk,
                        af_family_slock_key_strings[sk->sk_family],
                        af_family_slock_keys + sk->sk_family,
                        af_family_key_strings[sk->sk_family],
                        af_family_keys + sk->sk_family);
}
1294
1295 /*
1296  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1297  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1298  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1299  */
1300 static void sock_copy(struct sock *nsk, const struct sock *osk)
1301 {
1302 #ifdef CONFIG_SECURITY_NETWORK
1303         void *sptr = nsk->sk_security;
1304 #endif
1305         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1306
1307         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1308                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1309
1310 #ifdef CONFIG_SECURITY_NETWORK
1311         nsk->sk_security = sptr;
1312         security_sk_clone(osk, nsk);
1313 #endif
1314 }
1315
1316 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1317                 int family)
1318 {
1319         struct sock *sk;
1320         struct kmem_cache *slab;
1321
1322         slab = prot->slab;
1323         if (slab != NULL) {
1324                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1325                 if (!sk)
1326                         return sk;
1327                 if (priority & __GFP_ZERO)
1328                         sk_prot_clear_nulls(sk, prot->obj_size);
1329         } else
1330                 sk = kmalloc(prot->obj_size, priority);
1331
1332         if (sk != NULL) {
1333                 kmemcheck_annotate_bitfield(sk, flags);
1334
1335                 if (security_sk_alloc(sk, family, priority))
1336                         goto out_free;
1337
1338                 if (!try_module_get(prot->owner))
1339                         goto out_free_sec;
1340                 sk_tx_queue_clear(sk);
1341         }
1342
1343         return sk;
1344
1345 out_free_sec:
1346         security_sk_free(sk);
1347 out_free:
1348         if (slab != NULL)
1349                 kmem_cache_free(slab, sk);
1350         else
1351                 kfree(sk);
1352         return NULL;
1353 }
1354
1355 static void sk_prot_free(struct proto *prot, struct sock *sk)
1356 {
1357         struct kmem_cache *slab;
1358         struct module *owner;
1359
1360         owner = prot->owner;
1361         slab = prot->slab;
1362
1363         cgroup_sk_free(&sk->sk_cgrp_data);
1364         mem_cgroup_sk_free(sk);
1365         security_sk_free(sk);
1366         if (slab != NULL)
1367                 kmem_cache_free(slab, sk);
1368         else
1369                 kfree(sk);
1370         module_put(owner);
1371 }
1372
1373 /**
1374  *      sk_alloc - All socket objects are allocated here
1375  *      @net: the applicable net namespace
1376  *      @family: protocol family
1377  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1378  *      @prot: struct proto associated with this new sock instance
1379  *      @kern: is this to be a kernel socket?
1380  */
1381 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1382                       struct proto *prot, int kern)
1383 {
1384         struct sock *sk;
1385
1386         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1387         if (sk) {
1388                 sk->sk_family = family;
1389                 /*
1390                  * See comment in struct sock definition to understand
1391                  * why we need sk_prot_creator -acme
1392                  */
1393                 sk->sk_prot = sk->sk_prot_creator = prot;
1394                 sock_lock_init(sk);
1395                 sk->sk_net_refcnt = kern ? 0 : 1;
1396                 if (likely(sk->sk_net_refcnt))
1397                         get_net(net);
1398                 sock_net_set(sk, net);
1399                 atomic_set(&sk->sk_wmem_alloc, 1);
1400
1401                 mem_cgroup_sk_alloc(sk);
1402                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1403                 sock_update_classid(&sk->sk_cgrp_data);
1404                 sock_update_netprioidx(&sk->sk_cgrp_data);
1405         }
1406
1407         return sk;
1408 }
1409 EXPORT_SYMBOL(sk_alloc);
1410
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 *
 * Final teardown of a socket.  @head is the sk_rcu member embedded in the
 * struct sock, so the function can be used directly as an RCU callback
 * or called synchronously (see sk_destruct()).  The object is released
 * through sk_prot_free() at the end and must not be touched afterwards.
 */
static void __sk_destruct(struct rcu_head *head)
{
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct sk_filter *filter;

        /* Protocol/family specific destructor, if one was installed. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /*
         * Detach and uncharge any attached filter.  The check condition
         * passed to rcu_dereference_check() is that sk_wmem_alloc has
         * reached zero.
         */
        filter = rcu_dereference_check(sk->sk_filter,
                                       atomic_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }
        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_detach_sock(sk);

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

        /* Option memory still accounted at this point is only reported. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the page held in sk_frag, if any. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* Release peer credentials/pid and the namespace reference. */
        if (sk->sk_peer_cred)
                put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);
        if (likely(sk->sk_net_refcnt))
                put_net(sock_net(sk));
        /* Freed via the creating protocol, not the current sk_prot. */
        sk_prot_free(sk->sk_prot_creator, sk);
}
1449
1450 void sk_destruct(struct sock *sk)
1451 {
1452         if (sock_flag(sk, SOCK_RCU_FREE))
1453                 call_rcu(&sk->sk_rcu, __sk_destruct);
1454         else
1455                 __sk_destruct(&sk->sk_rcu);
1456 }
1457
1458 static void __sk_free(struct sock *sk)
1459 {
1460         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1461                 sock_diag_broadcast_destroy(sk);
1462         else
1463                 sk_destruct(sk);
1464 }
1465
1466 void sk_free(struct sock *sk)
1467 {
1468         /*
1469          * We subtract one from sk_wmem_alloc and can know if
1470          * some packets are still in some tx queue.
1471          * If not null, sock_wfree() will call __sk_free(sk) later
1472          */
1473         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1474                 __sk_free(sk);
1475 }
1476 EXPORT_SYMBOL(sk_free);
1477
/**
 *      sk_clone_lock - clone a socket, and lock its clone
 *      @sk: the socket to clone
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *      On success the clone is returned still locked by bh_lock_sock().
 *      NULL is returned when allocation fails, or when charging the
 *      inherited filter or cloning the xfrm policy fails; in the latter
 *      two cases the clone is unlocked and freed here before returning.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;
        bool is_charged = true;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                /* Raw copy of the parent; fields are re-initialised below. */
                sock_copy(newsk, sk);

                newsk->sk_prot_creator = sk->sk_prot;

                /* SANITY */
                if (likely(newsk->sk_net_refcnt))
                        get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                /* The clone starts with an empty backlog. */
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_backlog.len = 0;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                atomic_set(&newsk->sk_drops, 0);
                newsk->sk_send_head     = NULL;
                /* Inherit the parent's user locks except the bind-port one. */
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                cgroup_sk_alloc(&newsk->sk_cgrp_data);
                skb_queue_head_init(&newsk->sk_error_queue);

                /* The filter pointer was inherited through sock_copy(). */
                filter = rcu_dereference_protected(newsk->sk_filter, 1);
                if (filter != NULL)
                        /* though it's an empty new sock, the charging may fail
                         * if sysctl_optmem_max was changed between creation of
                         * original socket and cloning
                         */
                        is_charged = sk_filter_charge(newsk, filter);

                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                        /* We need to make sure that we don't uncharge the new
                         * socket if we couldn't charge it in the first place
                         * as otherwise we uncharge the parent's filter.
                         */
                        if (!is_charged)
                                RCU_INIT_POINTER(newsk->sk_filter, NULL);
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        bh_unlock_sock(newsk);
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }
                /* The clone does not inherit the parent's reuseport group. */
                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

                newsk->sk_err      = 0;
                newsk->sk_err_soft = 0;
                newsk->sk_priority = 0;
                newsk->sk_incoming_cpu = raw_smp_processor_id();
                atomic64_set(&newsk->sk_cookie, 0);

                mem_cgroup_sk_alloc(newsk);
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                /* The clone is not attached to any struct socket yet. */
                sk_set_socket(newsk, NULL);
                newsk->sk_wq = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);

                /* Timestamp flags were copied from the parent; account them. */
                if (sock_needs_netstamp(sk) &&
                    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
1597
1598 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1599 {
1600         u32 max_segs = 1;
1601
1602         sk_dst_set(sk, dst);
1603         sk->sk_route_caps = dst->dev->features;
1604         if (sk->sk_route_caps & NETIF_F_GSO)
1605                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1606         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1607         if (sk_can_gso(sk)) {
1608                 if (dst->header_len) {
1609                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1610                 } else {
1611                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1612                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1613                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1614                 }
1615         }
1616         sk->sk_gso_max_segs = max_segs;
1617 }
1618 EXPORT_SYMBOL_GPL(sk_setup_caps);
1619
1620 /*
1621  *      Simple resource managers for sockets.
1622  */
1623
1624
1625 /*
1626  * Write buffer destructor automatically called from kfree_skb.
1627  */
1628 void sock_wfree(struct sk_buff *skb)
1629 {
1630         struct sock *sk = skb->sk;
1631         unsigned int len = skb->truesize;
1632
1633         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1634                 /*
1635                  * Keep a reference on sk_wmem_alloc, this will be released
1636                  * after sk_write_space() call
1637                  */
1638                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1639                 sk->sk_write_space(sk);
1640                 len = 1;
1641         }
1642         /*
1643          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1644          * could not do because of in-flight packets
1645          */
1646         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1647                 __sk_free(sk);
1648 }
1649 EXPORT_SYMBOL(sock_wfree);
1650
1651 /* This variant of sock_wfree() is used by TCP,
1652  * since it sets SOCK_USE_WRITE_QUEUE.
1653  */
1654 void __sock_wfree(struct sk_buff *skb)
1655 {
1656         struct sock *sk = skb->sk;
1657
1658         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1659                 __sk_free(sk);
1660 }
1661
1662 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1663 {
1664         skb_orphan(skb);
1665         skb->sk = sk;
1666 #ifdef CONFIG_INET
1667         if (unlikely(!sk_fullsock(sk))) {
1668                 skb->destructor = sock_edemux;
1669                 sock_hold(sk);
1670                 return;
1671         }
1672 #endif
1673         skb->destructor = sock_wfree;
1674         skb_set_hash_from_sk(skb, sk);
1675         /*
1676          * We used to take a refcount on sk, but following operation
1677          * is enough to guarantee sk_free() wont free this sock until
1678          * all in-flight packets are completed
1679          */
1680         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1681 }
1682 EXPORT_SYMBOL(skb_set_owner_w);
1683
1684 /* This helper is used by netem, as it can hold packets in its
1685  * delay queue. We want to allow the owner socket to send more
1686  * packets, as if they were already TX completed by a typical driver.
1687  * But we also want to keep skb->sk set because some packet schedulers
1688  * rely on it (sch_fq for example).
1689  */
1690 void skb_orphan_partial(struct sk_buff *skb)
1691 {
1692         if (skb_is_tcp_pure_ack(skb))
1693                 return;
1694
1695         if (skb->destructor == sock_wfree
1696 #ifdef CONFIG_INET
1697             || skb->destructor == tcp_wfree
1698 #endif
1699                 ) {
1700                 struct sock *sk = skb->sk;
1701
1702                 if (atomic_inc_not_zero(&sk->sk_refcnt)) {
1703                         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1704                         skb->destructor = sock_efree;
1705                 }
1706         } else {
1707                 skb_orphan(skb);
1708         }
1709 }
1710 EXPORT_SYMBOL(skb_orphan_partial);
1711
1712 /*
1713  * Read buffer destructor automatically called from kfree_skb.
1714  */
1715 void sock_rfree(struct sk_buff *skb)
1716 {
1717         struct sock *sk = skb->sk;
1718         unsigned int len = skb->truesize;
1719
1720         atomic_sub(len, &sk->sk_rmem_alloc);
1721         sk_mem_uncharge(sk, len);
1722 }
1723 EXPORT_SYMBOL(sock_rfree);
1724
1725 /*
1726  * Buffer destructor for skbs that are not used directly in read or write
1727  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1728  */
1729 void sock_efree(struct sk_buff *skb)
1730 {
1731         sock_put(skb->sk);
1732 }
1733 EXPORT_SYMBOL(sock_efree);
1734
1735 kuid_t sock_i_uid(struct sock *sk)
1736 {
1737         kuid_t uid;
1738
1739         read_lock_bh(&sk->sk_callback_lock);
1740         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1741         read_unlock_bh(&sk->sk_callback_lock);
1742         return uid;
1743 }
1744 EXPORT_SYMBOL(sock_i_uid);
1745
1746 unsigned long sock_i_ino(struct sock *sk)
1747 {
1748         unsigned long ino;
1749
1750         read_lock_bh(&sk->sk_callback_lock);
1751         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1752         read_unlock_bh(&sk->sk_callback_lock);
1753         return ino;
1754 }
1755 EXPORT_SYMBOL(sock_i_ino);
1756
1757 /*
1758  * Allocate a skb from the socket's send buffer.
1759  */
1760 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1761                              gfp_t priority)
1762 {
1763         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1764                 struct sk_buff *skb = alloc_skb(size, priority);
1765                 if (skb) {
1766                         skb_set_owner_w(skb, sk);
1767                         return skb;
1768                 }
1769         }
1770         return NULL;
1771 }
1772 EXPORT_SYMBOL(sock_wmalloc);
1773
1774 /*
1775  * Allocate a memory block from the socket's option memory buffer.
1776  */
1777 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1778 {
1779         if ((unsigned int)size <= sysctl_optmem_max &&
1780             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1781                 void *mem;
1782                 /* First do the add, to avoid the race if kmalloc
1783                  * might sleep.
1784                  */
1785                 atomic_add(size, &sk->sk_omem_alloc);
1786                 mem = kmalloc(size, priority);
1787                 if (mem)
1788                         return mem;
1789                 atomic_sub(size, &sk->sk_omem_alloc);
1790         }
1791         return NULL;
1792 }
1793 EXPORT_SYMBOL(sock_kmalloc);
1794
1795 /* Free an option memory block. Note, we actually want the inline
1796  * here as this allows gcc to detect the nullify and fold away the
1797  * condition entirely.
1798  */
1799 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1800                                   const bool nullify)
1801 {
1802         if (WARN_ON_ONCE(!mem))
1803                 return;
1804         if (nullify)
1805                 kzfree(mem);
1806         else
1807                 kfree(mem);
1808         atomic_sub(size, &sk->sk_omem_alloc);
1809 }
1810
1811 void sock_kfree_s(struct sock *sk, void *mem, int size)
1812 {
1813         __sock_kfree_s(sk, mem, size, false);
1814 }
1815 EXPORT_SYMBOL(sock_kfree_s);
1816
1817 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1818 {
1819         __sock_kfree_s(sk, mem, size, true);
1820 }
1821 EXPORT_SYMBOL(sock_kzfree_s);
1822
1823 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1824    I think, these locks should be removed for datagram sockets.
1825  */
1826 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1827 {
1828         DEFINE_WAIT(wait);
1829
1830         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1831         for (;;) {
1832                 if (!timeo)
1833                         break;
1834                 if (signal_pending(current))
1835                         break;
1836                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1837                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1838                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1839                         break;
1840                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1841                         break;
1842                 if (sk->sk_err)
1843                         break;
1844                 timeo = schedule_timeout(timeo);
1845         }
1846         finish_wait(sk_sleep(sk), &wait);
1847         return timeo;
1848 }
1849
1850
1851 /*
1852  *      Generic send/receive buffer handlers
1853  */
1854
1855 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1856                                      unsigned long data_len, int noblock,
1857                                      int *errcode, int max_page_order)
1858 {
1859         struct sk_buff *skb;
1860         long timeo;
1861         int err;
1862
1863         timeo = sock_sndtimeo(sk, noblock);
1864         for (;;) {
1865                 err = sock_error(sk);
1866                 if (err != 0)
1867                         goto failure;
1868
1869                 err = -EPIPE;
1870                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1871                         goto failure;
1872
1873                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1874                         break;
1875
1876                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1877                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1878                 err = -EAGAIN;
1879                 if (!timeo)
1880                         goto failure;
1881                 if (signal_pending(current))
1882                         goto interrupted;
1883                 timeo = sock_wait_for_wmem(sk, timeo);
1884         }
1885         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1886                                    errcode, sk->sk_allocation);
1887         if (skb)
1888                 skb_set_owner_w(skb, sk);
1889         return skb;
1890
1891 interrupted:
1892         err = sock_intr_errno(timeo);
1893 failure:
1894         *errcode = err;
1895         return NULL;
1896 }
1897 EXPORT_SYMBOL(sock_alloc_send_pskb);
1898
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a purely linear
 * skb: no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1905
1906 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1907                      struct sockcm_cookie *sockc)
1908 {
1909         u32 tsflags;
1910
1911         switch (cmsg->cmsg_type) {
1912         case SO_MARK:
1913                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1914                         return -EPERM;
1915                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1916                         return -EINVAL;
1917                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1918                 break;
1919         case SO_TIMESTAMPING:
1920                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1921                         return -EINVAL;
1922
1923                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1924                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1925                         return -EINVAL;
1926
1927                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1928                 sockc->tsflags |= tsflags;
1929                 break;
1930         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1931         case SCM_RIGHTS:
1932         case SCM_CREDENTIALS:
1933                 break;
1934         default:
1935                 return -EINVAL;
1936         }
1937         return 0;
1938 }
1939 EXPORT_SYMBOL(__sock_cmsg_send);
1940
1941 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1942                    struct sockcm_cookie *sockc)
1943 {
1944         struct cmsghdr *cmsg;
1945         int ret;
1946
1947         for_each_cmsghdr(cmsg, msg) {
1948                 if (!CMSG_OK(msg, cmsg))
1949                         return -EINVAL;
1950                 if (cmsg->cmsg_level != SOL_SOCKET)
1951                         continue;
1952                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1953                 if (ret)
1954                         return ret;
1955         }
1956         return 0;
1957 }
1958 EXPORT_SYMBOL(sock_cmsg_send);
1959
1960 /* On 32bit arches, an skb frag is limited to 2^15 */
1961 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1962
1963 /**
1964  * skb_page_frag_refill - check that a page_frag contains enough room
1965  * @sz: minimum size of the fragment we want to get
1966  * @pfrag: pointer to page_frag
1967  * @gfp: priority for memory allocation
1968  *
1969  * Note: While this allocator tries to use high order pages, there is
1970  * no guarantee that allocations succeed. Therefore, @sz MUST be
1971  * less or equal than PAGE_SIZE.
1972  */
1973 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1974 {
1975         if (pfrag->page) {
1976                 if (page_ref_count(pfrag->page) == 1) {
1977                         pfrag->offset = 0;
1978                         return true;
1979                 }
1980                 if (pfrag->offset + sz <= pfrag->size)
1981                         return true;
1982                 put_page(pfrag->page);
1983         }
1984
1985         pfrag->offset = 0;
1986         if (SKB_FRAG_PAGE_ORDER) {
1987                 /* Avoid direct reclaim but allow kswapd to wake */
1988                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1989                                           __GFP_COMP | __GFP_NOWARN |
1990                                           __GFP_NORETRY,
1991                                           SKB_FRAG_PAGE_ORDER);
1992                 if (likely(pfrag->page)) {
1993                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1994                         return true;
1995                 }
1996         }
1997         pfrag->page = alloc_page(gfp);
1998         if (likely(pfrag->page)) {
1999                 pfrag->size = PAGE_SIZE;
2000                 return true;
2001         }
2002         return false;
2003 }
2004 EXPORT_SYMBOL(skb_page_frag_refill);
2005
2006 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2007 {
2008         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2009                 return true;
2010
2011         sk_enter_memory_pressure(sk);
2012         sk_stream_moderate_sndbuf(sk);
2013         return false;
2014 }
2015 EXPORT_SYMBOL(sk_page_frag_refill);
2016
2017 static void __lock_sock(struct sock *sk)
2018         __releases(&sk->sk_lock.slock)
2019         __acquires(&sk->sk_lock.slock)
2020 {
2021         DEFINE_WAIT(wait);
2022
2023         for (;;) {
2024                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2025                                         TASK_UNINTERRUPTIBLE);
2026                 spin_unlock_bh(&sk->sk_lock.slock);
2027                 schedule();
2028                 spin_lock_bh(&sk->sk_lock.slock);
2029                 if (!sock_owned_by_user(sk))
2030                         break;
2031         }
2032         finish_wait(&sk->sk_lock.wq, &wait);
2033 }
2034
2035 static void __release_sock(struct sock *sk)
2036         __releases(&sk->sk_lock.slock)
2037         __acquires(&sk->sk_lock.slock)
2038 {
2039         struct sk_buff *skb, *next;
2040
2041         while ((skb = sk->sk_backlog.head) != NULL) {
2042                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2043
2044                 spin_unlock_bh(&sk->sk_lock.slock);
2045
2046                 do {
2047                         next = skb->next;
2048                         prefetch(next);
2049                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2050                         skb->next = NULL;
2051                         sk_backlog_rcv(sk, skb);
2052
2053                         cond_resched();
2054
2055                         skb = next;
2056                 } while (skb != NULL);
2057
2058                 spin_lock_bh(&sk->sk_lock.slock);
2059         }
2060
2061         /*
2062          * Doing the zeroing here guarantee we can not loop forever
2063          * while a wild producer attempts to flood us.
2064          */
2065         sk->sk_backlog.len = 0;
2066 }
2067
2068 void __sk_flush_backlog(struct sock *sk)
2069 {
2070         spin_lock_bh(&sk->sk_lock.slock);
2071         __release_sock(sk);
2072         spin_unlock_bh(&sk->sk_lock.slock);
2073 }
2074
2075 /**
2076  * sk_wait_data - wait for data to arrive at sk_receive_queue
2077  * @sk:    sock to wait on
2078  * @timeo: for how long
2079  * @skb:   last skb seen on sk_receive_queue
2080  *
2081  * Now socket state including sk->sk_err is changed only under lock,
2082  * hence we may omit checks after joining wait queue.
2083  * We check receive queue before schedule() only as optimization;
2084  * it is very likely that release_sock() added new data.
2085  */
2086 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2087 {
2088         int rc;
2089         DEFINE_WAIT(wait);
2090
2091         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2092         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2093         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2094         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2095         finish_wait(sk_sleep(sk), &wait);
2096         return rc;
2097 }
2098 EXPORT_SYMBOL(sk_wait_data);
2099
2100 /**
2101  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2102  *      @sk: socket
2103  *      @size: memory size to allocate
2104  *      @kind: allocation type
2105  *
2106  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2107  *      rmem allocation. This function assumes that protocols which have
2108  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2109  */
2110 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2111 {
2112         struct proto *prot = sk->sk_prot;
2113         int amt = sk_mem_pages(size);
2114         long allocated;
2115
2116         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2117
2118         allocated = sk_memory_allocated_add(sk, amt);
2119
2120         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2121             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2122                 goto suppress_allocation;
2123
2124         /* Under limit. */
2125         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2126                 sk_leave_memory_pressure(sk);
2127                 return 1;
2128         }
2129
2130         /* Under pressure. */
2131         if (allocated > sk_prot_mem_limits(sk, 1))
2132                 sk_enter_memory_pressure(sk);
2133
2134         /* Over hard limit. */
2135         if (allocated > sk_prot_mem_limits(sk, 2))
2136                 goto suppress_allocation;
2137
2138         /* guarantee minimum buffer size under pressure */
2139         if (kind == SK_MEM_RECV) {
2140                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2141                         return 1;
2142
2143         } else { /* SK_MEM_SEND */
2144                 if (sk->sk_type == SOCK_STREAM) {
2145                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2146                                 return 1;
2147                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2148                            prot->sysctl_wmem[0])
2149                                 return 1;
2150         }
2151
2152         if (sk_has_memory_pressure(sk)) {
2153                 int alloc;
2154
2155                 if (!sk_under_memory_pressure(sk))
2156                         return 1;
2157                 alloc = sk_sockets_allocated_read_positive(sk);
2158                 if (sk_prot_mem_limits(sk, 2) > alloc *
2159                     sk_mem_pages(sk->sk_wmem_queued +
2160                                  atomic_read(&sk->sk_rmem_alloc) +
2161                                  sk->sk_forward_alloc))
2162                         return 1;
2163         }
2164
2165 suppress_allocation:
2166
2167         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2168                 sk_stream_moderate_sndbuf(sk);
2169
2170                 /* Fail only if socket is _under_ its sndbuf.
2171                  * In this case we cannot block, so that we have to fail.
2172                  */
2173                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2174                         return 1;
2175         }
2176
2177         trace_sock_exceed_buf_limit(sk, prot, allocated);
2178
2179         /* Alas. Undo changes. */
2180         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2181
2182         sk_memory_allocated_sub(sk, amt);
2183
2184         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2185                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2186
2187         return 0;
2188 }
2189 EXPORT_SYMBOL(__sk_mem_schedule);
2190
2191 /**
2192  *      __sk_mem_reclaim - reclaim memory_allocated
2193  *      @sk: socket
2194  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2195  */
2196 void __sk_mem_reclaim(struct sock *sk, int amount)
2197 {
2198         amount >>= SK_MEM_QUANTUM_SHIFT;
2199         sk_memory_allocated_sub(sk, amount);
2200         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2201
2202         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2203                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2204
2205         if (sk_under_memory_pressure(sk) &&
2206             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2207                 sk_leave_memory_pressure(sk);
2208 }
2209 EXPORT_SYMBOL(__sk_mem_reclaim);
2210
2211 int sk_set_peek_off(struct sock *sk, int val)
2212 {
2213         if (val < 0)
2214                 return -EINVAL;
2215
2216         sk->sk_peek_off = val;
2217         return 0;
2218 }
2219 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2220
2221 /*
2222  * Set of default routines for initialising struct proto_ops when
2223  * the protocol does not support a particular function. In certain
2224  * cases where it makes no sense for a protocol to have a "do nothing"
2225  * function, some default processing is provided.
2226  */
2227
2228 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2229 {
2230         return -EOPNOTSUPP;
2231 }
2232 EXPORT_SYMBOL(sock_no_bind);
2233
2234 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2235                     int len, int flags)
2236 {
2237         return -EOPNOTSUPP;
2238 }
2239 EXPORT_SYMBOL(sock_no_connect);
2240
2241 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2242 {
2243         return -EOPNOTSUPP;
2244 }
2245 EXPORT_SYMBOL(sock_no_socketpair);
2246
2247 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2248 {
2249         return -EOPNOTSUPP;
2250 }
2251 EXPORT_SYMBOL(sock_no_accept);
2252
2253 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2254                     int *len, int peer)
2255 {
2256         return -EOPNOTSUPP;
2257 }
2258 EXPORT_SYMBOL(sock_no_getname);
2259
2260 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2261 {
2262         return 0;
2263 }
2264 EXPORT_SYMBOL(sock_no_poll);
2265
2266 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2267 {
2268         return -EOPNOTSUPP;
2269 }
2270 EXPORT_SYMBOL(sock_no_ioctl);
2271
2272 int sock_no_listen(struct socket *sock, int backlog)
2273 {
2274         return -EOPNOTSUPP;
2275 }
2276 EXPORT_SYMBOL(sock_no_listen);
2277
2278 int sock_no_shutdown(struct socket *sock, int how)
2279 {
2280         return -EOPNOTSUPP;
2281 }
2282 EXPORT_SYMBOL(sock_no_shutdown);
2283
2284 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2285                     char __user *optval, unsigned int optlen)
2286 {
2287         return -EOPNOTSUPP;
2288 }
2289 EXPORT_SYMBOL(sock_no_setsockopt);
2290
2291 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2292                     char __user *optval, int __user *optlen)
2293 {
2294         return -EOPNOTSUPP;
2295 }
2296 EXPORT_SYMBOL(sock_no_getsockopt);
2297
2298 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2299 {
2300         return -EOPNOTSUPP;
2301 }
2302 EXPORT_SYMBOL(sock_no_sendmsg);
2303
2304 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2305                     int flags)
2306 {
2307         return -EOPNOTSUPP;
2308 }
2309 EXPORT_SYMBOL(sock_no_recvmsg);
2310
2311 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2312 {
2313         /* Mirror missing mmap method error code */
2314         return -ENODEV;
2315 }
2316 EXPORT_SYMBOL(sock_no_mmap);
2317
2318 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2319 {
2320         ssize_t res;
2321         struct msghdr msg = {.msg_flags = flags};
2322         struct kvec iov;
2323         char *kaddr = kmap(page);
2324         iov.iov_base = kaddr + offset;
2325         iov.iov_len = size;
2326         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2327         kunmap(page);
2328         return res;
2329 }
2330 EXPORT_SYMBOL(sock_no_sendpage);
2331
2332 /*
2333  *      Default Socket Callbacks
2334  */
2335
2336 static void sock_def_wakeup(struct sock *sk)
2337 {
2338         struct socket_wq *wq;
2339
2340         rcu_read_lock();
2341         wq = rcu_dereference(sk->sk_wq);
2342         if (skwq_has_sleeper(wq))
2343                 wake_up_interruptible_all(&wq->wait);
2344         rcu_read_unlock();
2345 }
2346
2347 static void sock_def_error_report(struct sock *sk)
2348 {
2349         struct socket_wq *wq;
2350
2351         rcu_read_lock();
2352         wq = rcu_dereference(sk->sk_wq);
2353         if (skwq_has_sleeper(wq))
2354                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2355         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2356         rcu_read_unlock();
2357 }
2358
2359 static void sock_def_readable(struct sock *sk)
2360 {
2361         struct socket_wq *wq;
2362
2363         rcu_read_lock();
2364         wq = rcu_dereference(sk->sk_wq);
2365         if (skwq_has_sleeper(wq))
2366                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2367                                                 POLLRDNORM | POLLRDBAND);
2368         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2369         rcu_read_unlock();
2370 }
2371
2372 static void sock_def_write_space(struct sock *sk)
2373 {
2374         struct socket_wq *wq;
2375
2376         rcu_read_lock();
2377
2378         /* Do not wake up a writer until he can make "significant"
2379          * progress.  --DaveM
2380          */
2381         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2382                 wq = rcu_dereference(sk->sk_wq);
2383                 if (skwq_has_sleeper(wq))
2384                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2385                                                 POLLWRNORM | POLLWRBAND);
2386
2387                 /* Should agree with poll, otherwise some programs break */
2388                 if (sock_writeable(sk))
2389                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2390         }
2391
2392         rcu_read_unlock();
2393 }
2394
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2398
2399 void sk_send_sigurg(struct sock *sk)
2400 {
2401         if (sk->sk_socket && sk->sk_socket->file)
2402                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2403                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2404 }
2405 EXPORT_SYMBOL(sk_send_sigurg);
2406
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2414
/*
 * Cancel @timer. A socket reference is dropped with __sock_put() when
 * del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2421
2422 void sock_init_data(struct socket *sock, struct sock *sk)
2423 {
2424         skb_queue_head_init(&sk->sk_receive_queue);
2425         skb_queue_head_init(&sk->sk_write_queue);
2426         skb_queue_head_init(&sk->sk_error_queue);
2427
2428         sk->sk_send_head        =       NULL;
2429
2430         init_timer(&sk->sk_timer);
2431
2432         sk->sk_allocation       =       GFP_KERNEL;
2433         sk->sk_rcvbuf           =       sysctl_rmem_default;
2434         sk->sk_sndbuf           =       sysctl_wmem_default;
2435         sk->sk_state            =       TCP_CLOSE;
2436         sk_set_socket(sk, sock);
2437
2438         sock_set_flag(sk, SOCK_ZAPPED);
2439
2440         if (sock) {
2441                 sk->sk_type     =       sock->type;
2442                 sk->sk_wq       =       sock->wq;
2443                 sock->sk        =       sk;
2444         } else
2445                 sk->sk_wq       =       NULL;
2446
2447         rwlock_init(&sk->sk_callback_lock);
2448         lockdep_set_class_and_name(&sk->sk_callback_lock,
2449                         af_callback_keys + sk->sk_family,
2450                         af_family_clock_key_strings[sk->sk_family]);
2451
2452         sk->sk_state_change     =       sock_def_wakeup;
2453         sk->sk_data_ready       =       sock_def_readable;
2454         sk->sk_write_space      =       sock_def_write_space;
2455         sk->sk_error_report     =       sock_def_error_report;
2456         sk->sk_destruct         =       sock_def_destruct;
2457
2458         sk->sk_frag.page        =       NULL;
2459         sk->sk_frag.offset      =       0;
2460         sk->sk_peek_off         =       -1;
2461
2462         sk->sk_peer_pid         =       NULL;
2463         sk->sk_peer_cred        =       NULL;
2464         sk->sk_write_pending    =       0;
2465         sk->sk_rcvlowat         =       1;
2466         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2467         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2468
2469         sk->sk_stamp = ktime_set(-1L, 0);
2470
2471 #ifdef CONFIG_NET_RX_BUSY_POLL
2472         sk->sk_napi_id          =       0;
2473         sk->sk_ll_usec          =       sysctl_net_busy_read;
2474 #endif
2475
2476         sk->sk_max_pacing_rate = ~0U;
2477         sk->sk_pacing_rate = ~0U;
2478         sk->sk_incoming_cpu = -1;
2479         /*
2480          * Before updating sk_refcnt, we must commit prior changes to memory
2481          * (Documentation/RCU/rculist_nulls.txt for details)
2482          */
2483         smp_wmb();
2484         atomic_set(&sk->sk_refcnt, 1);
2485         atomic_set(&sk->sk_drops, 0);
2486 }
2487 EXPORT_SYMBOL(sock_init_data);
2488
2489 void lock_sock_nested(struct sock *sk, int subclass)
2490 {
2491         might_sleep();
2492         spin_lock_bh(&sk->sk_lock.slock);
2493         if (sk->sk_lock.owned)
2494                 __lock_sock(sk);
2495         sk->sk_lock.owned = 1;
2496         spin_unlock(&sk->sk_lock.slock);
2497         /*
2498          * The sk_lock has mutex_lock() semantics here:
2499          */
2500         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2501         local_bh_enable();
2502 }
2503 EXPORT_SYMBOL(lock_sock_nested);
2504
2505 void release_sock(struct sock *sk)
2506 {
2507         spin_lock_bh(&sk->sk_lock.slock);
2508         if (sk->sk_backlog.tail)
2509                 __release_sock(sk);
2510
2511         /* Warning : release_cb() might need to release sk ownership,
2512          * ie call sock_release_ownership(sk) before us.
2513          */
2514         if (sk->sk_prot->release_cb)
2515                 sk->sk_prot->release_cb(sk);
2516
2517         sock_release_ownership(sk);
2518         if (waitqueue_active(&sk->sk_lock.wq))
2519                 wake_up(&sk->sk_lock.wq);
2520         spin_unlock_bh(&sk->sk_lock.slock);
2521 }
2522 EXPORT_SYMBOL(release_sock);
2523
2524 /**
2525  * lock_sock_fast - fast version of lock_sock
2526  * @sk: socket
2527  *
2528  * This version should be used for very small section, where process wont block
2529  * return false if fast path is taken
2530  *   sk_lock.slock locked, owned = 0, BH disabled
2531  * return true if slow path is taken
2532  *   sk_lock.slock unlocked, owned = 1, BH enabled
2533  */
2534 bool lock_sock_fast(struct sock *sk)
2535 {
2536         might_sleep();
2537         spin_lock_bh(&sk->sk_lock.slock);
2538
2539         if (!sk->sk_lock.owned)
2540                 /*
2541                  * Note : We must disable BH
2542                  */
2543                 return false;
2544
2545         __lock_sock(sk);
2546         sk->sk_lock.owned = 1;
2547         spin_unlock(&sk->sk_lock.slock);
2548         /*
2549          * The sk_lock has mutex_lock() semantics here:
2550          */
2551         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2552         local_bh_enable();
2553         return true;
2554 }
2555 EXPORT_SYMBOL(lock_sock_fast);
2556
2557 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2558 {
2559         struct timeval tv;
2560         if (!sock_flag(sk, SOCK_TIMESTAMP))
2561                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2562         tv = ktime_to_timeval(sk->sk_stamp);
2563         if (tv.tv_sec == -1)
2564                 return -ENOENT;
2565         if (tv.tv_sec == 0) {
2566                 sk->sk_stamp = ktime_get_real();
2567                 tv = ktime_to_timeval(sk->sk_stamp);
2568         }
2569         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2570 }
2571 EXPORT_SYMBOL(sock_get_timestamp);
2572
2573 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2574 {
2575         struct timespec ts;
2576         if (!sock_flag(sk, SOCK_TIMESTAMP))
2577                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2578         ts = ktime_to_timespec(sk->sk_stamp);
2579         if (ts.tv_sec == -1)
2580                 return -ENOENT;
2581         if (ts.tv_sec == 0) {
2582                 sk->sk_stamp = ktime_get_real();
2583                 ts = ktime_to_timespec(sk->sk_stamp);
2584         }
2585         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2586 }
2587 EXPORT_SYMBOL(sock_get_timestampns);
2588
2589 void sock_enable_timestamp(struct sock *sk, int flag)
2590 {
2591         if (!sock_flag(sk, flag)) {
2592                 unsigned long previous_flags = sk->sk_flags;
2593
2594                 sock_set_flag(sk, flag);
2595                 /*
2596                  * we just set one of the two flags which require net
2597                  * time stamping, but time stamping might have been on
2598                  * already because of the other one
2599                  */
2600                 if (sock_needs_netstamp(sk) &&
2601                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2602                         net_enable_timestamp();
2603         }
2604 }
2605
2606 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2607                        int level, int type)
2608 {
2609         struct sock_exterr_skb *serr;
2610         struct sk_buff *skb;
2611         int copied, err;
2612
2613         err = -EAGAIN;
2614         skb = sock_dequeue_err_skb(sk);
2615         if (skb == NULL)
2616                 goto out;
2617
2618         copied = skb->len;
2619         if (copied > len) {
2620                 msg->msg_flags |= MSG_TRUNC;
2621                 copied = len;
2622         }
2623         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2624         if (err)
2625                 goto out_free_skb;
2626
2627         sock_recv_timestamp(msg, sk, skb);
2628
2629         serr = SKB_EXT_ERR(skb);
2630         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2631
2632         msg->msg_flags |= MSG_ERRQUEUE;
2633         err = copied;
2634
2635 out_free_skb:
2636         kfree_skb(skb);
2637 out:
2638         return err;
2639 }
2640 EXPORT_SYMBOL(sock_recv_errqueue);
2641
2642 /*
2643  *      Get a socket option on an socket.
2644  *
2645  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2646  *      asynchronous errors should be reported by getsockopt. We assume
2647  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2648  */
2649 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2650                            char __user *optval, int __user *optlen)
2651 {
2652         struct sock *sk = sock->sk;
2653
2654         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2655 }
2656 EXPORT_SYMBOL(sock_common_getsockopt);
2657
#ifdef CONFIG_COMPAT
/*
 *      Compat variant of sock_common_getsockopt(): use the protocol's
 *      compat_getsockopt handler when it provides one, otherwise fall
 *      back to its regular getsockopt handler.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt == NULL)
                return sk->sk_prot->getsockopt(sk, level, optname,
                                               optval, optlen);

        return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                              optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2671
2672 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2673                         int flags)
2674 {
2675         struct sock *sk = sock->sk;
2676         int addr_len = 0;
2677         int err;
2678
2679         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2680                                    flags & ~MSG_DONTWAIT, &addr_len);
2681         if (err >= 0)
2682                 msg->msg_namelen = addr_len;
2683         return err;
2684 }
2685 EXPORT_SYMBOL(sock_common_recvmsg);
2686
2687 /*
2688  *      Set socket options on an inet socket.
2689  */
2690 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2691                            char __user *optval, unsigned int optlen)
2692 {
2693         struct sock *sk = sock->sk;
2694
2695         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2696 }
2697 EXPORT_SYMBOL(sock_common_setsockopt);
2698
#ifdef CONFIG_COMPAT
/*
 *      Compat variant of sock_common_setsockopt(): use the protocol's
 *      compat_setsockopt handler when it provides one, otherwise fall
 *      back to its regular setsockopt handler.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_setsockopt == NULL)
                return sk->sk_prot->setsockopt(sk, level, optname,
                                               optval, optlen);

        return sk->sk_prot->compat_setsockopt(sk, level, optname,
                                              optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2712
2713 void sk_common_release(struct sock *sk)
2714 {
2715         if (sk->sk_prot->destroy)
2716                 sk->sk_prot->destroy(sk);
2717
2718         /*
2719          * Observation: when sock_common_release is called, processes have
2720          * no access to socket. But net still has.
2721          * Step one, detach it from networking:
2722          *
2723          * A. Remove from hash tables.
2724          */
2725
2726         sk->sk_prot->unhash(sk);
2727
2728         /*
2729          * In this point socket cannot receive new packets, but it is possible
2730          * that some packets are in flight because some CPU runs receiver and
2731          * did hash table lookup before we unhashed socket. They will achieve
2732          * receive queue and will be purged by socket destructor.
2733          *
2734          * Also we still have packets pending on receive queue and probably,
2735          * our own packets waiting in device queues. sock_destroy will drain
2736          * receive queue, but transmitted packets will delay socket destruction
2737          * until the last reference will be released.
2738          */
2739
2740         sock_orphan(sk);
2741
2742         xfrm_sk_free_policy(sk);
2743
2744         sk_refcnt_debug_release(sk);
2745
2746         sock_put(sk);
2747 }
2748 EXPORT_SYMBOL(sk_common_release);
2749
2750 #ifdef CONFIG_PROC_FS
/* Number of entries in struct prot_inuse and of bits in proto_inuse_idx. */
#define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* Array of counters, indexed by a protocol's inuse_idx. */
struct prot_inuse {
        int val[PROTO_INUSE_NR];
};

/* Bitmap of the indices currently handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2757
2758 #ifdef CONFIG_NET_NS
2759 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2760 {
2761         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2762 }
2763 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2764
2765 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2766 {
2767         int cpu, idx = prot->inuse_idx;
2768         int res = 0;
2769
2770         for_each_possible_cpu(cpu)
2771                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2772
2773         return res >= 0 ? res : 0;
2774 }
2775 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2776
2777 static int __net_init sock_inuse_init_net(struct net *net)
2778 {
2779         net->core.inuse = alloc_percpu(struct prot_inuse);
2780         return net->core.inuse ? 0 : -ENOMEM;
2781 }
2782
2783 static void __net_exit sock_inuse_exit_net(struct net *net)
2784 {
2785         free_percpu(net->core.inuse);
2786 }
2787
2788 static struct pernet_operations net_inuse_ops = {
2789         .init = sock_inuse_init_net,
2790         .exit = sock_inuse_exit_net,
2791 };
2792
2793 static __init int net_inuse_init(void)
2794 {
2795         if (register_pernet_subsys(&net_inuse_ops))
2796                 panic("Cannot initialize net inuse counters");
2797
2798         return 0;
2799 }
2800
2801 core_initcall(net_inuse_init);
2802 #else
2803 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2804
2805 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2806 {
2807         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2808 }
2809 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2810
2811 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2812 {
2813         int cpu, idx = prot->inuse_idx;
2814         int res = 0;
2815
2816         for_each_possible_cpu(cpu)
2817                 res += per_cpu(prot_inuse, cpu).val[idx];
2818
2819         return res >= 0 ? res : 0;
2820 }
2821 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2822 #endif
2823
2824 static void assign_proto_idx(struct proto *prot)
2825 {
2826         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2827
2828         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2829                 pr_err("PROTO_INUSE_NR exhausted\n");
2830                 return;
2831         }
2832
2833         set_bit(prot->inuse_idx, proto_inuse_idx);
2834 }
2835
2836 static void release_proto_idx(struct proto *prot)
2837 {
2838         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2839                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2840 }
2841 #else
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* Without CONFIG_PROC_FS there are no in-use counters: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2849 #endif
2850
2851 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2852 {
2853         if (!rsk_prot)
2854                 return;
2855         kfree(rsk_prot->slab_name);
2856         rsk_prot->slab_name = NULL;
2857         kmem_cache_destroy(rsk_prot->slab);
2858         rsk_prot->slab = NULL;
2859 }
2860
2861 static int req_prot_init(const struct proto *prot)
2862 {
2863         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2864
2865         if (!rsk_prot)
2866                 return 0;
2867
2868         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2869                                         prot->name);
2870         if (!rsk_prot->slab_name)
2871                 return -ENOMEM;
2872
2873         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2874                                            rsk_prot->obj_size, 0,
2875                                            prot->slab_flags, NULL);
2876
2877         if (!rsk_prot->slab) {
2878                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2879                         prot->name);
2880                 return -ENOMEM;
2881         }
2882         return 0;
2883 }
2884
2885 int proto_register(struct proto *prot, int alloc_slab)
2886 {
2887         if (alloc_slab) {
2888                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2889                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2890                                         NULL);
2891
2892                 if (prot->slab == NULL) {
2893                         pr_crit("%s: Can't create sock SLAB cache!\n",
2894                                 prot->name);
2895                         goto out;
2896                 }
2897
2898                 if (req_prot_init(prot))
2899                         goto out_free_request_sock_slab;
2900
2901                 if (prot->twsk_prot != NULL) {
2902                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2903
2904                         if (prot->twsk_prot->twsk_slab_name == NULL)
2905                                 goto out_free_request_sock_slab;
2906
2907                         prot->twsk_prot->twsk_slab =
2908                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2909                                                   prot->twsk_prot->twsk_obj_size,
2910                                                   0,
2911                                                   prot->slab_flags,
2912                                                   NULL);
2913                         if (prot->twsk_prot->twsk_slab == NULL)
2914                                 goto out_free_timewait_sock_slab_name;
2915                 }
2916         }
2917
2918         mutex_lock(&proto_list_mutex);
2919         list_add(&prot->node, &proto_list);
2920         assign_proto_idx(prot);
2921         mutex_unlock(&proto_list_mutex);
2922         return 0;
2923
2924 out_free_timewait_sock_slab_name:
2925         kfree(prot->twsk_prot->twsk_slab_name);
2926 out_free_request_sock_slab:
2927         req_prot_cleanup(prot->rsk_prot);
2928
2929         kmem_cache_destroy(prot->slab);
2930         prot->slab = NULL;
2931 out:
2932         return -ENOBUFS;
2933 }
2934 EXPORT_SYMBOL(proto_register);
2935
2936 void proto_unregister(struct proto *prot)
2937 {
2938         mutex_lock(&proto_list_mutex);
2939         release_proto_idx(prot);
2940         list_del(&prot->node);
2941         mutex_unlock(&proto_list_mutex);
2942
2943         kmem_cache_destroy(prot->slab);
2944         prot->slab = NULL;
2945
2946         req_prot_cleanup(prot->rsk_prot);
2947
2948         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2949                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2950                 kfree(prot->twsk_prot->twsk_slab_name);
2951                 prot->twsk_prot->twsk_slab = NULL;
2952         }
2953 }
2954 EXPORT_SYMBOL(proto_unregister);
2955
2956 #ifdef CONFIG_PROC_FS
2957 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2958         __acquires(proto_list_mutex)
2959 {
2960         mutex_lock(&proto_list_mutex);
2961         return seq_list_start_head(&proto_list, *pos);
2962 }
2963
2964 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2965 {
2966         return seq_list_next(v, &proto_list, pos);
2967 }
2968
2969 static void proto_seq_stop(struct seq_file *seq, void *v)
2970         __releases(proto_list_mutex)
2971 {
2972         mutex_unlock(&proto_list_mutex);
2973 }
2974
/* Return 'y' if @method is non-NULL, 'n' otherwise. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
2979 static long sock_prot_memory_allocated(struct proto *proto)
2980 {
2981         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2982 }
2983
2984 static char *sock_prot_memory_pressure(struct proto *proto)
2985 {
2986         return proto->memory_pressure != NULL ?
2987         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2988 }
2989
2990 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2991 {
2992
2993         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2994                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2995                    proto->name,
2996                    proto->obj_size,
2997                    sock_prot_inuse_get(seq_file_net(seq), proto),
2998                    sock_prot_memory_allocated(proto),
2999                    sock_prot_memory_pressure(proto),
3000                    proto->max_header,
3001                    proto->slab == NULL ? "no" : "yes",
3002                    module_name(proto->owner),
3003                    proto_method_implemented(proto->close),
3004                    proto_method_implemented(proto->connect),
3005                    proto_method_implemented(proto->disconnect),
3006                    proto_method_implemented(proto->accept),
3007                    proto_method_implemented(proto->ioctl),
3008                    proto_method_implemented(proto->init),
3009                    proto_method_implemented(proto->destroy),
3010                    proto_method_implemented(proto->shutdown),
3011                    proto_method_implemented(proto->setsockopt),
3012                    proto_method_implemented(proto->getsockopt),
3013                    proto_method_implemented(proto->sendmsg),
3014                    proto_method_implemented(proto->recvmsg),
3015                    proto_method_implemented(proto->sendpage),
3016                    proto_method_implemented(proto->bind),
3017                    proto_method_implemented(proto->backlog_rcv),
3018                    proto_method_implemented(proto->hash),
3019                    proto_method_implemented(proto->unhash),
3020                    proto_method_implemented(proto->get_port),
3021                    proto_method_implemented(proto->enter_memory_pressure));
3022 }
3023
3024 static int proto_seq_show(struct seq_file *seq, void *v)
3025 {
3026         if (v == &proto_list)
3027                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3028                            "protocol",
3029                            "size",
3030                            "sockets",
3031                            "memory",
3032                            "press",
3033                            "maxhdr",
3034                            "slab",
3035                            "module",
3036                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3037         else
3038                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3039         return 0;
3040 }
3041
3042 static const struct seq_operations proto_seq_ops = {
3043         .start  = proto_seq_start,
3044         .next   = proto_seq_next,
3045         .stop   = proto_seq_stop,
3046         .show   = proto_seq_show,
3047 };
3048
3049 static int proto_seq_open(struct inode *inode, struct file *file)
3050 {
3051         return seq_open_net(inode, file, &proto_seq_ops,
3052                             sizeof(struct seq_net_private));
3053 }
3054
3055 static const struct file_operations proto_seq_fops = {
3056         .owner          = THIS_MODULE,
3057         .open           = proto_seq_open,
3058         .read           = seq_read,
3059         .llseek         = seq_lseek,
3060         .release        = seq_release_net,
3061 };
3062
3063 static __net_init int proto_init_net(struct net *net)
3064 {
3065         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3066                 return -ENOMEM;
3067
3068         return 0;
3069 }
3070
3071 static __net_exit void proto_exit_net(struct net *net)
3072 {
3073         remove_proc_entry("protocols", net->proc_net);
3074 }
3075
3076
3077 static __net_initdata struct pernet_operations proto_net_ops = {
3078         .init = proto_init_net,
3079         .exit = proto_exit_net,
3080 };
3081
3082 static int __init proto_init(void)
3083 {
3084         return register_pernet_subsys(&proto_net_ops);
3085 }
3086
3087 subsys_initcall(proto_init);
3088
3089 #endif /* PROC_FS */