net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <asm/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134
 135 #include <linux/filter.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #ifdef CONFIG_INET
 140 #include <net/tcp.h>
 141 #endif
 142
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 /**
 149  * sk_ns_capable - General socket capability test
 150  * @sk: Socket to use a capability on or through
 151  * @user_ns: The user namespace of the capability to use
 152  * @cap: The capability to use
 153  *
 154  * Test to see if the opener of the socket had when the socket was
 155  * created and the current process has the capability @cap in the user
 156  * namespace @user_ns.
 157  */
 158 bool sk_ns_capable(const struct sock *sk,
 159                    struct user_namespace *user_ns, int cap)
 160 {
 161         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162                 ns_capable(user_ns, cap);
 163 }
 164 EXPORT_SYMBOL(sk_ns_capable);
 165
 166 /**
 167  * sk_capable - Socket global capability test
 168  * @sk: Socket to use a capability on or through
 169  * @cap: The global capability to use
 170  *
 171  * Test to see if the opener of the socket had when the socket was
 172  * created and the current process has the capability @cap in all user
 173  * namespaces.
 174  */
 175 bool sk_capable(const struct sock *sk, int cap)
 176 {
 177         return sk_ns_capable(sk, &init_user_ns, cap);
 178 }
 179 EXPORT_SYMBOL(sk_capable);
 180
 181 /**
 182  * sk_net_capable - Network namespace socket capability test
 183  * @sk: Socket to use a capability on or through
 184  * @cap: The capability to use
 185  *
 186  * Test to see if the opener of the socket had when the socket was created
 187  * and the current process has the capability @cap over the network namespace
 188  * the socket is a member of.
 189  */
 190 bool sk_net_capable(const struct sock *sk, int cap)
 191 {
 192         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193 }
 194 EXPORT_SYMBOL(sk_net_capable);
 195
 196
 197 #ifdef CONFIG_MEMCG_KMEM
 198 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 199 {
 200         struct proto *proto;
 201         int ret = 0;
 202
 203         mutex_lock(&proto_list_mutex);
 204         list_for_each_entry(proto, &proto_list, node) {
 205                 if (proto->init_cgroup) {
 206                         ret = proto->init_cgroup(memcg, ss);
 207                         if (ret)
 208                                 goto out;
 209                 }
 210         }
 211
 212         mutex_unlock(&proto_list_mutex);
 213         return ret;
 214 out:
 215         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 216                 if (proto->destroy_cgroup)
 217                         proto->destroy_cgroup(memcg);
 218         mutex_unlock(&proto_list_mutex);
 219         return ret;
 220 }
 221
 222 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 223 {
 224         struct proto *proto;
 225
 226         mutex_lock(&proto_list_mutex);
 227         list_for_each_entry_reverse(proto, &proto_list, node)
 228                 if (proto->destroy_cgroup)
 229                         proto->destroy_cgroup(memcg);
 230         mutex_unlock(&proto_list_mutex);
 231 }
 232 #endif
 233
 234 /*
 235  * Each address family might have different locking rules, so we have
 236  * one slock key per address family:
 237  */
 238 static struct lock_class_key af_family_keys[AF_MAX];
 239 static struct lock_class_key af_family_slock_keys[AF_MAX];
 240
 241 #if defined(CONFIG_MEMCG_KMEM)
 242 struct static_key memcg_socket_limit_enabled;
 243 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 244 #endif
 245
 246 /*
 247  * Make lock validator output more readable. (we pre-construct these
 248  * strings build-time, so that runtime initialization of socket
 249  * locks is fast):
 250  */
 251 static const char *const af_family_key_strings[AF_MAX+1] = {
 252   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 253   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 254   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 255   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 256   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 257   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 258   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 259   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 260   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 261   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 262   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 263   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 264   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 265   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
 266 };
 267 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 268   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 269   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 270   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 271   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 272   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 273   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 274   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 275   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 276   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 277   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 278   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 279   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 280   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 281   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
 282 };
 283 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 284   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 285   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 286   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 287   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 288   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 289   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 290   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 291   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 292   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 293   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 294   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 295   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 296   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 297   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
 298 };
 299
 300 /*
 301  * sk_callback_lock locking rules are per-address-family,
 302  * so split the lock classes by using a per-AF key:
 303  */
 304 static struct lock_class_key af_callback_keys[AF_MAX];
 305
 306 /* Take into consideration the size of the struct sk_buff overhead in the
 307  * determination of these values, since that is non-constant across
 308  * platforms.  This makes socket queueing behavior and performance
 309  * not depend upon such differences.
 310  */
 311 #define _SK_MEM_PACKETS         256
 312 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 313 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 314 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 315
 316 /* Run time adjustable parameters. */
 317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 318 EXPORT_SYMBOL(sysctl_wmem_max);
 319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 320 EXPORT_SYMBOL(sysctl_rmem_max);
 321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 323
 324 /* Maximal space eaten by iovec or ancillary data plus some space */
 325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 326 EXPORT_SYMBOL(sysctl_optmem_max);
 327
 328 int sysctl_tstamp_allow_data __read_mostly = 1;
 329
 330 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 331 EXPORT_SYMBOL_GPL(memalloc_socks);
 332
 333 /**
 334  * sk_set_memalloc - sets %SOCK_MEMALLOC
 335  * @sk: socket to set it on
 336  *
 337  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 338  * It's the responsibility of the admin to adjust min_free_kbytes
 339  * to meet the requirements
 340  */
 341 void sk_set_memalloc(struct sock *sk)
 342 {
 343         sock_set_flag(sk, SOCK_MEMALLOC);
 344         sk->sk_allocation |= __GFP_MEMALLOC;
 345         static_key_slow_inc(&memalloc_socks);
 346 }
 347 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 348
 349 void sk_clear_memalloc(struct sock *sk)
 350 {
 351         sock_reset_flag(sk, SOCK_MEMALLOC);
 352         sk->sk_allocation &= ~__GFP_MEMALLOC;
 353         static_key_slow_dec(&memalloc_socks);
 354
 355         /*
 356          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 357          * progress of swapping. SOCK_MEMALLOC may be cleared while
 358          * it has rmem allocations due to the last swapfile being deactivated
 359          * but there is a risk that the socket is unusable due to exceeding
 360          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 361          */
 362         sk_mem_reclaim(sk);
 363 }
 364 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 365
 366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 367 {
 368         int ret;
 369         unsigned long pflags = current->flags;
 370
 371         /* these should have been dropped before queueing */
 372         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 373
 374         current->flags |= PF_MEMALLOC;
 375         ret = sk->sk_backlog_rcv(sk, skb);
 376         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 377
 378         return ret;
 379 }
 380 EXPORT_SYMBOL(__sk_backlog_rcv);
 381
 382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 383 {
 384         struct timeval tv;
 385
 386         if (optlen < sizeof(tv))
 387                 return -EINVAL;
 388         if (copy_from_user(&tv, optval, sizeof(tv)))
 389                 return -EFAULT;
 390         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 391                 return -EDOM;
 392
 393         if (tv.tv_sec < 0) {
 394                 static int warned __read_mostly;
 395
 396                 *timeo_p = 0;
 397                 if (warned < 10 && net_ratelimit()) {
 398                         warned++;
 399                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 400                                 __func__, current->comm, task_pid_nr(current));
 401                 }
 402                 return 0;
 403         }
 404         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 405         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 406                 return 0;
 407         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 408                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 409         return 0;
 410 }
 411
 412 static void sock_warn_obsolete_bsdism(const char *name)
 413 {
 414         static int warned;
 415         static char warncomm[TASK_COMM_LEN];
 416         if (strcmp(warncomm, current->comm) && warned < 5) {
 417                 strcpy(warncomm,  current->comm);
 418                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 419                         warncomm, name);
 420                 warned++;
 421         }
 422 }
 423
 424 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 425 {
 426         if (sk->sk_flags & flags) {
 427                 sk->sk_flags &= ~flags;
 428                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 429                         net_disable_timestamp();
 430         }
 431 }
 432
 433
 434 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 435 {
 436         int err;
 437         unsigned long flags;
 438         struct sk_buff_head *list = &sk->sk_receive_queue;
 439
 440         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 441                 atomic_inc(&sk->sk_drops);
 442                 trace_sock_rcvqueue_full(sk, skb);
 443                 return -ENOMEM;
 444         }
 445
 446         err = sk_filter(sk, skb);
 447         if (err)
 448                 return err;
 449
 450         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 451                 atomic_inc(&sk->sk_drops);
 452                 return -ENOBUFS;
 453         }
 454
 455         skb->dev = NULL;
 456         skb_set_owner_r(skb, sk);
 457
 458         /* we escape from rcu protected region, make sure we dont leak
 459          * a norefcounted dst
 460          */
 461         skb_dst_force(skb);
 462
 463         spin_lock_irqsave(&list->lock, flags);
 464         sock_skb_set_dropcount(sk, skb);
 465         __skb_queue_tail(list, skb);
 466         spin_unlock_irqrestore(&list->lock, flags);
 467
 468         if (!sock_flag(sk, SOCK_DEAD))
 469                 sk->sk_data_ready(sk);
 470         return 0;
 471 }
 472 EXPORT_SYMBOL(sock_queue_rcv_skb);
 473
 474 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 475 {
 476         int rc = NET_RX_SUCCESS;
 477
 478         if (sk_filter(sk, skb))
 479                 goto discard_and_relse;
 480
 481         skb->dev = NULL;
 482
 483         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 484                 atomic_inc(&sk->sk_drops);
 485                 goto discard_and_relse;
 486         }
 487         if (nested)
 488                 bh_lock_sock_nested(sk);
 489         else
 490                 bh_lock_sock(sk);
 491         if (!sock_owned_by_user(sk)) {
 492                 /*
 493                  * trylock + unlock semantics:
 494                  */
 495                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 496
 497                 rc = sk_backlog_rcv(sk, skb);
 498
 499                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 500         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 501                 bh_unlock_sock(sk);
 502                 atomic_inc(&sk->sk_drops);
 503                 goto discard_and_relse;
 504         }
 505
 506         bh_unlock_sock(sk);
 507 out:
 508         sock_put(sk);
 509         return rc;
 510 discard_and_relse:
 511         kfree_skb(skb);
 512         goto out;
 513 }
 514 EXPORT_SYMBOL(sk_receive_skb);
 515
 516 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 517 {
 518         struct dst_entry *dst = __sk_dst_get(sk);
 519
 520         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 521                 sk_tx_queue_clear(sk);
 522                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 523                 dst_release(dst);
 524                 return NULL;
 525         }
 526
 527         return dst;
 528 }
 529 EXPORT_SYMBOL(__sk_dst_check);
 530
 531 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 532 {
 533         struct dst_entry *dst = sk_dst_get(sk);
 534
 535         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 536                 sk_dst_reset(sk);
 537                 dst_release(dst);
 538                 return NULL;
 539         }
 540
 541         return dst;
 542 }
 543 EXPORT_SYMBOL(sk_dst_check);
 544
 545 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 546                                 int optlen)
 547 {
 548         int ret = -ENOPROTOOPT;
 549 #ifdef CONFIG_NETDEVICES
 550         struct net *net = sock_net(sk);
 551         char devname[IFNAMSIZ];
 552         int index;
 553
 554         /* Sorry... */
 555         ret = -EPERM;
 556         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 557                 goto out;
 558
 559         ret = -EINVAL;
 560         if (optlen < 0)
 561                 goto out;
 562
 563         /* Bind this socket to a particular device like "eth0",
 564          * as specified in the passed interface name. If the
 565          * name is "" or the option length is zero the socket
 566          * is not bound.
 567          */
 568         if (optlen > IFNAMSIZ - 1)
 569                 optlen = IFNAMSIZ - 1;
 570         memset(devname, 0, sizeof(devname));
 571
 572         ret = -EFAULT;
 573         if (copy_from_user(devname, optval, optlen))
 574                 goto out;
 575
 576         index = 0;
 577         if (devname[0] != '\0') {
 578                 struct net_device *dev;
 579
 580                 rcu_read_lock();
 581                 dev = dev_get_by_name_rcu(net, devname);
 582                 if (dev)
 583                         index = dev->ifindex;
 584                 rcu_read_unlock();
 585                 ret = -ENODEV;
 586                 if (!dev)
 587                         goto out;
 588         }
 589
 590         lock_sock(sk);
 591         sk->sk_bound_dev_if = index;
 592         sk_dst_reset(sk);
 593         release_sock(sk);
 594
 595         ret = 0;
 596
 597 out:
 598 #endif
 599
 600         return ret;
 601 }
 602
 603 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 604                                 int __user *optlen, int len)
 605 {
 606         int ret = -ENOPROTOOPT;
 607 #ifdef CONFIG_NETDEVICES
 608         struct net *net = sock_net(sk);
 609         char devname[IFNAMSIZ];
 610
 611         if (sk->sk_bound_dev_if == 0) {
 612                 len = 0;
 613                 goto zero;
 614         }
 615
 616         ret = -EINVAL;
 617         if (len < IFNAMSIZ)
 618                 goto out;
 619
 620         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 621         if (ret)
 622                 goto out;
 623
 624         len = strlen(devname) + 1;
 625
 626         ret = -EFAULT;
 627         if (copy_to_user(optval, devname, len))
 628                 goto out;
 629
 630 zero:
 631         ret = -EFAULT;
 632         if (put_user(len, optlen))
 633                 goto out;
 634
 635         ret = 0;
 636
 637 out:
 638 #endif
 639
 640         return ret;
 641 }
 642
 643 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 644 {
 645         if (valbool)
 646                 sock_set_flag(sk, bit);
 647         else
 648                 sock_reset_flag(sk, bit);
 649 }
 650
 651 bool sk_mc_loop(struct sock *sk)
 652 {
 653         if (dev_recursion_level())
 654                 return false;
 655         if (!sk)
 656                 return true;
 657         switch (sk->sk_family) {
 658         case AF_INET:
 659                 return inet_sk(sk)->mc_loop;
 660 #if IS_ENABLED(CONFIG_IPV6)
 661         case AF_INET6:
 662                 return inet6_sk(sk)->mc_loop;
 663 #endif
 664         }
 665         WARN_ON(1);
 666         return true;
 667 }
 668 EXPORT_SYMBOL(sk_mc_loop);
 669
 670 /*
 671  *      This is meant for all protocols to use and covers goings on
 672  *      at the socket level. Everything here is generic.
 673  */
 674
 675 int sock_setsockopt(struct socket *sock, int level, int optname,
 676                     char __user *optval, unsigned int optlen)
 677 {
 678         struct sock *sk = sock->sk;
 679         int val;
 680         int valbool;
 681         struct linger ling;
 682         int ret = 0;
 683
 684         /*
 685          *      Options without arguments
 686          */
 687
 688         if (optname == SO_BINDTODEVICE)
 689                 return sock_setbindtodevice(sk, optval, optlen);
 690
 691         if (optlen < sizeof(int))
 692                 return -EINVAL;
 693
 694         if (get_user(val, (int __user *)optval))
 695                 return -EFAULT;
 696
 697         valbool = val ? 1 : 0;
 698
 699         lock_sock(sk);
 700
 701         switch (optname) {
 702         case SO_DEBUG:
 703                 if (val && !capable(CAP_NET_ADMIN))
 704                         ret = -EACCES;
 705                 else
 706                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 707                 break;
 708         case SO_REUSEADDR:
 709                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 710                 break;
 711         case SO_REUSEPORT:
 712                 sk->sk_reuseport = valbool;
 713                 break;
 714         case SO_TYPE:
 715         case SO_PROTOCOL:
 716         case SO_DOMAIN:
 717         case SO_ERROR:
 718                 ret = -ENOPROTOOPT;
 719                 break;
 720         case SO_DONTROUTE:
 721                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 722                 break;
 723         case SO_BROADCAST:
 724                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 725                 break;
 726         case SO_SNDBUF:
 727                 /* Don't error on this BSD doesn't and if you think
 728                  * about it this is right. Otherwise apps have to
 729                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 730                  * are treated in BSD as hints
 731                  */
 732                 val = min_t(u32, val, sysctl_wmem_max);
 733 set_sndbuf:
 734                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 735                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 736                 /* Wake up sending tasks if we upped the value. */
 737                 sk->sk_write_space(sk);
 738                 break;
 739
 740         case SO_SNDBUFFORCE:
 741                 if (!capable(CAP_NET_ADMIN)) {
 742                         ret = -EPERM;
 743                         break;
 744                 }
 745                 goto set_sndbuf;
 746
 747         case SO_RCVBUF:
 748                 /* Don't error on this BSD doesn't and if you think
 749                  * about it this is right. Otherwise apps have to
 750                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 751                  * are treated in BSD as hints
 752                  */
 753                 val = min_t(u32, val, sysctl_rmem_max);
 754 set_rcvbuf:
 755                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 756                 /*
 757                  * We double it on the way in to account for
 758                  * "struct sk_buff" etc. overhead.   Applications
 759                  * assume that the SO_RCVBUF setting they make will
 760                  * allow that much actual data to be received on that
 761                  * socket.
 762                  *
 763                  * Applications are unaware that "struct sk_buff" and
 764                  * other overheads allocate from the receive buffer
 765                  * during socket buffer allocation.
 766                  *
 767                  * And after considering the possible alternatives,
 768                  * returning the value we actually used in getsockopt
 769                  * is the most desirable behavior.
 770                  */
 771                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 772                 break;
 773
 774         case SO_RCVBUFFORCE:
 775                 if (!capable(CAP_NET_ADMIN)) {
 776                         ret = -EPERM;
 777                         break;
 778                 }
 779                 goto set_rcvbuf;
 780
 781         case SO_KEEPALIVE:
 782 #ifdef CONFIG_INET
 783                 if (sk->sk_protocol == IPPROTO_TCP &&
 784                     sk->sk_type == SOCK_STREAM)
 785                         tcp_set_keepalive(sk, valbool);
 786 #endif
 787                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 788                 break;
 789
 790         case SO_OOBINLINE:
 791                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 792                 break;
 793
 794         case SO_NO_CHECK:
 795                 sk->sk_no_check_tx = valbool;
 796                 break;
 797
 798         case SO_PRIORITY:
 799                 if ((val >= 0 && val <= 6) ||
 800                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 801                         sk->sk_priority = val;
 802                 else
 803                         ret = -EPERM;
 804                 break;
 805
 806         case SO_LINGER:
 807                 if (optlen < sizeof(ling)) {
 808                         ret = -EINVAL;  /* 1003.1g */
 809                         break;
 810                 }
 811                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 812                         ret = -EFAULT;
 813                         break;
 814                 }
 815                 if (!ling.l_onoff)
 816                         sock_reset_flag(sk, SOCK_LINGER);
 817                 else {
 818 #if (BITS_PER_LONG == 32)
 819                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 820                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 821                         else
 822 #endif
 823                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 824                         sock_set_flag(sk, SOCK_LINGER);
 825                 }
 826                 break;
 827
 828         case SO_BSDCOMPAT:
 829                 sock_warn_obsolete_bsdism("setsockopt");
 830                 break;
 831
 832         case SO_PASSCRED:
 833                 if (valbool)
 834                         set_bit(SOCK_PASSCRED, &sock->flags);
 835                 else
 836                         clear_bit(SOCK_PASSCRED, &sock->flags);
 837                 break;
 838
 839         case SO_TIMESTAMP:
 840         case SO_TIMESTAMPNS:
 841                 if (valbool)  {
 842                         if (optname == SO_TIMESTAMP)
 843                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 844                         else
 845                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 846                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 847                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 848                 } else {
 849                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 850                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 851                 }
 852                 break;
 853
 854         case SO_TIMESTAMPING:
 855                 if (val & ~SOF_TIMESTAMPING_MASK) {
 856                         ret = -EINVAL;
 857                         break;
 858                 }
 859
 860                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 861                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 862                         if (sk->sk_protocol == IPPROTO_TCP &&
 863                             sk->sk_type == SOCK_STREAM) {
 864                                 if (sk->sk_state != TCP_ESTABLISHED) {
 865                                         ret = -EINVAL;
 866                                         break;
 867                                 }
 868                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 869                         } else {
 870                                 sk->sk_tskey = 0;
 871                         }
 872                 }
 873                 sk->sk_tsflags = val;
 874                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 875                         sock_enable_timestamp(sk,
 876                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 877                 else
 878                         sock_disable_timestamp(sk,
 879                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 880                 break;
 881
 882         case SO_RCVLOWAT:
 883                 if (val < 0)
 884                         val = INT_MAX;
 885                 sk->sk_rcvlowat = val ? : 1;
 886                 break;
 887
 888         case SO_RCVTIMEO:
 889                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 890                 break;
 891
 892         case SO_SNDTIMEO:
 893                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 894                 break;
 895
 896         case SO_ATTACH_FILTER:
 897                 ret = -EINVAL;
 898                 if (optlen == sizeof(struct sock_fprog)) {
 899                         struct sock_fprog fprog;
 900
 901                         ret = -EFAULT;
 902                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 903                                 break;
 904
 905                         ret = sk_attach_filter(&fprog, sk);
 906                 }
 907                 break;
 908
 909         case SO_ATTACH_BPF:
 910                 ret = -EINVAL;
 911                 if (optlen == sizeof(u32)) {
 912                         u32 ufd;
 913
 914                         ret = -EFAULT;
 915                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 916                                 break;
 917
 918                         ret = sk_attach_bpf(ufd, sk);
 919                 }
 920                 break;
 921
 922         case SO_DETACH_FILTER:
 923                 ret = sk_detach_filter(sk);
 924                 break;
 925
 926         case SO_LOCK_FILTER:
 927                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 928                         ret = -EPERM;
 929                 else
 930                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 931                 break;
 932
 933         case SO_PASSSEC:
 934                 if (valbool)
 935                         set_bit(SOCK_PASSSEC, &sock->flags);
 936                 else
 937                         clear_bit(SOCK_PASSSEC, &sock->flags);
 938                 break;
 939         case SO_MARK:
 940                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 941                         ret = -EPERM;
 942                 else
 943                         sk->sk_mark = val;
 944                 break;
 945
 946         case SO_RXQ_OVFL:
 947                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 948                 break;
 949
 950         case SO_WIFI_STATUS:
 951                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 952                 break;
 953
 954         case SO_PEEK_OFF:
 955                 if (sock->ops->set_peek_off)
 956                         ret = sock->ops->set_peek_off(sk, val);
 957                 else
 958                         ret = -EOPNOTSUPP;
 959                 break;
 960
 961         case SO_NOFCS:
 962                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 963                 break;
 964
 965         case SO_SELECT_ERR_QUEUE:
 966                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 967                 break;
 968
 969 #ifdef CONFIG_NET_RX_BUSY_POLL
 970         case SO_BUSY_POLL:
 971                 /* allow unprivileged users to decrease the value */
 972                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 973                         ret = -EPERM;
 974                 else {
 975                         if (val < 0)
 976                                 ret = -EINVAL;
 977                         else
 978                                 sk->sk_ll_usec = val;
 979                 }
 980                 break;
 981 #endif
 982
 983         case SO_MAX_PACING_RATE:
 984                 sk->sk_max_pacing_rate = val;
 985                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 986                                          sk->sk_max_pacing_rate);
 987                 break;
 988
 989         default:
 990                 ret = -ENOPROTOOPT;
 991                 break;
 992         }
 993         release_sock(sk);
 994         return ret;
 995 }
 996 EXPORT_SYMBOL(sock_setsockopt);
 997
 998
 999 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1000                           struct ucred *ucred)
1001 {
1002         ucred->pid = pid_vnr(pid);
1003         ucred->uid = ucred->gid = -1;
1004         if (cred) {
1005                 struct user_namespace *current_ns = current_user_ns();
1006
1007                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1008                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1009         }
1010 }
1011
/*
 * sock_getsockopt - copy the current value of a socket option to user space
 * @sock: socket being queried; all state is read through sock->sk,
 *        sock->flags and sock->ops
 * @level: not consulted anywhere in this function
 * @optname: SO_* selector of the option to read
 * @optval: user buffer that receives the value
 * @optlen: user pointer; read for the size of @optval and, on success,
 *          rewritten with the number of bytes actually stored
 *
 * Most options are staged in the zero-initialised union 'v' and leave the
 * switch with 'break'; the common tail then copies min(len, lv) bytes to
 * @optval, where 'lv' is the size of the staged value (sizeof(int) unless a
 * case overrides it).  SO_PEERCRED, SO_PEERNAME and SO_GET_FILTER copy their
 * own data and jump straight to 'lenout', which only writes back the length.
 * SO_PEERSEC and SO_BINDTODEVICE hand the whole request to a helper and
 * return its result directly.
 *
 * Returns 0 on success, -EFAULT when user memory cannot be read or written,
 * -EINVAL for a negative length (or, for SO_PEERNAME, a buffer larger than
 * the address), -ENOTCONN when the peer name cannot be obtained,
 * -EOPNOTSUPP for SO_PEEK_OFF without a set_peek_off handler, -ENOPROTOOPT
 * for an unknown @optname, or the value returned by a delegated helper.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        /* Staging area for every option returned through the common tail. */
        union {
                int val;
                struct linger ling;
                struct timeval tm;
        } v;

        /* Number of valid bytes in 'v'; cases returning non-int data adjust it. */
        int lv = sizeof(int);
        int len;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Zero the union so no stale stack bytes are copied to user space. */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;

        case SO_RCVBUF:
                v.val = sk->sk_rcvbuf;
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /*
                 * Report the negated result of sock_error(); when that is
                 * zero, fall back to sk_err_soft, which is fetched and
                 * cleared in one step by xchg().
                 */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = sk->sk_priority;
                break;

        case SO_LINGER:
                /* sk_lingertime is divided by HZ before being reported. */
                lv              = sizeof(v.ling);
                v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Only warns; v.val stays 0 from the memset above. */
                sock_warn_obsolete_bsdism("getsockopt");
                break;

        case SO_TIMESTAMP:
                /* Set only when timestamps are on and the NS variant is off. */
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPING:
                v.val = sk->sk_tsflags;
                break;

        case SO_RCVTIMEO:
                /*
                 * MAX_SCHEDULE_TIMEOUT is reported as a zero timeval;
                 * anything else is split into seconds and microseconds
                 * using HZ.
                 */
                lv = sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_SNDTIMEO:
                /* Same conversion as SO_RCVTIMEO, applied to sk_sndtimeo. */
                lv = sizeof(struct timeval);
                if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
                        v.tm.tv_sec = 0;
                        v.tm.tv_usec = 0;
                } else {
                        v.tm.tv_sec = sk->sk_sndtimeo / HZ;
                        v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
                }
                break;

        case SO_RCVLOWAT:
                v.val = sk->sk_rcvlowat;
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                /* Own copy-out path: at most sizeof(peercred) bytes. */
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                if (copy_to_user(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                char address[128];

                /*
                 * getname() stores the address length in lv.  A caller
                 * buffer larger than that length is rejected rather than
                 * truncated, so the copy below never exceeds lv bytes.
                 */
                if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
                        return -ENOTCONN;
                if (lv < len)
                        return -EINVAL;
                if (copy_to_user(optval, address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                /* Fully delegated, including the @optlen write-back. */
                return security_socket_getpeersec_stream(sock, optval, optlen, len);

        case SO_MARK:
                v.val = sk->sk_mark;
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Readable only when the protocol provides set_peek_off. */
                if (!sock->ops->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = sk->sk_peek_off;
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                /* Fully delegated, including the @optlen write-back. */
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                /*
                 * sk_get_filter() writes to @optval itself; a negative
                 * result is returned as the error, anything else is
                 * stored to @optlen at 'lenout'.
                 */
                len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_BPF_EXTENSIONS:
                v.val = bpf_tell_extensions();
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = sk->sk_ll_usec;
                break;
#endif

        case SO_MAX_PACING_RATE:
                v.val = sk->sk_max_pacing_rate;
                break;

        case SO_INCOMING_CPU:
                v.val = sk->sk_incoming_cpu;
                break;

        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
                 */
                return -ENOPROTOOPT;
        }

        /* Common tail: never copy more than the staged value holds. */
        if (len > lv)
                len = lv;
        if (copy_to_user(optval, &v, len))
                return -EFAULT;
lenout:
        /* Tell the caller how many bytes were stored. */
        if (put_user(len, optlen))
                return -EFAULT;
        return 0;
}
1265
1266 /*
1267  * Initialize an sk_lock.
1268  *
1269  * (We also register the sk_lock with the lock validator.)
1270  */
1271 static inline void sock_lock_init(struct sock *sk)
1272 {
1273         sock_lock_init_class_and_name(sk,
1274                         af_family_slock_key_strings[sk->sk_family],
1275                         af_family_slock_keys + sk->sk_family,
1276                         af_family_key_strings[sk->sk_family],
1277                         af_family_keys + sk->sk_family);
1278 }
1279
1280 /*
1281  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1282  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1283  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1284  */
1285 static void sock_copy(struct sock *nsk, const struct sock *osk)
1286 {
1287 #ifdef CONFIG_SECURITY_NETWORK
1288         void *sptr = nsk->sk_security;
1289 #endif
1290         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1291
1292         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1293                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1294
1295 #ifdef CONFIG_SECURITY_NETWORK
1296         nsk->sk_security = sptr;
1297         security_sk_clone(osk, nsk);
1298 #endif
1299 }
1300
1301 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1302 {
1303         unsigned long nulls1, nulls2;
1304
1305         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1306         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1307         if (nulls1 > nulls2)
1308                 swap(nulls1, nulls2);
1309
1310         if (nulls1 != 0)
1311                 memset((char *)sk, 0, nulls1);
1312         memset((char *)sk + nulls1 + sizeof(void *), 0,
1313                nulls2 - nulls1 - sizeof(void *));
1314         memset((char *)sk + nulls2 + sizeof(void *), 0,
1315                size - nulls2 - sizeof(void *));
1316 }
1317 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1318
1319 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1320                 int family)
1321 {
1322         struct sock *sk;
1323         struct kmem_cache *slab;
1324
1325         slab = prot->slab;
1326         if (slab != NULL) {
1327                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1328                 if (!sk)
1329                         return sk;
1330                 if (priority & __GFP_ZERO) {
1331                         if (prot->clear_sk)
1332                                 prot->clear_sk(sk, prot->obj_size);
1333                         else
1334                                 sk_prot_clear_nulls(sk, prot->obj_size);
1335                 }
1336         } else
1337                 sk = kmalloc(prot->obj_size, priority);
1338
1339         if (sk != NULL) {
1340                 kmemcheck_annotate_bitfield(sk, flags);
1341
1342                 if (security_sk_alloc(sk, family, priority))
1343                         goto out_free;
1344
1345                 if (!try_module_get(prot->owner))
1346                         goto out_free_sec;
1347                 sk_tx_queue_clear(sk);
1348         }
1349
1350         return sk;
1351
1352 out_free_sec:
1353         security_sk_free(sk);
1354 out_free:
1355         if (slab != NULL)
1356                 kmem_cache_free(slab, sk);
1357         else
1358                 kfree(sk);
1359         return NULL;
1360 }
1361
1362 static void sk_prot_free(struct proto *prot, struct sock *sk)
1363 {
1364         struct kmem_cache *slab;
1365         struct module *owner;
1366
1367         owner = prot->owner;
1368         slab = prot->slab;
1369
1370         security_sk_free(sk);
1371         if (slab != NULL)
1372                 kmem_cache_free(slab, sk);
1373         else
1374                 kfree(sk);
1375         module_put(owner);
1376 }
1377
1378 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1379 void sock_update_netprioidx(struct sock *sk)
1380 {
1381         if (in_interrupt())
1382                 return;
1383
1384         sk->sk_cgrp_prioidx = task_netprioidx(current);
1385 }
1386 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1387 #endif
1388
1389 /**
1390  *      sk_alloc - All socket objects are allocated here
1391  *      @net: the applicable net namespace
1392  *      @family: protocol family
1393  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1394  *      @prot: struct proto associated with this new sock instance
1395  */
1396 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1397                       struct proto *prot)
1398 {
1399         struct sock *sk;
1400
1401         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1402         if (sk) {
1403                 sk->sk_family = family;
1404                 /*
1405                  * See comment in struct sock definition to understand
1406                  * why we need sk_prot_creator -acme
1407                  */
1408                 sk->sk_prot = sk->sk_prot_creator = prot;
1409                 sock_lock_init(sk);
1410                 sock_net_set(sk, get_net(net));
1411                 atomic_set(&sk->sk_wmem_alloc, 1);
1412
1413                 sock_update_classid(sk);
1414                 sock_update_netprioidx(sk);
1415         }
1416
1417         return sk;
1418 }
1419 EXPORT_SYMBOL(sk_alloc);
1420
1421 static void __sk_free(struct sock *sk)
1422 {
1423         struct sk_filter *filter;
1424
1425         if (sk->sk_destruct)
1426                 sk->sk_destruct(sk);
1427
1428         filter = rcu_dereference_check(sk->sk_filter,
1429                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1430         if (filter) {
1431                 sk_filter_uncharge(sk, filter);
1432                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1433         }
1434
1435         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1436
1437         if (atomic_read(&sk->sk_omem_alloc))
1438                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1439                          __func__, atomic_read(&sk->sk_omem_alloc));
1440
1441         if (sk->sk_peer_cred)
1442                 put_cred(sk->sk_peer_cred);
1443         put_pid(sk->sk_peer_pid);
1444         put_net(sock_net(sk));
1445         sk_prot_free(sk->sk_prot_creator, sk);
1446 }
1447
1448 void sk_free(struct sock *sk)
1449 {
1450         /*
1451          * We subtract one from sk_wmem_alloc and can know if
1452          * some packets are still in some tx queue.
1453          * If not null, sock_wfree() will call __sk_free(sk) later
1454          */
1455         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1456                 __sk_free(sk);
1457 }
1458 EXPORT_SYMBOL(sk_free);
1459
1460 /*
1461  * Last sock_put should drop reference to sk->sk_net. It has already
1462  * been dropped in sk_change_net. Taking reference to stopping namespace
1463  * is not an option.
1464  * Take reference to a socket to remove it from hash _alive_ and after that
1465  * destroy it in the context of init_net.
1466  */
1467 void sk_release_kernel(struct sock *sk)
1468 {
1469         if (sk == NULL || sk->sk_socket == NULL)
1470                 return;
1471
1472         sock_hold(sk);
1473         sock_release(sk->sk_socket);
1474         sock_net_set(sk, get_net(&init_net));
1475         sock_put(sk);
1476 }
1477 EXPORT_SYMBOL(sk_release_kernel);
1478
1479 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1480 {
1481         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1482                 sock_update_memcg(newsk);
1483 }
1484
/**
 *      sk_clone_lock - clone a socket, and lock its clone
 *      @sk: the socket to clone
 *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *      Returns the clone with bh_lock_sock() held and sk_refcnt set to 2,
 *      or NULL when the allocation, the filter charge or
 *      xfrm_sk_clone_policy() fails.
 *
 *      NOTE(review): when a failure happens inside this function the clone
 *      is unlocked and freed here before NULL is returned, so the sentence
 *      above presumably refers to error paths in the caller after a
 *      successful return - confirm against callers.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct sock *newsk;
        /* Stays true when the parent has no filter to charge. */
        bool is_charged = true;

        newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
        if (newsk != NULL) {
                struct sk_filter *filter;

                /* Start from a raw copy of the parent, then reset per-socket state. */
                sock_copy(newsk, sk);

                /* SANITY */
                get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                newsk->sk_backlog.len = 0;

                atomic_set(&newsk->sk_rmem_alloc, 0);
                /*
                 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
                 */
                atomic_set(&newsk->sk_wmem_alloc, 1);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);

                spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
                                af_family_clock_key_strings[newsk->sk_family]);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                /* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                /* The filter pointer was copied from the parent by sock_copy(). */
                filter = rcu_dereference_protected(newsk->sk_filter, 1);
                if (filter != NULL)
                        /* though it's an empty new sock, the charging may fail
                         * if sysctl_optmem_max was changed between creation of
                         * original socket and cloning
                         */
                        is_charged = sk_filter_charge(newsk, filter);

                if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        bh_unlock_sock(newsk);
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                newsk->sk_incoming_cpu = raw_smp_processor_id();
                atomic64_set(&newsk->sk_cookie, 0);
                /*
                 * Before updating sk_refcnt, we must commit prior changes to memory
                 * (Documentation/RCU/rculist_nulls.txt for details)
                 */
                smp_wmb();
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                /* The clone is not attached to any struct socket or wait queue yet. */
                sk_set_socket(newsk, NULL);
                newsk->sk_wq = NULL;

                sk_update_clone(sk, newsk);

                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);

                /* Timestamp flags were copied from the parent; enable timestamping to match. */
                if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
1591
1592 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1593 {
1594         __sk_dst_set(sk, dst);
1595         sk->sk_route_caps = dst->dev->features;
1596         if (sk->sk_route_caps & NETIF_F_GSO)
1597                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1598         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1599         if (sk_can_gso(sk)) {
1600                 if (dst->header_len) {
1601                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1602                 } else {
1603                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1604                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1605                         sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1606                 }
1607         }
1608 }
1609 EXPORT_SYMBOL_GPL(sk_setup_caps);
1610
1611 /*
1612  *      Simple resource managers for sockets.
1613  */
1614
1615
1616 /*
1617  * Write buffer destructor automatically called from kfree_skb.
1618  */
1619 void sock_wfree(struct sk_buff *skb)
1620 {
1621         struct sock *sk = skb->sk;
1622         unsigned int len = skb->truesize;
1623
1624         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1625                 /*
1626                  * Keep a reference on sk_wmem_alloc, this will be released
1627                  * after sk_write_space() call
1628                  */
1629                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1630                 sk->sk_write_space(sk);
1631                 len = 1;
1632         }
1633         /*
1634          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1635          * could not do because of in-flight packets
1636          */
1637         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1638                 __sk_free(sk);
1639 }
1640 EXPORT_SYMBOL(sock_wfree);
1641
1642 void skb_orphan_partial(struct sk_buff *skb)
1643 {
1644         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1645          * so we do not completely orphan skb, but transfert all
1646          * accounted bytes but one, to avoid unexpected reorders.
1647          */
1648         if (skb->destructor == sock_wfree
1649 #ifdef CONFIG_INET
1650             || skb->destructor == tcp_wfree
1651 #endif
1652                 ) {
1653                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1654                 skb->truesize = 1;
1655         } else {
1656                 skb_orphan(skb);
1657         }
1658 }
1659 EXPORT_SYMBOL(skb_orphan_partial);
1660
1661 /*
1662  * Read buffer destructor automatically called from kfree_skb.
1663  */
1664 void sock_rfree(struct sk_buff *skb)
1665 {
1666         struct sock *sk = skb->sk;
1667         unsigned int len = skb->truesize;
1668
1669         atomic_sub(len, &sk->sk_rmem_alloc);
1670         sk_mem_uncharge(sk, len);
1671 }
1672 EXPORT_SYMBOL(sock_rfree);
1673
1674 /*
1675  * Buffer destructor for skbs that are not used directly in read or write
1676  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1677  */
1678 void sock_efree(struct sk_buff *skb)
1679 {
1680         sock_put(skb->sk);
1681 }
1682 EXPORT_SYMBOL(sock_efree);
1683
1684 kuid_t sock_i_uid(struct sock *sk)
1685 {
1686         kuid_t uid;
1687
1688         read_lock_bh(&sk->sk_callback_lock);
1689         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1690         read_unlock_bh(&sk->sk_callback_lock);
1691         return uid;
1692 }
1693 EXPORT_SYMBOL(sock_i_uid);
1694
1695 unsigned long sock_i_ino(struct sock *sk)
1696 {
1697         unsigned long ino;
1698
1699         read_lock_bh(&sk->sk_callback_lock);
1700         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1701         read_unlock_bh(&sk->sk_callback_lock);
1702         return ino;
1703 }
1704 EXPORT_SYMBOL(sock_i_ino);
1705
1706 /*
1707  * Allocate a skb from the socket's send buffer.
1708  */
1709 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1710                              gfp_t priority)
1711 {
1712         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1713                 struct sk_buff *skb = alloc_skb(size, priority);
1714                 if (skb) {
1715                         skb_set_owner_w(skb, sk);
1716                         return skb;
1717                 }
1718         }
1719         return NULL;
1720 }
1721 EXPORT_SYMBOL(sock_wmalloc);
1722
1723 /*
1724  * Allocate a memory block from the socket's option memory buffer.
1725  */
1726 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1727 {
1728         if ((unsigned int)size <= sysctl_optmem_max &&
1729             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1730                 void *mem;
1731                 /* First do the add, to avoid the race if kmalloc
1732                  * might sleep.
1733                  */
1734                 atomic_add(size, &sk->sk_omem_alloc);
1735                 mem = kmalloc(size, priority);
1736                 if (mem)
1737                         return mem;
1738                 atomic_sub(size, &sk->sk_omem_alloc);
1739         }
1740         return NULL;
1741 }
1742 EXPORT_SYMBOL(sock_kmalloc);
1743
1744 /* Free an option memory block. Note, we actually want the inline
1745  * here as this allows gcc to detect the nullify and fold away the
1746  * condition entirely.
1747  */
1748 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1749                                   const bool nullify)
1750 {
1751         if (WARN_ON_ONCE(!mem))
1752                 return;
1753         if (nullify)
1754                 kzfree(mem);
1755         else
1756                 kfree(mem);
1757         atomic_sub(size, &sk->sk_omem_alloc);
1758 }
1759
/*
 * sock_kfree_s - free an option memory block.
 * @sk: socket whose sk_omem_alloc is reduced
 * @mem: block to free
 * @size: number of bytes to subtract from sk_omem_alloc
 *
 * Calls __sock_kfree_s() with nullify == false, i.e. the kfree() path.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
1765
/*
 * sock_kzfree_s - free an option memory block through kzfree().
 * @sk: socket whose sk_omem_alloc is reduced
 * @mem: block to free
 * @size: number of bytes to subtract from sk_omem_alloc
 *
 * Calls __sock_kfree_s() with nullify == true, i.e. the kzfree() path.
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
1771
1772 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1773    I think, these locks should be removed for datagram sockets.
1774  */
1775 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1776 {
1777         DEFINE_WAIT(wait);
1778
1779         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1780         for (;;) {
1781                 if (!timeo)
1782                         break;
1783                 if (signal_pending(current))
1784                         break;
1785                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1786                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1787                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1788                         break;
1789                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1790                         break;
1791                 if (sk->sk_err)
1792                         break;
1793                 timeo = schedule_timeout(timeo);
1794         }
1795         finish_wait(sk_sleep(sk), &wait);
1796         return timeo;
1797 }
1798
1799
1800 /*
1801  *      Generic send/receive buffer handlers
1802  */
1803
1804 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1805                                      unsigned long data_len, int noblock,
1806                                      int *errcode, int max_page_order)
1807 {
1808         struct sk_buff *skb;
1809         long timeo;
1810         int err;
1811
1812         timeo = sock_sndtimeo(sk, noblock);
1813         for (;;) {
1814                 err = sock_error(sk);
1815                 if (err != 0)
1816                         goto failure;
1817
1818                 err = -EPIPE;
1819                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1820                         goto failure;
1821
1822                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1823                         break;
1824
1825                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1826                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1827                 err = -EAGAIN;
1828                 if (!timeo)
1829                         goto failure;
1830                 if (signal_pending(current))
1831                         goto interrupted;
1832                 timeo = sock_wait_for_wmem(sk, timeo);
1833         }
1834         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1835                                    errcode, sk->sk_allocation);
1836         if (skb)
1837                 skb_set_owner_w(skb, sk);
1838         return skb;
1839
1840 interrupted:
1841         err = sock_intr_errno(timeo);
1842 failure:
1843         *errcode = err;
1844         return NULL;
1845 }
1846 EXPORT_SYMBOL(sock_alloc_send_pskb);
1847
/*
 * sock_alloc_send_skb - allocate a send skb with no paged data.
 * @sk: socket that will own the skb
 * @size: passed as header_len to sock_alloc_send_pskb()
 * @noblock: forwarded unchanged
 * @errcode: forwarded unchanged; receives the error code on failure
 *
 * Calls sock_alloc_send_pskb() with data_len and max_page_order both 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1854
/*
 * On 32bit arches, an skb frag is limited to 2^15.
 * SKB_FRAG_PAGE_ORDER is get_order() of that 32768-byte size.
 */
#define SKB_FRAG_PAGE_ORDER     get_order(32768)
1857
1858 /**
1859  * skb_page_frag_refill - check that a page_frag contains enough room
1860  * @sz: minimum size of the fragment we want to get
1861  * @pfrag: pointer to page_frag
1862  * @gfp: priority for memory allocation
1863  *
1864  * Note: While this allocator tries to use high order pages, there is
1865  * no guarantee that allocations succeed. Therefore, @sz MUST be
1866  * less or equal than PAGE_SIZE.
1867  */
1868 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1869 {
1870         if (pfrag->page) {
1871                 if (atomic_read(&pfrag->page->_count) == 1) {
1872                         pfrag->offset = 0;
1873                         return true;
1874                 }
1875                 if (pfrag->offset + sz <= pfrag->size)
1876                         return true;
1877                 put_page(pfrag->page);
1878         }
1879
1880         pfrag->offset = 0;
1881         if (SKB_FRAG_PAGE_ORDER) {
1882                 pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1883                                           __GFP_NOWARN | __GFP_NORETRY,
1884                                           SKB_FRAG_PAGE_ORDER);
1885                 if (likely(pfrag->page)) {
1886                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1887                         return true;
1888                 }
1889         }
1890         pfrag->page = alloc_page(gfp);
1891         if (likely(pfrag->page)) {
1892                 pfrag->size = PAGE_SIZE;
1893                 return true;
1894         }
1895         return false;
1896 }
1897 EXPORT_SYMBOL(skb_page_frag_refill);
1898
1899 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1900 {
1901         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1902                 return true;
1903
1904         sk_enter_memory_pressure(sk);
1905         sk_stream_moderate_sndbuf(sk);
1906         return false;
1907 }
1908 EXPORT_SYMBOL(sk_page_frag_refill);
1909
1910 static void __lock_sock(struct sock *sk)
1911         __releases(&sk->sk_lock.slock)
1912         __acquires(&sk->sk_lock.slock)
1913 {
1914         DEFINE_WAIT(wait);
1915
1916         for (;;) {
1917                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1918                                         TASK_UNINTERRUPTIBLE);
1919                 spin_unlock_bh(&sk->sk_lock.slock);
1920                 schedule();
1921                 spin_lock_bh(&sk->sk_lock.slock);
1922                 if (!sock_owned_by_user(sk))
1923                         break;
1924         }
1925         finish_wait(&sk->sk_lock.wq, &wait);
1926 }
1927
/*
 * Process every skb queued on the socket backlog.
 *
 * Entered and left with sk->sk_lock.slock held. The caller is expected
 * to have checked that the backlog is non-empty: the first skb is
 * dereferenced without a NULL test.
 */
static void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                /* Detach the whole list, then work on it with the lock dropped. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        prefetch(next);
                        WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb->next = NULL;
                        sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
                /* Anything queued while the lock was dropped gets another pass. */
        } while ((skb = sk->sk_backlog.head) != NULL);

        /*
         * Zeroing the length only here guarantees that we cannot loop
         * forever while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
1966
1967 /**
1968  * sk_wait_data - wait for data to arrive at sk_receive_queue
1969  * @sk:    sock to wait on
1970  * @timeo: for how long
1971  *
1972  * Now socket state including sk->sk_err is changed only under lock,
1973  * hence we may omit checks after joining wait queue.
1974  * We check receive queue before schedule() only as optimization;
1975  * it is very likely that release_sock() added new data.
1976  */
1977 int sk_wait_data(struct sock *sk, long *timeo)
1978 {
1979         int rc;
1980         DEFINE_WAIT(wait);
1981
1982         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1983         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1984         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1985         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1986         finish_wait(sk_sleep(sk), &wait);
1987         return rc;
1988 }
1989 EXPORT_SYMBOL(sk_wait_data);
1990
/**
 *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @kind: allocation type
 *
 *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *      rmem allocation. This function assumes that protocols which have
 *      memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *      The charge is applied up front and rolled back if the request is
 *      refused.
 *
 *      Returns 1 if the charge stands, 0 if it was refused and undone.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        struct proto *prot = sk->sk_prot;
        int amt = sk_mem_pages(size);
        long allocated;
        int parent_status = UNDER_LIMIT;

        /* Charge optimistically; undone at the bottom on refusal. */
        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

        allocated = sk_memory_allocated_add(sk, amt, &parent_status);

        /* Under limit. */
        if (parent_status == UNDER_LIMIT &&
                        allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. (we or our parents) */
        if ((parent_status > SOFT_LIMIT) ||
                        allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit (we or our parents) */
        if ((parent_status == OVER_LIMIT) ||
                        (allocated > sk_prot_mem_limits(sk, 2)))
                goto suppress_allocation;

        /* guarantee minimum buffer size under pressure */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
                        return 1;

        } else { /* SK_MEM_SEND */
                /* Stream sockets are judged on sk_wmem_queued, others on sk_wmem_alloc. */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
                                return 1;
                } else if (atomic_read(&sk->sk_wmem_alloc) <
                           prot->sysctl_wmem[0])
                                return 1;
        }

        if (sk_has_memory_pressure(sk)) {
                int alloc;

                if (!sk_under_memory_pressure(sk))
                        return 1;
                /*
                 * Accept if the hard limit exceeds this socket's page usage
                 * multiplied by the number of allocated sockets.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
                        return 1;
        }

        trace_sock_exceed_buf_limit(sk, prot, allocated);

        /* Alas. Undo changes. */
        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

        sk_memory_allocated_sub(sk, amt);

        return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
2078
2079 /**
2080  *      __sk_reclaim - reclaim memory_allocated
2081  *      @sk: socket
2082  */
2083 void __sk_mem_reclaim(struct sock *sk)
2084 {
2085         sk_memory_allocated_sub(sk,
2086                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2087         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2088
2089         if (sk_under_memory_pressure(sk) &&
2090             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2091                 sk_leave_memory_pressure(sk);
2092 }
2093 EXPORT_SYMBOL(__sk_mem_reclaim);
2094
2095
2096 /*
2097  * Set of default routines for initialising struct proto_ops when
2098  * the protocol does not support a particular function. In certain
2099  * cases where it makes no sense for a protocol to have a "do nothing"
2100  * function, some default processing is provided.
2101  */
2102
/* Default bind handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
2108
/* Default connect handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
2115
/* Default socketpair handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
2121
/* Default accept handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
2127
/* Default getname handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
2134
/* Default poll handler: ignores its arguments and returns 0 (no event bits set). */
unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
        return 0;
}
EXPORT_SYMBOL(sock_no_poll);
2140
/* Default ioctl handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
2146
/* Default listen handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2152
/* Default shutdown handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2158
/* Default setsockopt handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, unsigned int optlen)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);
2165
/* Default getsockopt handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);
2172
/* Default sendmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2178
/* Default recvmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2185
/*
 * Default mmap handler: ignores its arguments and returns -ENODEV
 * rather than -EOPNOTSUPP like the other default handlers.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2192
2193 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2194 {
2195         ssize_t res;
2196         struct msghdr msg = {.msg_flags = flags};
2197         struct kvec iov;
2198         char *kaddr = kmap(page);
2199         iov.iov_base = kaddr + offset;
2200         iov.iov_len = size;
2201         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2202         kunmap(page);
2203         return res;
2204 }
2205 EXPORT_SYMBOL(sock_no_sendpage);
2206
2207 /*
2208  *      Default Socket Callbacks
2209  */
2210
2211 static void sock_def_wakeup(struct sock *sk)
2212 {
2213         struct socket_wq *wq;
2214
2215         rcu_read_lock();
2216         wq = rcu_dereference(sk->sk_wq);
2217         if (wq_has_sleeper(wq))
2218                 wake_up_interruptible_all(&wq->wait);
2219         rcu_read_unlock();
2220 }
2221
2222 static void sock_def_error_report(struct sock *sk)
2223 {
2224         struct socket_wq *wq;
2225
2226         rcu_read_lock();
2227         wq = rcu_dereference(sk->sk_wq);
2228         if (wq_has_sleeper(wq))
2229                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2230         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2231         rcu_read_unlock();
2232 }
2233
2234 static void sock_def_readable(struct sock *sk)
2235 {
2236         struct socket_wq *wq;
2237
2238         rcu_read_lock();
2239         wq = rcu_dereference(sk->sk_wq);
2240         if (wq_has_sleeper(wq))
2241                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2242                                                 POLLRDNORM | POLLRDBAND);
2243         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2244         rcu_read_unlock();
2245 }
2246
2247 static void sock_def_write_space(struct sock *sk)
2248 {
2249         struct socket_wq *wq;
2250
2251         rcu_read_lock();
2252
2253         /* Do not wake up a writer until he can make "significant"
2254          * progress.  --DaveM
2255          */
2256         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2257                 wq = rcu_dereference(sk->sk_wq);
2258                 if (wq_has_sleeper(wq))
2259                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2260                                                 POLLWRNORM | POLLWRBAND);
2261
2262                 /* Should agree with poll, otherwise some programs break */
2263                 if (sock_writeable(sk))
2264                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2265         }
2266
2267         rcu_read_unlock();
2268 }
2269
/* Default sk_destruct callback: frees sk->sk_protinfo with kfree(). */
static void sock_def_destruct(struct sock *sk)
{
        kfree(sk->sk_protinfo);
}
2274
2275 void sk_send_sigurg(struct sock *sk)
2276 {
2277         if (sk->sk_socket && sk->sk_socket->file)
2278                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2279                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2280 }
2281 EXPORT_SYMBOL(sk_send_sigurg);
2282
/*
 * sk_reset_timer - (re)arm a socket timer.
 * @sk: socket the timer belongs to
 * @timer: timer to modify
 * @expires: new expiry passed to mod_timer()
 *
 * A socket reference is taken only when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2290
/*
 * sk_stop_timer - cancel a socket timer.
 * @sk: socket the timer belongs to
 * @timer: timer to delete
 *
 * A socket reference is dropped with __sock_put() only when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2297
/*
 * sock_init_data - initialise the generic fields of a struct sock.
 * @sock: socket to attach @sk to, or NULL
 * @sk: sock to initialise
 *
 * Sets up the queues, timer, buffer sizes, locks, default callbacks and
 * timeouts, and finally sets sk_refcnt to 1. When @sock is non-NULL the
 * two objects are linked (type, wait queue, sock->sk).
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        sk->sk_send_head        =       NULL;

        init_timer(&sk->sk_timer);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* Link sk to sock when one is supplied; otherwise leave no wait queue. */
        if (sock) {
                sk->sk_type     =       sock->type;
                sk->sk_wq       =       sock->wq;
                sock->sk        =       sk;
        } else
                sk->sk_wq       =       NULL;

        spin_lock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
        /* Lockdep class and name of the callback lock are picked by sk_family. */
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        /* -1 seconds is the value sock_get_timestamp() reports as -ENOENT. */
        sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       sysctl_net_busy_read;
#endif

        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
         */
        smp_wmb();
        atomic_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
2364
/*
 * lock_sock_nested - take ownership of a socket, sleeping if needed.
 * @sk: socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * If the socket is already owned, waits in __lock_sock() until it is
 * released, then marks it owned. May sleep.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_lock.owned)
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Plain spin_unlock(): BH stays disabled until local_bh_enable() below. */
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
        local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
2380
/*
 * release_sock - give up ownership of a socket.
 * @sk: socket to release
 *
 * Under sk_lock.slock: processes the backlog if one is queued, runs the
 * protocol's release_cb if it has one, clears ownership and wakes any
 * task waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        /*
         * The sk_lock has mutex_unlock() semantics:
         */
        mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means packets were queued on the backlog. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        /* Warning : release_cb() might need to release sk ownership,
         * ie call sock_release_ownership(sk) before us.
         */
        if (sk->sk_prot->release_cb)
                sk->sk_prot->release_cb(sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
2404
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small section, where process wont block
 * return false if fast path is taken
 *   sk_lock.slock locked, owned = 0, BH disabled
 * return true if slow path is taken
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        /* Fast path: nobody owns the socket, return still holding the spinlock. */
        if (!sk->sk_lock.owned)
                /*
                 * Note : We must disable BH
                 */
                return false;

        /* Slow path: wait for the owner, then take ownership like lock_sock(). */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock(&sk->sk_lock.slock);
        /*
         * The sk_lock has mutex_lock() semantics here:
         */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
        local_bh_enable();
        return true;
}
EXPORT_SYMBOL(lock_sock_fast);
2437
2438 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2439 {
2440         struct timeval tv;
2441         if (!sock_flag(sk, SOCK_TIMESTAMP))
2442                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2443         tv = ktime_to_timeval(sk->sk_stamp);
2444         if (tv.tv_sec == -1)
2445                 return -ENOENT;
2446         if (tv.tv_sec == 0) {
2447                 sk->sk_stamp = ktime_get_real();
2448                 tv = ktime_to_timeval(sk->sk_stamp);
2449         }
2450         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2451 }
2452 EXPORT_SYMBOL(sock_get_timestamp);
2453
2454 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2455 {
2456         struct timespec ts;
2457         if (!sock_flag(sk, SOCK_TIMESTAMP))
2458                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2459         ts = ktime_to_timespec(sk->sk_stamp);
2460         if (ts.tv_sec == -1)
2461                 return -ENOENT;
2462         if (ts.tv_sec == 0) {
2463                 sk->sk_stamp = ktime_get_real();
2464                 ts = ktime_to_timespec(sk->sk_stamp);
2465         }
2466         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2467 }
2468 EXPORT_SYMBOL(sock_get_timestampns);
2469
2470 void sock_enable_timestamp(struct sock *sk, int flag)
2471 {
2472         if (!sock_flag(sk, flag)) {
2473                 unsigned long previous_flags = sk->sk_flags;
2474
2475                 sock_set_flag(sk, flag);
2476                 /*
2477                  * we just set one of the two flags which require net
2478                  * time stamping, but time stamping might have been on
2479                  * already because of the other one
2480                  */
2481                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2482                         net_enable_timestamp();
2483         }
2484 }
2485
2486 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2487                        int level, int type)
2488 {
2489         struct sock_exterr_skb *serr;
2490         struct sk_buff *skb;
2491         int copied, err;
2492
2493         err = -EAGAIN;
2494         skb = sock_dequeue_err_skb(sk);
2495         if (skb == NULL)
2496                 goto out;
2497
2498         copied = skb->len;
2499         if (copied > len) {
2500                 msg->msg_flags |= MSG_TRUNC;
2501                 copied = len;
2502         }
2503         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2504         if (err)
2505                 goto out_free_skb;
2506
2507         sock_recv_timestamp(msg, sk, skb);
2508
2509         serr = SKB_EXT_ERR(skb);
2510         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2511
2512         msg->msg_flags |= MSG_ERRQUEUE;
2513         err = copied;
2514
2515 out_free_skb:
2516         kfree_skb(skb);
2517 out:
2518         return err;
2519 }
2520 EXPORT_SYMBOL(sock_recv_errqueue);
2521
/*
 *      Get a socket option on a socket.
 *
 *      FIX: POSIX 1003.1g is very ambiguous here. It states that
 *      asynchronous errors should be reported by getsockopt. We assume
 *      this means if you specify SO_ERROR (otherwise what's the point of it).
 *
 *      All arguments are forwarded unchanged to the getsockopt method of
 *      the sock's protocol (sk->sk_prot), whose result is returned.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
2537
#ifdef CONFIG_COMPAT
/*
 * Compat variant of sock_common_getsockopt(): uses the protocol's
 * compat_getsockopt method when it has one, and the regular getsockopt
 * method otherwise.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        if (sk->sk_prot->compat_getsockopt == NULL)
                return sk->sk_prot->getsockopt(sk, level, optname,
                                               optval, optlen);

        return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                              optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2551
2552 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2553                         int flags)
2554 {
2555         struct sock *sk = sock->sk;
2556         int addr_len = 0;
2557         int err;
2558
2559         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2560                                    flags & ~MSG_DONTWAIT, &addr_len);
2561         if (err >= 0)
2562                 msg->msg_namelen = addr_len;
2563         return err;
2564 }
2565 EXPORT_SYMBOL(sock_common_recvmsg);
2566
/*
 *      Set socket options on an inet socket.
 *
 *      All arguments are forwarded unchanged to the setsockopt method of
 *      the sock's protocol (sk->sk_prot), whose result is returned.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
2578
2579 #ifdef CONFIG_COMPAT
2580 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2581                                   char __user *optval, unsigned int optlen)
2582 {
2583         struct sock *sk = sock->sk;
2584
2585         if (sk->sk_prot->compat_setsockopt != NULL)
2586                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2587                                                       optval, optlen);
2588         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2589 }
2590 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2591 #endif
2592
2593 void sk_common_release(struct sock *sk)
2594 {
2595         if (sk->sk_prot->destroy)
2596                 sk->sk_prot->destroy(sk);
2597
2598         /*
2599          * Observation: when sock_common_release is called, processes have
2600          * no access to socket. But net still has.
2601          * Step one, detach it from networking:
2602          *
2603          * A. Remove from hash tables.
2604          */
2605
2606         sk->sk_prot->unhash(sk);
2607
2608         /*
2609          * In this point socket cannot receive new packets, but it is possible
2610          * that some packets are in flight because some CPU runs receiver and
2611          * did hash table lookup before we unhashed socket. They will achieve
2612          * receive queue and will be purged by socket destructor.
2613          *
2614          * Also we still have packets pending on receive queue and probably,
2615          * our own packets waiting in device queues. sock_destroy will drain
2616          * receive queue, but transmitted packets will delay socket destruction
2617          * until the last reference will be released.
2618          */
2619
2620         sock_orphan(sk);
2621
2622         xfrm_sk_free_policy(sk);
2623
2624         sk_refcnt_debug_release(sk);
2625
2626         if (sk->sk_frag.page) {
2627                 put_page(sk->sk_frag.page);
2628                 sk->sk_frag.page = NULL;
2629         }
2630
2631         sock_put(sk);
2632 }
2633 EXPORT_SYMBOL(sk_common_release);
2634
2635 #ifdef CONFIG_PROC_FS
/*
 * Number of slots in the per-protocol "in use" counter array. Each
 * registered protocol is given a slot index (prot->inuse_idx) by
 * assign_proto_idx().
 */
2636 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One block of counters; val[] is indexed by a protocol's inuse_idx. */
2637 struct prot_inuse {
2638         int val[PROTO_INUSE_NR];
2639 };
2640
/*
 * Bitmap of counter slots currently handed out: assign_proto_idx() sets the
 * bit for the index it assigns and release_proto_idx() clears it again.
 */
2641 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2642
2643 #ifdef CONFIG_NET_NS
2644 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2645 {
2646         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2647 }
2648 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2649
2650 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2651 {
2652         int cpu, idx = prot->inuse_idx;
2653         int res = 0;
2654
2655         for_each_possible_cpu(cpu)
2656                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2657
2658         return res >= 0 ? res : 0;
2659 }
2660 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2661
2662 static int __net_init sock_inuse_init_net(struct net *net)
2663 {
2664         net->core.inuse = alloc_percpu(struct prot_inuse);
2665         return net->core.inuse ? 0 : -ENOMEM;
2666 }
2667
2668 static void __net_exit sock_inuse_exit_net(struct net *net)
2669 {
2670         free_percpu(net->core.inuse);
2671 }
2672
/*
 * Per-network-namespace hooks for the in-use counters: .init allocates the
 * per-cpu counter block for a namespace and .exit frees it.
 */
2673 static struct pernet_operations net_inuse_ops = {
2674         .init = sock_inuse_init_net,
2675         .exit = sock_inuse_exit_net,
2676 };
2677
2678 static __init int net_inuse_init(void)
2679 {
2680         if (register_pernet_subsys(&net_inuse_ops))
2681                 panic("Cannot initialize net inuse counters");
2682
2683         return 0;
2684 }
2685
2686 core_initcall(net_inuse_init);
2687 #else
/*
 * Without CONFIG_NET_NS there is a single, statically defined per-cpu
 * counter block instead of one allocated per struct net.
 */
2688 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2689
2690 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2691 {
2692         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2693 }
2694 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2695
2696 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2697 {
2698         int cpu, idx = prot->inuse_idx;
2699         int res = 0;
2700
2701         for_each_possible_cpu(cpu)
2702                 res += per_cpu(prot_inuse, cpu).val[idx];
2703
2704         return res >= 0 ? res : 0;
2705 }
2706 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2707 #endif
2708
2709 static void assign_proto_idx(struct proto *prot)
2710 {
2711         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2712
2713         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2714                 pr_err("PROTO_INUSE_NR exhausted\n");
2715                 return;
2716         }
2717
2718         set_bit(prot->inuse_idx, proto_inuse_idx);
2719 }
2720
2721 static void release_proto_idx(struct proto *prot)
2722 {
2723         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2724                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2725 }
2726 #else
/* No-op stand-in used when CONFIG_PROC_FS is not set. */
2727 static inline void assign_proto_idx(struct proto *prot)
2728 {
2729 }
2730
/* No-op stand-in used when CONFIG_PROC_FS is not set. */
2731 static inline void release_proto_idx(struct proto *prot)
2732 {
2733 }
2734 #endif
2735
2736 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2737 {
2738         if (!rsk_prot)
2739                 return;
2740         kfree(rsk_prot->slab_name);
2741         rsk_prot->slab_name = NULL;
2742         if (rsk_prot->slab) {
2743                 kmem_cache_destroy(rsk_prot->slab);
2744                 rsk_prot->slab = NULL;
2745         }
2746 }
2747
2748 static int req_prot_init(const struct proto *prot)
2749 {
2750         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2751
2752         if (!rsk_prot)
2753                 return 0;
2754
2755         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2756                                         prot->name);
2757         if (!rsk_prot->slab_name)
2758                 return -ENOMEM;
2759
2760         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2761                                            rsk_prot->obj_size, 0,
2762                                            0, NULL);
2763
2764         if (!rsk_prot->slab) {
2765                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2766                         prot->name);
2767                 return -ENOMEM;
2768         }
2769         return 0;
2770 }
2771
2772 int proto_register(struct proto *prot, int alloc_slab)
2773 {
2774         if (alloc_slab) {
2775                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2776                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2777                                         NULL);
2778
2779                 if (prot->slab == NULL) {
2780                         pr_crit("%s: Can't create sock SLAB cache!\n",
2781                                 prot->name);
2782                         goto out;
2783                 }
2784
2785                 if (req_prot_init(prot))
2786                         goto out_free_request_sock_slab;
2787
2788                 if (prot->twsk_prot != NULL) {
2789                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2790
2791                         if (prot->twsk_prot->twsk_slab_name == NULL)
2792                                 goto out_free_request_sock_slab;
2793
2794                         prot->twsk_prot->twsk_slab =
2795                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2796                                                   prot->twsk_prot->twsk_obj_size,
2797                                                   0,
2798                                                   prot->slab_flags,
2799                                                   NULL);
2800                         if (prot->twsk_prot->twsk_slab == NULL)
2801                                 goto out_free_timewait_sock_slab_name;
2802                 }
2803         }
2804
2805         mutex_lock(&proto_list_mutex);
2806         list_add(&prot->node, &proto_list);
2807         assign_proto_idx(prot);
2808         mutex_unlock(&proto_list_mutex);
2809         return 0;
2810
2811 out_free_timewait_sock_slab_name:
2812         kfree(prot->twsk_prot->twsk_slab_name);
2813 out_free_request_sock_slab:
2814         req_prot_cleanup(prot->rsk_prot);
2815
2816         kmem_cache_destroy(prot->slab);
2817         prot->slab = NULL;
2818 out:
2819         return -ENOBUFS;
2820 }
2821 EXPORT_SYMBOL(proto_register);
2822
2823 void proto_unregister(struct proto *prot)
2824 {
2825         mutex_lock(&proto_list_mutex);
2826         release_proto_idx(prot);
2827         list_del(&prot->node);
2828         mutex_unlock(&proto_list_mutex);
2829
2830         if (prot->slab != NULL) {
2831                 kmem_cache_destroy(prot->slab);
2832                 prot->slab = NULL;
2833         }
2834
2835         req_prot_cleanup(prot->rsk_prot);
2836
2837         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2838                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2839                 kfree(prot->twsk_prot->twsk_slab_name);
2840                 prot->twsk_prot->twsk_slab = NULL;
2841         }
2842 }
2843 EXPORT_SYMBOL(proto_unregister);
2844
2845 #ifdef CONFIG_PROC_FS
2846 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2847         __acquires(proto_list_mutex)
2848 {
2849         mutex_lock(&proto_list_mutex);
2850         return seq_list_start_head(&proto_list, *pos);
2851 }
2852
2853 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2854 {
2855         return seq_list_next(v, &proto_list, pos);
2856 }
2857
/* seq_file .stop callback: drop the mutex taken in proto_seq_start(). */
2858 static void proto_seq_stop(struct seq_file *seq, void *v)
2859         __releases(proto_list_mutex)
2860 {
2861         mutex_unlock(&proto_list_mutex);
2862 }
2863
/* Map a method pointer to 'y' when it is set and 'n' when it is NULL. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
2868 static long sock_prot_memory_allocated(struct proto *proto)
2869 {
2870         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2871 }
2872
2873 static char *sock_prot_memory_pressure(struct proto *proto)
2874 {
2875         return proto->memory_pressure != NULL ?
2876         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2877 }
2878
2879 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2880 {
2881
2882         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2883                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2884                    proto->name,
2885                    proto->obj_size,
2886                    sock_prot_inuse_get(seq_file_net(seq), proto),
2887                    sock_prot_memory_allocated(proto),
2888                    sock_prot_memory_pressure(proto),
2889                    proto->max_header,
2890                    proto->slab == NULL ? "no" : "yes",
2891                    module_name(proto->owner),
2892                    proto_method_implemented(proto->close),
2893                    proto_method_implemented(proto->connect),
2894                    proto_method_implemented(proto->disconnect),
2895                    proto_method_implemented(proto->accept),
2896                    proto_method_implemented(proto->ioctl),
2897                    proto_method_implemented(proto->init),
2898                    proto_method_implemented(proto->destroy),
2899                    proto_method_implemented(proto->shutdown),
2900                    proto_method_implemented(proto->setsockopt),
2901                    proto_method_implemented(proto->getsockopt),
2902                    proto_method_implemented(proto->sendmsg),
2903                    proto_method_implemented(proto->recvmsg),
2904                    proto_method_implemented(proto->sendpage),
2905                    proto_method_implemented(proto->bind),
2906                    proto_method_implemented(proto->backlog_rcv),
2907                    proto_method_implemented(proto->hash),
2908                    proto_method_implemented(proto->unhash),
2909                    proto_method_implemented(proto->get_port),
2910                    proto_method_implemented(proto->enter_memory_pressure));
2911 }
2912
2913 static int proto_seq_show(struct seq_file *seq, void *v)
2914 {
2915         if (v == &proto_list)
2916                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2917                            "protocol",
2918                            "size",
2919                            "sockets",
2920                            "memory",
2921                            "press",
2922                            "maxhdr",
2923                            "slab",
2924                            "module",
2925                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2926         else
2927                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2928         return 0;
2929 }
2930
/*
 * seq_file iterator over proto_list: .start takes proto_list_mutex, .stop
 * releases it, and .show prints either the header or one protocol line.
 */
2931 static const struct seq_operations proto_seq_ops = {
2932         .start  = proto_seq_start,
2933         .next   = proto_seq_next,
2934         .stop   = proto_seq_stop,
2935         .show   = proto_seq_show,
2936 };
2937
2938 static int proto_seq_open(struct inode *inode, struct file *file)
2939 {
2940         return seq_open_net(inode, file, &proto_seq_ops,
2941                             sizeof(struct seq_net_private));
2942 }
2943
/*
 * File operations for the "protocols" proc entry: opened through
 * proto_seq_open(), read and seeked through the generic seq_file helpers,
 * and released with seq_release_net().
 */
2944 static const struct file_operations proto_seq_fops = {
2945         .owner          = THIS_MODULE,
2946         .open           = proto_seq_open,
2947         .read           = seq_read,
2948         .llseek         = seq_lseek,
2949         .release        = seq_release_net,
2950 };
2951
2952 static __net_init int proto_init_net(struct net *net)
2953 {
2954         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2955                 return -ENOMEM;
2956
2957         return 0;
2958 }
2959
2960 static __net_exit void proto_exit_net(struct net *net)
2961 {
2962         remove_proc_entry("protocols", net->proc_net);
2963 }
2964
2965
/*
 * Per-network-namespace hooks for the "protocols" proc entry: .init creates
 * it for a namespace and .exit removes it again.
 */
2966 static __net_initdata struct pernet_operations proto_net_ops = {
2967         .init = proto_init_net,
2968         .exit = proto_exit_net,
2969 };
2970
2971 static int __init proto_init(void)
2972 {
2973         return register_pernet_subsys(&proto_net_ops);
2974 }
2975
2976 subsys_initcall(proto_init);
2977
2978 #endif /* PROC_FS */