net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <linux/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/udp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113 #include <linux/user_namespace.h>
 114 #include <linux/static_key.h>
 115 #include <linux/memcontrol.h>
 116 #include <linux/prefetch.h>
 117 #include <linux/compat.h>
 118 #include <linux/mroute.h>
 119 #include <linux/mroute6.h>
 120 #include <linux/icmpv6.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <linux/skbuff_ref.h>
 128 #include <net/net_namespace.h>
 129 #include <net/request_sock.h>
 130 #include <net/sock.h>
 131 #include <net/proto_memory.h>
 132 #include <linux/net_tstamp.h>
 133 #include <net/xfrm.h>
 134 #include <linux/ipsec.h>
 135 #include <net/cls_cgroup.h>
 136 #include <net/netprio_cgroup.h>
 137 #include <linux/sock_diag.h>
 138
 139 #include <linux/filter.h>
 140 #include <net/sock_reuseport.h>
 141 #include <net/bpf_sk_storage.h>
 142
 143 #include <trace/events/sock.h>
 144
 145 #include <net/tcp.h>
 146 #include <net/busy_poll.h>
 147 #include <net/phonet/phonet.h>
 148
 149 #include <linux/ethtool.h>
 150
 151 #include "dev.h"
 152
 153 static DEFINE_MUTEX(proto_list_mutex);
 154 static LIST_HEAD(proto_list);
 155
 156 static void sock_def_write_space_wfree(struct sock *sk);
 157 static void sock_def_write_space(struct sock *sk);
 158
 159 /**
 160  * sk_ns_capable - General socket capability test
 161  * @sk: Socket to use a capability on or through
 162  * @user_ns: The user namespace of the capability to use
 163  * @cap: The capability to use
 164  *
 165  * Test to see if the opener of the socket had when the socket was
 166  * created and the current process has the capability @cap in the user
 167  * namespace @user_ns.
 168  */
 169 bool sk_ns_capable(const struct sock *sk,
 170                    struct user_namespace *user_ns, int cap)
 171 {
 172         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 173                 ns_capable(user_ns, cap);
 174 }
 175 EXPORT_SYMBOL(sk_ns_capable);
 176
 177 /**
 178  * sk_capable - Socket global capability test
 179  * @sk: Socket to use a capability on or through
 180  * @cap: The global capability to use
 181  *
 182  * Test to see if the opener of the socket had when the socket was
 183  * created and the current process has the capability @cap in all user
 184  * namespaces.
 185  */
 186 bool sk_capable(const struct sock *sk, int cap)
 187 {
 188         return sk_ns_capable(sk, &init_user_ns, cap);
 189 }
 190 EXPORT_SYMBOL(sk_capable);
 191
 192 /**
 193  * sk_net_capable - Network namespace socket capability test
 194  * @sk: Socket to use a capability on or through
 195  * @cap: The capability to use
 196  *
 197  * Test to see if the opener of the socket had when the socket was created
 198  * and the current process has the capability @cap over the network namespace
 199  * the socket is a member of.
 200  */
 201 bool sk_net_capable(const struct sock *sk, int cap)
 202 {
 203         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 204 }
 205 EXPORT_SYMBOL(sk_net_capable);
 206
 207 /*
 208  * Each address family might have different locking rules, so we have
 209  * one slock key per address family and separate keys for internal and
 210  * userspace sockets.
 211  */
 212 static struct lock_class_key af_family_keys[AF_MAX];
 213 static struct lock_class_key af_family_kern_keys[AF_MAX];
 214 static struct lock_class_key af_family_slock_keys[AF_MAX];
 215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 216
 217 /*
 218  * Make lock validator output more readable. (we pre-construct these
 219  * strings build-time, so that runtime initialization of socket
 220  * locks is fast):
 221  */
 222
 223 #define _sock_locks(x)                                            \
 224   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 225   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 226   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 227   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 228   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 229   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 230   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 231   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 232   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 233   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 234   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 235   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 236   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 237   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 238   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 239   x "AF_MCTP"  , \
 240   x "AF_MAX"
 241
 242 static const char *const af_family_key_strings[AF_MAX+1] = {
 243         _sock_locks("sk_lock-")
 244 };
 245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 246         _sock_locks("slock-")
 247 };
 248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 249         _sock_locks("clock-")
 250 };
 251
 252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-sk_lock-")
 254 };
 255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-slock-")
 257 };
 258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 259         _sock_locks("k-clock-")
 260 };
 261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("rlock-")
 263 };
 264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 265         _sock_locks("wlock-")
 266 };
 267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 268         _sock_locks("elock-")
 269 };
 270
 271 /*
 272  * sk_callback_lock and sk queues locking rules are per-address-family,
 273  * so split the lock classes by using a per-AF key:
 274  */
 275 static struct lock_class_key af_callback_keys[AF_MAX];
 276 static struct lock_class_key af_rlock_keys[AF_MAX];
 277 static struct lock_class_key af_wlock_keys[AF_MAX];
 278 static struct lock_class_key af_elock_keys[AF_MAX];
 279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 280
 281 /* Run time adjustable parameters. */
 282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 283 EXPORT_SYMBOL(sysctl_wmem_max);
 284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 285 EXPORT_SYMBOL(sysctl_rmem_max);
 286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 288
 289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 291
 292 /**
 293  * sk_set_memalloc - sets %SOCK_MEMALLOC
 294  * @sk: socket to set it on
 295  *
 296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 297  * It's the responsibility of the admin to adjust min_free_kbytes
 298  * to meet the requirements
 299  */
 300 void sk_set_memalloc(struct sock *sk)
 301 {
 302         sock_set_flag(sk, SOCK_MEMALLOC);
 303         sk->sk_allocation |= __GFP_MEMALLOC;
 304         static_branch_inc(&memalloc_socks_key);
 305 }
 306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 307
 308 void sk_clear_memalloc(struct sock *sk)
 309 {
 310         sock_reset_flag(sk, SOCK_MEMALLOC);
 311         sk->sk_allocation &= ~__GFP_MEMALLOC;
 312         static_branch_dec(&memalloc_socks_key);
 313
 314         /*
 315          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 316          * progress of swapping. SOCK_MEMALLOC may be cleared while
 317          * it has rmem allocations due to the last swapfile being deactivated
 318          * but there is a risk that the socket is unusable due to exceeding
 319          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 320          */
 321         sk_mem_reclaim(sk);
 322 }
 323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 324
 325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 326 {
 327         int ret;
 328         unsigned int noreclaim_flag;
 329
 330         /* these should have been dropped before queueing */
 331         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 332
 333         noreclaim_flag = memalloc_noreclaim_save();
 334         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 335                                  tcp_v6_do_rcv,
 336                                  tcp_v4_do_rcv,
 337                                  sk, skb);
 338         memalloc_noreclaim_restore(noreclaim_flag);
 339
 340         return ret;
 341 }
 342 EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344 void sk_error_report(struct sock *sk)
 345 {
 346         sk->sk_error_report(sk);
 347
 348         switch (sk->sk_family) {
 349         case AF_INET:
 350                 fallthrough;
 351         case AF_INET6:
 352                 trace_inet_sk_error_report(sk);
 353                 break;
 354         default:
 355                 break;
 356         }
 357 }
 358 EXPORT_SYMBOL(sk_error_report);
 359
 360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 361 {
 362         struct __kernel_sock_timeval tv;
 363
 364         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 365                 tv.tv_sec = 0;
 366                 tv.tv_usec = 0;
 367         } else {
 368                 tv.tv_sec = timeo / HZ;
 369                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 370         }
 371
 372         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 373                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 374                 *(struct old_timeval32 *)optval = tv32;
 375                 return sizeof(tv32);
 376         }
 377
 378         if (old_timeval) {
 379                 struct __kernel_old_timeval old_tv;
 380                 old_tv.tv_sec = tv.tv_sec;
 381                 old_tv.tv_usec = tv.tv_usec;
 382                 *(struct __kernel_old_timeval *)optval = old_tv;
 383                 return sizeof(old_tv);
 384         }
 385
 386         *(struct __kernel_sock_timeval *)optval = tv;
 387         return sizeof(tv);
 388 }
 389 EXPORT_SYMBOL(sock_get_timeout);
 390
 391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 392                            sockptr_t optval, int optlen, bool old_timeval)
 393 {
 394         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 395                 struct old_timeval32 tv32;
 396
 397                 if (optlen < sizeof(tv32))
 398                         return -EINVAL;
 399
 400                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 401                         return -EFAULT;
 402                 tv->tv_sec = tv32.tv_sec;
 403                 tv->tv_usec = tv32.tv_usec;
 404         } else if (old_timeval) {
 405                 struct __kernel_old_timeval old_tv;
 406
 407                 if (optlen < sizeof(old_tv))
 408                         return -EINVAL;
 409                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 410                         return -EFAULT;
 411                 tv->tv_sec = old_tv.tv_sec;
 412                 tv->tv_usec = old_tv.tv_usec;
 413         } else {
 414                 if (optlen < sizeof(*tv))
 415                         return -EINVAL;
 416                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 417                         return -EFAULT;
 418         }
 419
 420         return 0;
 421 }
 422 EXPORT_SYMBOL(sock_copy_user_timeval);
 423
 424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 425                             bool old_timeval)
 426 {
 427         struct __kernel_sock_timeval tv;
 428         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 429         long val;
 430
 431         if (err)
 432                 return err;
 433
 434         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 435                 return -EDOM;
 436
 437         if (tv.tv_sec < 0) {
 438                 static int warned __read_mostly;
 439
 440                 WRITE_ONCE(*timeo_p, 0);
 441                 if (warned < 10 && net_ratelimit()) {
 442                         warned++;
 443                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 444                                 __func__, current->comm, task_pid_nr(current));
 445                 }
 446                 return 0;
 447         }
 448         val = MAX_SCHEDULE_TIMEOUT;
 449         if ((tv.tv_sec || tv.tv_usec) &&
 450             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 451                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 452                                                     USEC_PER_SEC / HZ);
 453         WRITE_ONCE(*timeo_p, val);
 454         return 0;
 455 }
 456
 457 static bool sk_set_prio_allowed(const struct sock *sk, int val)
 458 {
 459         return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
 460                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
 461                 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
 462 }
 463
 464 static bool sock_needs_netstamp(const struct sock *sk)
 465 {
 466         switch (sk->sk_family) {
 467         case AF_UNSPEC:
 468         case AF_UNIX:
 469                 return false;
 470         default:
 471                 return true;
 472         }
 473 }
 474
 475 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 476 {
 477         if (sk->sk_flags & flags) {
 478                 sk->sk_flags &= ~flags;
 479                 if (sock_needs_netstamp(sk) &&
 480                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 481                         net_disable_timestamp();
 482         }
 483 }
 484
 485
 486 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 487 {
 488         unsigned long flags;
 489         struct sk_buff_head *list = &sk->sk_receive_queue;
 490
 491         if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
 492                 atomic_inc(&sk->sk_drops);
 493                 trace_sock_rcvqueue_full(sk, skb);
 494                 return -ENOMEM;
 495         }
 496
 497         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 498                 atomic_inc(&sk->sk_drops);
 499                 return -ENOBUFS;
 500         }
 501
 502         skb->dev = NULL;
 503         skb_set_owner_r(skb, sk);
 504
 505         /* we escape from rcu protected region, make sure we dont leak
 506          * a norefcounted dst
 507          */
 508         skb_dst_force(skb);
 509
 510         spin_lock_irqsave(&list->lock, flags);
 511         sock_skb_set_dropcount(sk, skb);
 512         __skb_queue_tail(list, skb);
 513         spin_unlock_irqrestore(&list->lock, flags);
 514
 515         if (!sock_flag(sk, SOCK_DEAD))
 516                 sk->sk_data_ready(sk);
 517         return 0;
 518 }
 519 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 520
 521 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 522                               enum skb_drop_reason *reason)
 523 {
 524         enum skb_drop_reason drop_reason;
 525         int err;
 526
 527         err = sk_filter(sk, skb);
 528         if (err) {
 529                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 530                 goto out;
 531         }
 532         err = __sock_queue_rcv_skb(sk, skb);
 533         switch (err) {
 534         case -ENOMEM:
 535                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 536                 break;
 537         case -ENOBUFS:
 538                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 539                 break;
 540         default:
 541                 drop_reason = SKB_NOT_DROPPED_YET;
 542                 break;
 543         }
 544 out:
 545         if (reason)
 546                 *reason = drop_reason;
 547         return err;
 548 }
 549 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 550
 551 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 552                      const int nested, unsigned int trim_cap, bool refcounted)
 553 {
 554         int rc = NET_RX_SUCCESS;
 555
 556         if (sk_filter_trim_cap(sk, skb, trim_cap))
 557                 goto discard_and_relse;
 558
 559         skb->dev = NULL;
 560
 561         if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
 562                 atomic_inc(&sk->sk_drops);
 563                 goto discard_and_relse;
 564         }
 565         if (nested)
 566                 bh_lock_sock_nested(sk);
 567         else
 568                 bh_lock_sock(sk);
 569         if (!sock_owned_by_user(sk)) {
 570                 /*
 571                  * trylock + unlock semantics:
 572                  */
 573                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 574
 575                 rc = sk_backlog_rcv(sk, skb);
 576
 577                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 578         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 579                 bh_unlock_sock(sk);
 580                 atomic_inc(&sk->sk_drops);
 581                 goto discard_and_relse;
 582         }
 583
 584         bh_unlock_sock(sk);
 585 out:
 586         if (refcounted)
 587                 sock_put(sk);
 588         return rc;
 589 discard_and_relse:
 590         kfree_skb(skb);
 591         goto out;
 592 }
 593 EXPORT_SYMBOL(__sk_receive_skb);
 594
 595 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 596                                                           u32));
 597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 598                                                            u32));
 599 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 600 {
 601         struct dst_entry *dst = __sk_dst_get(sk);
 602
 603         if (dst && dst->obsolete &&
 604             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 605                                dst, cookie) == NULL) {
 606                 sk_tx_queue_clear(sk);
 607                 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 608                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 609                 dst_release(dst);
 610                 return NULL;
 611         }
 612
 613         return dst;
 614 }
 615 EXPORT_SYMBOL(__sk_dst_check);
 616
 617 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 618 {
 619         struct dst_entry *dst = sk_dst_get(sk);
 620
 621         if (dst && dst->obsolete &&
 622             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 623                                dst, cookie) == NULL) {
 624                 sk_dst_reset(sk);
 625                 dst_release(dst);
 626                 return NULL;
 627         }
 628
 629         return dst;
 630 }
 631 EXPORT_SYMBOL(sk_dst_check);
 632
 633 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 634 {
 635         int ret = -ENOPROTOOPT;
 636 #ifdef CONFIG_NETDEVICES
 637         struct net *net = sock_net(sk);
 638
 639         /* Sorry... */
 640         ret = -EPERM;
 641         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 642                 goto out;
 643
 644         ret = -EINVAL;
 645         if (ifindex < 0)
 646                 goto out;
 647
 648         /* Paired with all READ_ONCE() done locklessly. */
 649         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 650
 651         if (sk->sk_prot->rehash)
 652                 sk->sk_prot->rehash(sk);
 653         sk_dst_reset(sk);
 654
 655         ret = 0;
 656
 657 out:
 658 #endif
 659
 660         return ret;
 661 }
 662
 663 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 664 {
 665         int ret;
 666
 667         if (lock_sk)
 668                 lock_sock(sk);
 669         ret = sock_bindtoindex_locked(sk, ifindex);
 670         if (lock_sk)
 671                 release_sock(sk);
 672
 673         return ret;
 674 }
 675 EXPORT_SYMBOL(sock_bindtoindex);
 676
 677 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 678 {
 679         int ret = -ENOPROTOOPT;
 680 #ifdef CONFIG_NETDEVICES
 681         struct net *net = sock_net(sk);
 682         char devname[IFNAMSIZ];
 683         int index;
 684
 685         ret = -EINVAL;
 686         if (optlen < 0)
 687                 goto out;
 688
 689         /* Bind this socket to a particular device like "eth0",
 690          * as specified in the passed interface name. If the
 691          * name is "" or the option length is zero the socket
 692          * is not bound.
 693          */
 694         if (optlen > IFNAMSIZ - 1)
 695                 optlen = IFNAMSIZ - 1;
 696         memset(devname, 0, sizeof(devname));
 697
 698         ret = -EFAULT;
 699         if (copy_from_sockptr(devname, optval, optlen))
 700                 goto out;
 701
 702         index = 0;
 703         if (devname[0] != '\0') {
 704                 struct net_device *dev;
 705
 706                 rcu_read_lock();
 707                 dev = dev_get_by_name_rcu(net, devname);
 708                 if (dev)
 709                         index = dev->ifindex;
 710                 rcu_read_unlock();
 711                 ret = -ENODEV;
 712                 if (!dev)
 713                         goto out;
 714         }
 715
 716         sockopt_lock_sock(sk);
 717         ret = sock_bindtoindex_locked(sk, index);
 718         sockopt_release_sock(sk);
 719 out:
 720 #endif
 721
 722         return ret;
 723 }
 724
 725 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 726                                 sockptr_t optlen, int len)
 727 {
 728         int ret = -ENOPROTOOPT;
 729 #ifdef CONFIG_NETDEVICES
 730         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 731         struct net *net = sock_net(sk);
 732         char devname[IFNAMSIZ];
 733
 734         if (bound_dev_if == 0) {
 735                 len = 0;
 736                 goto zero;
 737         }
 738
 739         ret = -EINVAL;
 740         if (len < IFNAMSIZ)
 741                 goto out;
 742
 743         ret = netdev_get_name(net, devname, bound_dev_if);
 744         if (ret)
 745                 goto out;
 746
 747         len = strlen(devname) + 1;
 748
 749         ret = -EFAULT;
 750         if (copy_to_sockptr(optval, devname, len))
 751                 goto out;
 752
 753 zero:
 754         ret = -EFAULT;
 755         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 756                 goto out;
 757
 758         ret = 0;
 759
 760 out:
 761 #endif
 762
 763         return ret;
 764 }
 765
 766 bool sk_mc_loop(const struct sock *sk)
 767 {
 768         if (dev_recursion_level())
 769                 return false;
 770         if (!sk)
 771                 return true;
 772         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 773         switch (READ_ONCE(sk->sk_family)) {
 774         case AF_INET:
 775                 return inet_test_bit(MC_LOOP, sk);
 776 #if IS_ENABLED(CONFIG_IPV6)
 777         case AF_INET6:
 778                 return inet6_test_bit(MC6_LOOP, sk);
 779 #endif
 780         }
 781         WARN_ON_ONCE(1);
 782         return true;
 783 }
 784 EXPORT_SYMBOL(sk_mc_loop);
 785
 786 void sock_set_reuseaddr(struct sock *sk)
 787 {
 788         lock_sock(sk);
 789         sk->sk_reuse = SK_CAN_REUSE;
 790         release_sock(sk);
 791 }
 792 EXPORT_SYMBOL(sock_set_reuseaddr);
 793
 794 void sock_set_reuseport(struct sock *sk)
 795 {
 796         lock_sock(sk);
 797         sk->sk_reuseport = true;
 798         release_sock(sk);
 799 }
 800 EXPORT_SYMBOL(sock_set_reuseport);
 801
 802 void sock_no_linger(struct sock *sk)
 803 {
 804         lock_sock(sk);
 805         WRITE_ONCE(sk->sk_lingertime, 0);
 806         sock_set_flag(sk, SOCK_LINGER);
 807         release_sock(sk);
 808 }
 809 EXPORT_SYMBOL(sock_no_linger);
 810
 811 void sock_set_priority(struct sock *sk, u32 priority)
 812 {
 813         WRITE_ONCE(sk->sk_priority, priority);
 814 }
 815 EXPORT_SYMBOL(sock_set_priority);
 816
 817 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 818 {
 819         lock_sock(sk);
 820         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 821                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 822         else
 823                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 824         release_sock(sk);
 825 }
 826 EXPORT_SYMBOL(sock_set_sndtimeo);
 827
 828 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 829 {
 830         sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
 831         sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
 832         if (val)  {
 833                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 834                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 835         }
 836 }
 837
 838 void sock_enable_timestamps(struct sock *sk)
 839 {
 840         lock_sock(sk);
 841         __sock_set_timestamps(sk, true, false, true);
 842         release_sock(sk);
 843 }
 844 EXPORT_SYMBOL(sock_enable_timestamps);
 845
 846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 847 {
 848         switch (optname) {
 849         case SO_TIMESTAMP_OLD:
 850                 __sock_set_timestamps(sk, valbool, false, false);
 851                 break;
 852         case SO_TIMESTAMP_NEW:
 853                 __sock_set_timestamps(sk, valbool, true, false);
 854                 break;
 855         case SO_TIMESTAMPNS_OLD:
 856                 __sock_set_timestamps(sk, valbool, false, true);
 857                 break;
 858         case SO_TIMESTAMPNS_NEW:
 859                 __sock_set_timestamps(sk, valbool, true, true);
 860                 break;
 861         }
 862 }
 863
 864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 865 {
 866         struct net *net = sock_net(sk);
 867         struct net_device *dev = NULL;
 868         bool match = false;
 869         int *vclock_index;
 870         int i, num;
 871
 872         if (sk->sk_bound_dev_if)
 873                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 874
 875         if (!dev) {
 876                 pr_err("%s: sock not bind to device\n", __func__);
 877                 return -EOPNOTSUPP;
 878         }
 879
 880         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 881         dev_put(dev);
 882
 883         for (i = 0; i < num; i++) {
 884                 if (*(vclock_index + i) == phc_index) {
 885                         match = true;
 886                         break;
 887                 }
 888         }
 889
 890         if (num > 0)
 891                 kfree(vclock_index);
 892
 893         if (!match)
 894                 return -EINVAL;
 895
 896         WRITE_ONCE(sk->sk_bind_phc, phc_index);
 897
 898         return 0;
 899 }
 900
 901 int sock_set_timestamping(struct sock *sk, int optname,
 902                           struct so_timestamping timestamping)
 903 {
 904         int val = timestamping.flags;
 905         int ret;
 906
 907         if (val & ~SOF_TIMESTAMPING_MASK)
 908                 return -EINVAL;
 909
 910         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 911             !(val & SOF_TIMESTAMPING_OPT_ID))
 912                 return -EINVAL;
 913
 914         if (val & SOF_TIMESTAMPING_OPT_ID &&
 915             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 916                 if (sk_is_tcp(sk)) {
 917                         if ((1 << sk->sk_state) &
 918                             (TCPF_CLOSE | TCPF_LISTEN))
 919                                 return -EINVAL;
 920                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 921                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 922                         else
 923                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 924                 } else {
 925                         atomic_set(&sk->sk_tskey, 0);
 926                 }
 927         }
 928
 929         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 930             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 931                 return -EINVAL;
 932
 933         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 934                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 935                 if (ret)
 936                         return ret;
 937         }
 938
 939         WRITE_ONCE(sk->sk_tsflags, val);
 940         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 941
 942         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 943                 sock_enable_timestamp(sk,
 944                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 945         else
 946                 sock_disable_timestamp(sk,
 947                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 948         return 0;
 949 }
 950
 951 void sock_set_keepalive(struct sock *sk)
 952 {
 953         lock_sock(sk);
 954         if (sk->sk_prot->keepalive)
 955                 sk->sk_prot->keepalive(sk, true);
 956         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 957         release_sock(sk);
 958 }
 959 EXPORT_SYMBOL(sock_set_keepalive);
 960
 961 static void __sock_set_rcvbuf(struct sock *sk, int val)
 962 {
 963         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 964          * as a negative value.
 965          */
 966         val = min_t(int, val, INT_MAX / 2);
 967         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 968
 969         /* We double it on the way in to account for "struct sk_buff" etc.
 970          * overhead.   Applications assume that the SO_RCVBUF setting they make
 971          * will allow that much actual data to be received on that socket.
 972          *
 973          * Applications are unaware that "struct sk_buff" and other overheads
 974          * allocate from the receive buffer during socket buffer allocation.
 975          *
 976          * And after considering the possible alternatives, returning the value
 977          * we actually used in getsockopt is the most desirable behavior.
 978          */
 979         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 980 }
 981
 982 void sock_set_rcvbuf(struct sock *sk, int val)
 983 {
 984         lock_sock(sk);
 985         __sock_set_rcvbuf(sk, val);
 986         release_sock(sk);
 987 }
 988 EXPORT_SYMBOL(sock_set_rcvbuf);
 989
 990 static void __sock_set_mark(struct sock *sk, u32 val)
 991 {
 992         if (val != sk->sk_mark) {
 993                 WRITE_ONCE(sk->sk_mark, val);
 994                 sk_dst_reset(sk);
 995         }
 996 }
 997
 998 void sock_set_mark(struct sock *sk, u32 val)
 999 {
1000         lock_sock(sk);
1001         __sock_set_mark(sk, val);
1002         release_sock(sk);
1003 }
1004 EXPORT_SYMBOL(sock_set_mark);
1005
1006 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1007 {
1008         /* Round down bytes to multiple of pages */
1009         bytes = round_down(bytes, PAGE_SIZE);
1010
1011         WARN_ON(bytes > sk->sk_reserved_mem);
1012         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1013         sk_mem_reclaim(sk);
1014 }
1015
1016 static int sock_reserve_memory(struct sock *sk, int bytes)
1017 {
1018         long allocated;
1019         bool charged;
1020         int pages;
1021
1022         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1023                 return -EOPNOTSUPP;
1024
1025         if (!bytes)
1026                 return 0;
1027
1028         pages = sk_mem_pages(bytes);
1029
1030         /* pre-charge to memcg */
1031         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1032                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1033         if (!charged)
1034                 return -ENOMEM;
1035
1036         /* pre-charge to forward_alloc */
1037         sk_memory_allocated_add(sk, pages);
1038         allocated = sk_memory_allocated(sk);
1039         /* If the system goes into memory pressure with this
1040          * precharge, give up and return error.
1041          */
1042         if (allocated > sk_prot_mem_limits(sk, 1)) {
1043                 sk_memory_allocated_sub(sk, pages);
1044                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1045                 return -ENOMEM;
1046         }
1047         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1048
1049         WRITE_ONCE(sk->sk_reserved_mem,
1050                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1051
1052         return 0;
1053 }
1054
1055 #ifdef CONFIG_PAGE_POOL
1056
1057 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1058  * in 1 syscall. The limit exists to limit the amount of memory the kernel
1059  * allocates to copy these tokens, and to prevent looping over the frags for
1060  * too long.
1061  */
1062 #define MAX_DONTNEED_TOKENS 128
1063 #define MAX_DONTNEED_FRAGS 1024
1064
1065 static noinline_for_stack int
1066 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1067 {
1068         unsigned int num_tokens, i, j, k, netmem_num = 0;
1069         struct dmabuf_token *tokens;
1070         int ret = 0, num_frags = 0;
1071         netmem_ref netmems[16];
1072
1073         if (!sk_is_tcp(sk))
1074                 return -EBADF;
1075
1076         if (optlen % sizeof(*tokens) ||
1077             optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1078                 return -EINVAL;
1079
1080         num_tokens = optlen / sizeof(*tokens);
1081         tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1082         if (!tokens)
1083                 return -ENOMEM;
1084
1085         if (copy_from_sockptr(tokens, optval, optlen)) {
1086                 kvfree(tokens);
1087                 return -EFAULT;
1088         }
1089
1090         xa_lock_bh(&sk->sk_user_frags);
1091         for (i = 0; i < num_tokens; i++) {
1092                 for (j = 0; j < tokens[i].token_count; j++) {
1093                         if (++num_frags > MAX_DONTNEED_FRAGS)
1094                                 goto frag_limit_reached;
1095
1096                         netmem_ref netmem = (__force netmem_ref)__xa_erase(
1097                                 &sk->sk_user_frags, tokens[i].token_start + j);
1098
1099                         if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1100                                 continue;
1101
1102                         netmems[netmem_num++] = netmem;
1103                         if (netmem_num == ARRAY_SIZE(netmems)) {
1104                                 xa_unlock_bh(&sk->sk_user_frags);
1105                                 for (k = 0; k < netmem_num; k++)
1106                                         WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1107                                 netmem_num = 0;
1108                                 xa_lock_bh(&sk->sk_user_frags);
1109                         }
1110                         ret++;
1111                 }
1112         }
1113
1114 frag_limit_reached:
1115         xa_unlock_bh(&sk->sk_user_frags);
1116         for (k = 0; k < netmem_num; k++)
1117                 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1118
1119         kvfree(tokens);
1120         return ret;
1121 }
1122 #endif
1123
1124 void sockopt_lock_sock(struct sock *sk)
1125 {
1126         /* When current->bpf_ctx is set, the setsockopt is called from
1127          * a bpf prog.  bpf has ensured the sk lock has been
1128          * acquired before calling setsockopt().
1129          */
1130         if (has_current_bpf_ctx())
1131                 return;
1132
1133         lock_sock(sk);
1134 }
1135 EXPORT_SYMBOL(sockopt_lock_sock);
1136
1137 void sockopt_release_sock(struct sock *sk)
1138 {
1139         if (has_current_bpf_ctx())
1140                 return;
1141
1142         release_sock(sk);
1143 }
1144 EXPORT_SYMBOL(sockopt_release_sock);
1145
1146 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1147 {
1148         return has_current_bpf_ctx() || ns_capable(ns, cap);
1149 }
1150 EXPORT_SYMBOL(sockopt_ns_capable);
1151
1152 bool sockopt_capable(int cap)
1153 {
1154         return has_current_bpf_ctx() || capable(cap);
1155 }
1156 EXPORT_SYMBOL(sockopt_capable);
1157
1158 static int sockopt_validate_clockid(__kernel_clockid_t value)
1159 {
1160         switch (value) {
1161         case CLOCK_REALTIME:
1162         case CLOCK_MONOTONIC:
1163         case CLOCK_TAI:
1164                 return 0;
1165         }
1166         return -EINVAL;
1167 }
1168
1169 /*
1170  *      This is meant for all protocols to use and covers goings on
1171  *      at the socket level. Everything here is generic.
1172  */
1173
1174 int sk_setsockopt(struct sock *sk, int level, int optname,
1175                   sockptr_t optval, unsigned int optlen)
1176 {
1177         struct so_timestamping timestamping;
1178         struct socket *sock = sk->sk_socket;
1179         struct sock_txtime sk_txtime;
1180         int val;
1181         int valbool;
1182         struct linger ling;
1183         int ret = 0;
1184
1185         /*
1186          *      Options without arguments
1187          */
1188
1189         if (optname == SO_BINDTODEVICE)
1190                 return sock_setbindtodevice(sk, optval, optlen);
1191
1192         if (optlen < sizeof(int))
1193                 return -EINVAL;
1194
1195         if (copy_from_sockptr(&val, optval, sizeof(val)))
1196                 return -EFAULT;
1197
1198         valbool = val ? 1 : 0;
1199
1200         /* handle options which do not require locking the socket. */
1201         switch (optname) {
1202         case SO_PRIORITY:
1203                 if (sk_set_prio_allowed(sk, val)) {
1204                         sock_set_priority(sk, val);
1205                         return 0;
1206                 }
1207                 return -EPERM;
1208         case SO_PASSSEC:
1209                 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1210                 return 0;
1211         case SO_PASSCRED:
1212                 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1213                 return 0;
1214         case SO_PASSPIDFD:
1215                 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1216                 return 0;
1217         case SO_TYPE:
1218         case SO_PROTOCOL:
1219         case SO_DOMAIN:
1220         case SO_ERROR:
1221                 return -ENOPROTOOPT;
1222 #ifdef CONFIG_NET_RX_BUSY_POLL
1223         case SO_BUSY_POLL:
1224                 if (val < 0)
1225                         return -EINVAL;
1226                 WRITE_ONCE(sk->sk_ll_usec, val);
1227                 return 0;
1228         case SO_PREFER_BUSY_POLL:
1229                 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1230                         return -EPERM;
1231                 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1232                 return 0;
1233         case SO_BUSY_POLL_BUDGET:
1234                 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1235                     !sockopt_capable(CAP_NET_ADMIN))
1236                         return -EPERM;
1237                 if (val < 0 || val > U16_MAX)
1238                         return -EINVAL;
1239                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1240                 return 0;
1241 #endif
1242         case SO_MAX_PACING_RATE:
1243                 {
1244                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1245                 unsigned long pacing_rate;
1246
1247                 if (sizeof(ulval) != sizeof(val) &&
1248                     optlen >= sizeof(ulval) &&
1249                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1250                         return -EFAULT;
1251                 }
1252                 if (ulval != ~0UL)
1253                         cmpxchg(&sk->sk_pacing_status,
1254                                 SK_PACING_NONE,
1255                                 SK_PACING_NEEDED);
1256                 /* Pairs with READ_ONCE() from sk_getsockopt() */
1257                 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1258                 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1259                 if (ulval < pacing_rate)
1260                         WRITE_ONCE(sk->sk_pacing_rate, ulval);
1261                 return 0;
1262                 }
1263         case SO_TXREHASH:
1264                 if (val < -1 || val > 1)
1265                         return -EINVAL;
1266                 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1267                         val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1268                 /* Paired with READ_ONCE() in tcp_rtx_synack()
1269                  * and sk_getsockopt().
1270                  */
1271                 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1272                 return 0;
1273         case SO_PEEK_OFF:
1274                 {
1275                 int (*set_peek_off)(struct sock *sk, int val);
1276
1277                 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1278                 if (set_peek_off)
1279                         ret = set_peek_off(sk, val);
1280                 else
1281                         ret = -EOPNOTSUPP;
1282                 return ret;
1283                 }
1284 #ifdef CONFIG_PAGE_POOL
1285         case SO_DEVMEM_DONTNEED:
1286                 return sock_devmem_dontneed(sk, optval, optlen);
1287 #endif
1288         }
1289
1290         sockopt_lock_sock(sk);
1291
1292         switch (optname) {
1293         case SO_DEBUG:
1294                 if (val && !sockopt_capable(CAP_NET_ADMIN))
1295                         ret = -EACCES;
1296                 else
1297                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1298                 break;
1299         case SO_REUSEADDR:
1300                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1301                 break;
1302         case SO_REUSEPORT:
1303                 if (valbool && !sk_is_inet(sk))
1304                         ret = -EOPNOTSUPP;
1305                 else
1306                         sk->sk_reuseport = valbool;
1307                 break;
1308         case SO_DONTROUTE:
1309                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1310                 sk_dst_reset(sk);
1311                 break;
1312         case SO_BROADCAST:
1313                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1314                 break;
1315         case SO_SNDBUF:
1316                 /* Don't error on this BSD doesn't and if you think
1317                  * about it this is right. Otherwise apps have to
1318                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1319                  * are treated in BSD as hints
1320                  */
1321                 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1322 set_sndbuf:
1323                 /* Ensure val * 2 fits into an int, to prevent max_t()
1324                  * from treating it as a negative value.
1325                  */
1326                 val = min_t(int, val, INT_MAX / 2);
1327                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1328                 WRITE_ONCE(sk->sk_sndbuf,
1329                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1330                 /* Wake up sending tasks if we upped the value. */
1331                 sk->sk_write_space(sk);
1332                 break;
1333
1334         case SO_SNDBUFFORCE:
1335                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1336                         ret = -EPERM;
1337                         break;
1338                 }
1339
1340                 /* No negative values (to prevent underflow, as val will be
1341                  * multiplied by 2).
1342                  */
1343                 if (val < 0)
1344                         val = 0;
1345                 goto set_sndbuf;
1346
1347         case SO_RCVBUF:
1348                 /* Don't error on this BSD doesn't and if you think
1349                  * about it this is right. Otherwise apps have to
1350                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1351                  * are treated in BSD as hints
1352                  */
1353                 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1354                 break;
1355
1356         case SO_RCVBUFFORCE:
1357                 if (!sockopt_capable(CAP_NET_ADMIN)) {
1358                         ret = -EPERM;
1359                         break;
1360                 }
1361
1362                 /* No negative values (to prevent underflow, as val will be
1363                  * multiplied by 2).
1364                  */
1365                 __sock_set_rcvbuf(sk, max(val, 0));
1366                 break;
1367
1368         case SO_KEEPALIVE:
1369                 if (sk->sk_prot->keepalive)
1370                         sk->sk_prot->keepalive(sk, valbool);
1371                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1372                 break;
1373
1374         case SO_OOBINLINE:
1375                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1376                 break;
1377
1378         case SO_NO_CHECK:
1379                 sk->sk_no_check_tx = valbool;
1380                 break;
1381
1382         case SO_LINGER:
1383                 if (optlen < sizeof(ling)) {
1384                         ret = -EINVAL;  /* 1003.1g */
1385                         break;
1386                 }
1387                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1388                         ret = -EFAULT;
1389                         break;
1390                 }
1391                 if (!ling.l_onoff) {
1392                         sock_reset_flag(sk, SOCK_LINGER);
1393                 } else {
1394                         unsigned long t_sec = ling.l_linger;
1395
1396                         if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1397                                 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1398                         else
1399                                 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1400                         sock_set_flag(sk, SOCK_LINGER);
1401                 }
1402                 break;
1403
1404         case SO_BSDCOMPAT:
1405                 break;
1406
1407         case SO_TIMESTAMP_OLD:
1408         case SO_TIMESTAMP_NEW:
1409         case SO_TIMESTAMPNS_OLD:
1410         case SO_TIMESTAMPNS_NEW:
1411                 sock_set_timestamp(sk, optname, valbool);
1412                 break;
1413
1414         case SO_TIMESTAMPING_NEW:
1415         case SO_TIMESTAMPING_OLD:
1416                 if (optlen == sizeof(timestamping)) {
1417                         if (copy_from_sockptr(&timestamping, optval,
1418                                               sizeof(timestamping))) {
1419                                 ret = -EFAULT;
1420                                 break;
1421                         }
1422                 } else {
1423                         memset(&timestamping, 0, sizeof(timestamping));
1424                         timestamping.flags = val;
1425                 }
1426                 ret = sock_set_timestamping(sk, optname, timestamping);
1427                 break;
1428
1429         case SO_RCVLOWAT:
1430                 {
1431                 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1432
1433                 if (val < 0)
1434                         val = INT_MAX;
1435                 if (sock)
1436                         set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1437                 if (set_rcvlowat)
1438                         ret = set_rcvlowat(sk, val);
1439                 else
1440                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1441                 break;
1442                 }
1443         case SO_RCVTIMEO_OLD:
1444         case SO_RCVTIMEO_NEW:
1445                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1446                                        optlen, optname == SO_RCVTIMEO_OLD);
1447                 break;
1448
1449         case SO_SNDTIMEO_OLD:
1450         case SO_SNDTIMEO_NEW:
1451                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1452                                        optlen, optname == SO_SNDTIMEO_OLD);
1453                 break;
1454
1455         case SO_ATTACH_FILTER: {
1456                 struct sock_fprog fprog;
1457
1458                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1459                 if (!ret)
1460                         ret = sk_attach_filter(&fprog, sk);
1461                 break;
1462         }
1463         case SO_ATTACH_BPF:
1464                 ret = -EINVAL;
1465                 if (optlen == sizeof(u32)) {
1466                         u32 ufd;
1467
1468                         ret = -EFAULT;
1469                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1470                                 break;
1471
1472                         ret = sk_attach_bpf(ufd, sk);
1473                 }
1474                 break;
1475
1476         case SO_ATTACH_REUSEPORT_CBPF: {
1477                 struct sock_fprog fprog;
1478
1479                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1480                 if (!ret)
1481                         ret = sk_reuseport_attach_filter(&fprog, sk);
1482                 break;
1483         }
1484         case SO_ATTACH_REUSEPORT_EBPF:
1485                 ret = -EINVAL;
1486                 if (optlen == sizeof(u32)) {
1487                         u32 ufd;
1488
1489                         ret = -EFAULT;
1490                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1491                                 break;
1492
1493                         ret = sk_reuseport_attach_bpf(ufd, sk);
1494                 }
1495                 break;
1496
1497         case SO_DETACH_REUSEPORT_BPF:
1498                 ret = reuseport_detach_prog(sk);
1499                 break;
1500
1501         case SO_DETACH_FILTER:
1502                 ret = sk_detach_filter(sk);
1503                 break;
1504
1505         case SO_LOCK_FILTER:
1506                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1507                         ret = -EPERM;
1508                 else
1509                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1510                 break;
1511
1512         case SO_MARK:
1513                 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1514                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1515                         ret = -EPERM;
1516                         break;
1517                 }
1518
1519                 __sock_set_mark(sk, val);
1520                 break;
1521         case SO_RCVMARK:
1522                 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1523                 break;
1524
1525         case SO_RCVPRIORITY:
1526                 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1527                 break;
1528
1529         case SO_RXQ_OVFL:
1530                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1531                 break;
1532
1533         case SO_WIFI_STATUS:
1534                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1535                 break;
1536
1537         case SO_NOFCS:
1538                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1539                 break;
1540
1541         case SO_SELECT_ERR_QUEUE:
1542                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1543                 break;
1544
1545
1546         case SO_INCOMING_CPU:
1547                 reuseport_update_incoming_cpu(sk, val);
1548                 break;
1549
1550         case SO_CNX_ADVICE:
1551                 if (val == 1)
1552                         dst_negative_advice(sk);
1553                 break;
1554
1555         case SO_ZEROCOPY:
1556                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1557                         if (!(sk_is_tcp(sk) ||
1558                               (sk->sk_type == SOCK_DGRAM &&
1559                                sk->sk_protocol == IPPROTO_UDP)))
1560                                 ret = -EOPNOTSUPP;
1561                 } else if (sk->sk_family != PF_RDS) {
1562                         ret = -EOPNOTSUPP;
1563                 }
1564                 if (!ret) {
1565                         if (val < 0 || val > 1)
1566                                 ret = -EINVAL;
1567                         else
1568                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1569                 }
1570                 break;
1571
1572         case SO_TXTIME:
1573                 if (optlen != sizeof(struct sock_txtime)) {
1574                         ret = -EINVAL;
1575                         break;
1576                 } else if (copy_from_sockptr(&sk_txtime, optval,
1577                            sizeof(struct sock_txtime))) {
1578                         ret = -EFAULT;
1579                         break;
1580                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1581                         ret = -EINVAL;
1582                         break;
1583                 }
1584                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1585                  * scheduler has enough safe guards.
1586                  */
1587                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1588                     !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1589                         ret = -EPERM;
1590                         break;
1591                 }
1592
1593                 ret = sockopt_validate_clockid(sk_txtime.clockid);
1594                 if (ret)
1595                         break;
1596
1597                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1598                 sk->sk_clockid = sk_txtime.clockid;
1599                 sk->sk_txtime_deadline_mode =
1600                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1601                 sk->sk_txtime_report_errors =
1602                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1603                 break;
1604
1605         case SO_BINDTOIFINDEX:
1606                 ret = sock_bindtoindex_locked(sk, val);
1607                 break;
1608
1609         case SO_BUF_LOCK:
1610                 if (val & ~SOCK_BUF_LOCK_MASK) {
1611                         ret = -EINVAL;
1612                         break;
1613                 }
1614                 sk->sk_userlocks = val | (sk->sk_userlocks &
1615                                           ~SOCK_BUF_LOCK_MASK);
1616                 break;
1617
1618         case SO_RESERVE_MEM:
1619         {
1620                 int delta;
1621
1622                 if (val < 0) {
1623                         ret = -EINVAL;
1624                         break;
1625                 }
1626
1627                 delta = val - sk->sk_reserved_mem;
1628                 if (delta < 0)
1629                         sock_release_reserved_memory(sk, -delta);
1630                 else
1631                         ret = sock_reserve_memory(sk, delta);
1632                 break;
1633         }
1634
1635         default:
1636                 ret = -ENOPROTOOPT;
1637                 break;
1638         }
1639         sockopt_release_sock(sk);
1640         return ret;
1641 }
1642
1643 int sock_setsockopt(struct socket *sock, int level, int optname,
1644                     sockptr_t optval, unsigned int optlen)
1645 {
1646         return sk_setsockopt(sock->sk, level, optname,
1647                              optval, optlen);
1648 }
1649 EXPORT_SYMBOL(sock_setsockopt);
1650
1651 static const struct cred *sk_get_peer_cred(struct sock *sk)
1652 {
1653         const struct cred *cred;
1654
1655         spin_lock(&sk->sk_peer_lock);
1656         cred = get_cred(sk->sk_peer_cred);
1657         spin_unlock(&sk->sk_peer_lock);
1658
1659         return cred;
1660 }
1661
1662 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1663                           struct ucred *ucred)
1664 {
1665         ucred->pid = pid_vnr(pid);
1666         ucred->uid = ucred->gid = -1;
1667         if (cred) {
1668                 struct user_namespace *current_ns = current_user_ns();
1669
1670                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1671                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1672         }
1673 }
1674
1675 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1676 {
1677         struct user_namespace *user_ns = current_user_ns();
1678         int i;
1679
1680         for (i = 0; i < src->ngroups; i++) {
1681                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1682
1683                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1684                         return -EFAULT;
1685         }
1686
1687         return 0;
1688 }
1689
1690 int sk_getsockopt(struct sock *sk, int level, int optname,
1691                   sockptr_t optval, sockptr_t optlen)
1692 {
1693         struct socket *sock = sk->sk_socket;
1694
1695         union {
1696                 int val;
1697                 u64 val64;
1698                 unsigned long ulval;
1699                 struct linger ling;
1700                 struct old_timeval32 tm32;
1701                 struct __kernel_old_timeval tm;
1702                 struct  __kernel_sock_timeval stm;
1703                 struct sock_txtime txtime;
1704                 struct so_timestamping timestamping;
1705         } v;
1706
1707         int lv = sizeof(int);
1708         int len;
1709
1710         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1711                 return -EFAULT;
1712         if (len < 0)
1713                 return -EINVAL;
1714
1715         memset(&v, 0, sizeof(v));
1716
1717         switch (optname) {
1718         case SO_DEBUG:
1719                 v.val = sock_flag(sk, SOCK_DBG);
1720                 break;
1721
1722         case SO_DONTROUTE:
1723                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1724                 break;
1725
1726         case SO_BROADCAST:
1727                 v.val = sock_flag(sk, SOCK_BROADCAST);
1728                 break;
1729
1730         case SO_SNDBUF:
1731                 v.val = READ_ONCE(sk->sk_sndbuf);
1732                 break;
1733
1734         case SO_RCVBUF:
1735                 v.val = READ_ONCE(sk->sk_rcvbuf);
1736                 break;
1737
1738         case SO_REUSEADDR:
1739                 v.val = sk->sk_reuse;
1740                 break;
1741
1742         case SO_REUSEPORT:
1743                 v.val = sk->sk_reuseport;
1744                 break;
1745
1746         case SO_KEEPALIVE:
1747                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1748                 break;
1749
1750         case SO_TYPE:
1751                 v.val = sk->sk_type;
1752                 break;
1753
1754         case SO_PROTOCOL:
1755                 v.val = sk->sk_protocol;
1756                 break;
1757
1758         case SO_DOMAIN:
1759                 v.val = sk->sk_family;
1760                 break;
1761
1762         case SO_ERROR:
1763                 v.val = -sock_error(sk);
1764                 if (v.val == 0)
1765                         v.val = xchg(&sk->sk_err_soft, 0);
1766                 break;
1767
1768         case SO_OOBINLINE:
1769                 v.val = sock_flag(sk, SOCK_URGINLINE);
1770                 break;
1771
1772         case SO_NO_CHECK:
1773                 v.val = sk->sk_no_check_tx;
1774                 break;
1775
1776         case SO_PRIORITY:
1777                 v.val = READ_ONCE(sk->sk_priority);
1778                 break;
1779
1780         case SO_LINGER:
1781                 lv              = sizeof(v.ling);
1782                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1783                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1784                 break;
1785
1786         case SO_BSDCOMPAT:
1787                 break;
1788
1789         case SO_TIMESTAMP_OLD:
1790                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1791                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1792                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1793                 break;
1794
1795         case SO_TIMESTAMPNS_OLD:
1796                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1797                 break;
1798
1799         case SO_TIMESTAMP_NEW:
1800                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1801                 break;
1802
1803         case SO_TIMESTAMPNS_NEW:
1804                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1805                 break;
1806
1807         case SO_TIMESTAMPING_OLD:
1808         case SO_TIMESTAMPING_NEW:
1809                 lv = sizeof(v.timestamping);
1810                 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1811                  * returning the flags when they were set through the same option.
1812                  * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1813                  */
1814                 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1815                         v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1816                         v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1817                 }
1818                 break;
1819
1820         case SO_RCVTIMEO_OLD:
1821         case SO_RCVTIMEO_NEW:
1822                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1823                                       SO_RCVTIMEO_OLD == optname);
1824                 break;
1825
1826         case SO_SNDTIMEO_OLD:
1827         case SO_SNDTIMEO_NEW:
1828                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1829                                       SO_SNDTIMEO_OLD == optname);
1830                 break;
1831
1832         case SO_RCVLOWAT:
1833                 v.val = READ_ONCE(sk->sk_rcvlowat);
1834                 break;
1835
1836         case SO_SNDLOWAT:
1837                 v.val = 1;
1838                 break;
1839
1840         case SO_PASSCRED:
1841                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1842                 break;
1843
1844         case SO_PASSPIDFD:
1845                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1846                 break;
1847
1848         case SO_PEERCRED:
1849         {
1850                 struct ucred peercred;
1851                 if (len > sizeof(peercred))
1852                         len = sizeof(peercred);
1853
1854                 spin_lock(&sk->sk_peer_lock);
1855                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1856                 spin_unlock(&sk->sk_peer_lock);
1857
1858                 if (copy_to_sockptr(optval, &peercred, len))
1859                         return -EFAULT;
1860                 goto lenout;
1861         }
1862
1863         case SO_PEERPIDFD:
1864         {
1865                 struct pid *peer_pid;
1866                 struct file *pidfd_file = NULL;
1867                 int pidfd;
1868
1869                 if (len > sizeof(pidfd))
1870                         len = sizeof(pidfd);
1871
1872                 spin_lock(&sk->sk_peer_lock);
1873                 peer_pid = get_pid(sk->sk_peer_pid);
1874                 spin_unlock(&sk->sk_peer_lock);
1875
1876                 if (!peer_pid)
1877                         return -ENODATA;
1878
1879                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1880                 put_pid(peer_pid);
1881                 if (pidfd < 0)
1882                         return pidfd;
1883
1884                 if (copy_to_sockptr(optval, &pidfd, len) ||
1885                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1886                         put_unused_fd(pidfd);
1887                         fput(pidfd_file);
1888
1889                         return -EFAULT;
1890                 }
1891
1892                 fd_install(pidfd, pidfd_file);
1893                 return 0;
1894         }
1895
1896         case SO_PEERGROUPS:
1897         {
1898                 const struct cred *cred;
1899                 int ret, n;
1900
1901                 cred = sk_get_peer_cred(sk);
1902                 if (!cred)
1903                         return -ENODATA;
1904
1905                 n = cred->group_info->ngroups;
1906                 if (len < n * sizeof(gid_t)) {
1907                         len = n * sizeof(gid_t);
1908                         put_cred(cred);
1909                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1910                 }
1911                 len = n * sizeof(gid_t);
1912
1913                 ret = groups_to_user(optval, cred->group_info);
1914                 put_cred(cred);
1915                 if (ret)
1916                         return ret;
1917                 goto lenout;
1918         }
1919
1920         case SO_PEERNAME:
1921         {
1922                 struct sockaddr_storage address;
1923
1924                 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1925                 if (lv < 0)
1926                         return -ENOTCONN;
1927                 if (lv < len)
1928                         return -EINVAL;
1929                 if (copy_to_sockptr(optval, &address, len))
1930                         return -EFAULT;
1931                 goto lenout;
1932         }
1933
1934         /* Dubious BSD thing... Probably nobody even uses it, but
1935          * the UNIX standard wants it for whatever reason... -DaveM
1936          */
1937         case SO_ACCEPTCONN:
1938                 v.val = sk->sk_state == TCP_LISTEN;
1939                 break;
1940
1941         case SO_PASSSEC:
1942                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1943                 break;
1944
1945         case SO_PEERSEC:
1946                 return security_socket_getpeersec_stream(sock,
1947                                                          optval, optlen, len);
1948
1949         case SO_MARK:
1950                 v.val = READ_ONCE(sk->sk_mark);
1951                 break;
1952
1953         case SO_RCVMARK:
1954                 v.val = sock_flag(sk, SOCK_RCVMARK);
1955                 break;
1956
1957         case SO_RCVPRIORITY:
1958                 v.val = sock_flag(sk, SOCK_RCVPRIORITY);
1959                 break;
1960
1961         case SO_RXQ_OVFL:
1962                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1963                 break;
1964
1965         case SO_WIFI_STATUS:
1966                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1967                 break;
1968
1969         case SO_PEEK_OFF:
1970                 if (!READ_ONCE(sock->ops)->set_peek_off)
1971                         return -EOPNOTSUPP;
1972
1973                 v.val = READ_ONCE(sk->sk_peek_off);
1974                 break;
1975         case SO_NOFCS:
1976                 v.val = sock_flag(sk, SOCK_NOFCS);
1977                 break;
1978
1979         case SO_BINDTODEVICE:
1980                 return sock_getbindtodevice(sk, optval, optlen, len);
1981
1982         case SO_GET_FILTER:
1983                 len = sk_get_filter(sk, optval, len);
1984                 if (len < 0)
1985                         return len;
1986
1987                 goto lenout;
1988
1989         case SO_LOCK_FILTER:
1990                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1991                 break;
1992
1993         case SO_BPF_EXTENSIONS:
1994                 v.val = bpf_tell_extensions();
1995                 break;
1996
1997         case SO_SELECT_ERR_QUEUE:
1998                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1999                 break;
2000
2001 #ifdef CONFIG_NET_RX_BUSY_POLL
2002         case SO_BUSY_POLL:
2003                 v.val = READ_ONCE(sk->sk_ll_usec);
2004                 break;
2005         case SO_PREFER_BUSY_POLL:
2006                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2007                 break;
2008 #endif
2009
2010         case SO_MAX_PACING_RATE:
2011                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2012                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2013                         lv = sizeof(v.ulval);
2014                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2015                 } else {
2016                         /* 32bit version */
2017                         v.val = min_t(unsigned long, ~0U,
2018                                       READ_ONCE(sk->sk_max_pacing_rate));
2019                 }
2020                 break;
2021
2022         case SO_INCOMING_CPU:
2023                 v.val = READ_ONCE(sk->sk_incoming_cpu);
2024                 break;
2025
2026         case SO_MEMINFO:
2027         {
2028                 u32 meminfo[SK_MEMINFO_VARS];
2029
2030                 sk_get_meminfo(sk, meminfo);
2031
2032                 len = min_t(unsigned int, len, sizeof(meminfo));
2033                 if (copy_to_sockptr(optval, &meminfo, len))
2034                         return -EFAULT;
2035
2036                 goto lenout;
2037         }
2038
2039 #ifdef CONFIG_NET_RX_BUSY_POLL
2040         case SO_INCOMING_NAPI_ID:
2041                 v.val = READ_ONCE(sk->sk_napi_id);
2042
2043                 /* aggregate non-NAPI IDs down to 0 */
2044                 if (v.val < MIN_NAPI_ID)
2045                         v.val = 0;
2046
2047                 break;
2048 #endif
2049
2050         case SO_COOKIE:
2051                 lv = sizeof(u64);
2052                 if (len < lv)
2053                         return -EINVAL;
2054                 v.val64 = sock_gen_cookie(sk);
2055                 break;
2056
2057         case SO_ZEROCOPY:
2058                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2059                 break;
2060
2061         case SO_TXTIME:
2062                 lv = sizeof(v.txtime);
2063                 v.txtime.clockid = sk->sk_clockid;
2064                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2065                                   SOF_TXTIME_DEADLINE_MODE : 0;
2066                 v.txtime.flags |= sk->sk_txtime_report_errors ?
2067                                   SOF_TXTIME_REPORT_ERRORS : 0;
2068                 break;
2069
2070         case SO_BINDTOIFINDEX:
2071                 v.val = READ_ONCE(sk->sk_bound_dev_if);
2072                 break;
2073
2074         case SO_NETNS_COOKIE:
2075                 lv = sizeof(u64);
2076                 if (len != lv)
2077                         return -EINVAL;
2078                 v.val64 = sock_net(sk)->net_cookie;
2079                 break;
2080
2081         case SO_BUF_LOCK:
2082                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2083                 break;
2084
2085         case SO_RESERVE_MEM:
2086                 v.val = READ_ONCE(sk->sk_reserved_mem);
2087                 break;
2088
2089         case SO_TXREHASH:
2090                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2091                 v.val = READ_ONCE(sk->sk_txrehash);
2092                 break;
2093
2094         default:
2095                 /* We implement the SO_SNDLOWAT etc to not be settable
2096                  * (1003.1g 7).
2097                  */
2098                 return -ENOPROTOOPT;
2099         }
2100
2101         if (len > lv)
2102                 len = lv;
2103         if (copy_to_sockptr(optval, &v, len))
2104                 return -EFAULT;
2105 lenout:
2106         if (copy_to_sockptr(optlen, &len, sizeof(int)))
2107                 return -EFAULT;
2108         return 0;
2109 }
2110
2111 /*
2112  * Initialize an sk_lock.
2113  *
2114  * (We also register the sk_lock with the lock validator.)
2115  */
2116 static inline void sock_lock_init(struct sock *sk)
2117 {
2118         if (sk->sk_kern_sock)
2119                 sock_lock_init_class_and_name(
2120                         sk,
2121                         af_family_kern_slock_key_strings[sk->sk_family],
2122                         af_family_kern_slock_keys + sk->sk_family,
2123                         af_family_kern_key_strings[sk->sk_family],
2124                         af_family_kern_keys + sk->sk_family);
2125         else
2126                 sock_lock_init_class_and_name(
2127                         sk,
2128                         af_family_slock_key_strings[sk->sk_family],
2129                         af_family_slock_keys + sk->sk_family,
2130                         af_family_key_strings[sk->sk_family],
2131                         af_family_keys + sk->sk_family);
2132 }
2133
2134 /*
2135  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2136  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2137  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2138  */
2139 static void sock_copy(struct sock *nsk, const struct sock *osk)
2140 {
2141         const struct proto *prot = READ_ONCE(osk->sk_prot);
2142 #ifdef CONFIG_SECURITY_NETWORK
2143         void *sptr = nsk->sk_security;
2144 #endif
2145
2146         /* If we move sk_tx_queue_mapping out of the private section,
2147          * we must check if sk_tx_queue_clear() is called after
2148          * sock_copy() in sk_clone_lock().
2149          */
2150         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2151                      offsetof(struct sock, sk_dontcopy_begin) ||
2152                      offsetof(struct sock, sk_tx_queue_mapping) >=
2153                      offsetof(struct sock, sk_dontcopy_end));
2154
2155         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2156
2157         unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2158                       prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2159                       /* alloc is larger than struct, see sk_prot_alloc() */);
2160
2161 #ifdef CONFIG_SECURITY_NETWORK
2162         nsk->sk_security = sptr;
2163         security_sk_clone(osk, nsk);
2164 #endif
2165 }
2166
2167 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2168                 int family)
2169 {
2170         struct sock *sk;
2171         struct kmem_cache *slab;
2172
2173         slab = prot->slab;
2174         if (slab != NULL) {
2175                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2176                 if (!sk)
2177                         return sk;
2178                 if (want_init_on_alloc(priority))
2179                         sk_prot_clear_nulls(sk, prot->obj_size);
2180         } else
2181                 sk = kmalloc(prot->obj_size, priority);
2182
2183         if (sk != NULL) {
2184                 if (security_sk_alloc(sk, family, priority))
2185                         goto out_free;
2186
2187                 if (!try_module_get(prot->owner))
2188                         goto out_free_sec;
2189         }
2190
2191         return sk;
2192
2193 out_free_sec:
2194         security_sk_free(sk);
2195 out_free:
2196         if (slab != NULL)
2197                 kmem_cache_free(slab, sk);
2198         else
2199                 kfree(sk);
2200         return NULL;
2201 }
2202
2203 static void sk_prot_free(struct proto *prot, struct sock *sk)
2204 {
2205         struct kmem_cache *slab;
2206         struct module *owner;
2207
2208         owner = prot->owner;
2209         slab = prot->slab;
2210
2211         cgroup_sk_free(&sk->sk_cgrp_data);
2212         mem_cgroup_sk_free(sk);
2213         security_sk_free(sk);
2214         if (slab != NULL)
2215                 kmem_cache_free(slab, sk);
2216         else
2217                 kfree(sk);
2218         module_put(owner);
2219 }
2220
2221 /**
2222  *      sk_alloc - All socket objects are allocated here
2223  *      @net: the applicable net namespace
2224  *      @family: protocol family
2225  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2226  *      @prot: struct proto associated with this new sock instance
2227  *      @kern: is this to be a kernel socket?
2228  */
2229 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2230                       struct proto *prot, int kern)
2231 {
2232         struct sock *sk;
2233
2234         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2235         if (sk) {
2236                 sk->sk_family = family;
2237                 /*
2238                  * See comment in struct sock definition to understand
2239                  * why we need sk_prot_creator -acme
2240                  */
2241                 sk->sk_prot = sk->sk_prot_creator = prot;
2242                 sk->sk_kern_sock = kern;
2243                 sock_lock_init(sk);
2244                 sk->sk_net_refcnt = kern ? 0 : 1;
2245                 if (likely(sk->sk_net_refcnt)) {
2246                         get_net_track(net, &sk->ns_tracker, priority);
2247                         sock_inuse_add(net, 1);
2248                 } else {
2249                         __netns_tracker_alloc(net, &sk->ns_tracker,
2250                                               false, priority);
2251                 }
2252
2253                 sock_net_set(sk, net);
2254                 refcount_set(&sk->sk_wmem_alloc, 1);
2255
2256                 mem_cgroup_sk_alloc(sk);
2257                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2258                 sock_update_classid(&sk->sk_cgrp_data);
2259                 sock_update_netprioidx(&sk->sk_cgrp_data);
2260                 sk_tx_queue_clear(sk);
2261         }
2262
2263         return sk;
2264 }
2265 EXPORT_SYMBOL(sk_alloc);
2266
2267 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2268  * grace period. This is the case for UDP sockets and TCP listeners.
2269  */
2270 static void __sk_destruct(struct rcu_head *head)
2271 {
2272         struct sock *sk = container_of(head, struct sock, sk_rcu);
2273         struct sk_filter *filter;
2274
2275         if (sk->sk_destruct)
2276                 sk->sk_destruct(sk);
2277
2278         filter = rcu_dereference_check(sk->sk_filter,
2279                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2280         if (filter) {
2281                 sk_filter_uncharge(sk, filter);
2282                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2283         }
2284
2285         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2286
2287 #ifdef CONFIG_BPF_SYSCALL
2288         bpf_sk_storage_free(sk);
2289 #endif
2290
2291         if (atomic_read(&sk->sk_omem_alloc))
2292                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2293                          __func__, atomic_read(&sk->sk_omem_alloc));
2294
2295         if (sk->sk_frag.page) {
2296                 put_page(sk->sk_frag.page);
2297                 sk->sk_frag.page = NULL;
2298         }
2299
2300         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2301         put_cred(sk->sk_peer_cred);
2302         put_pid(sk->sk_peer_pid);
2303
2304         if (likely(sk->sk_net_refcnt))
2305                 put_net_track(sock_net(sk), &sk->ns_tracker);
2306         else
2307                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2308
2309         sk_prot_free(sk->sk_prot_creator, sk);
2310 }
2311
2312 void sk_destruct(struct sock *sk)
2313 {
2314         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2315
2316         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2317                 reuseport_detach_sock(sk);
2318                 use_call_rcu = true;
2319         }
2320
2321         if (use_call_rcu)
2322                 call_rcu(&sk->sk_rcu, __sk_destruct);
2323         else
2324                 __sk_destruct(&sk->sk_rcu);
2325 }
2326
2327 static void __sk_free(struct sock *sk)
2328 {
2329         if (likely(sk->sk_net_refcnt))
2330                 sock_inuse_add(sock_net(sk), -1);
2331
2332         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2333                 sock_diag_broadcast_destroy(sk);
2334         else
2335                 sk_destruct(sk);
2336 }
2337
2338 void sk_free(struct sock *sk)
2339 {
2340         /*
2341          * We subtract one from sk_wmem_alloc and can know if
2342          * some packets are still in some tx queue.
2343          * If not null, sock_wfree() will call __sk_free(sk) later
2344          */
2345         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2346                 __sk_free(sk);
2347 }
2348 EXPORT_SYMBOL(sk_free);
2349
2350 static void sk_init_common(struct sock *sk)
2351 {
2352         skb_queue_head_init(&sk->sk_receive_queue);
2353         skb_queue_head_init(&sk->sk_write_queue);
2354         skb_queue_head_init(&sk->sk_error_queue);
2355
2356         rwlock_init(&sk->sk_callback_lock);
2357         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2358                         af_rlock_keys + sk->sk_family,
2359                         af_family_rlock_key_strings[sk->sk_family]);
2360         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2361                         af_wlock_keys + sk->sk_family,
2362                         af_family_wlock_key_strings[sk->sk_family]);
2363         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2364                         af_elock_keys + sk->sk_family,
2365                         af_family_elock_key_strings[sk->sk_family]);
2366         if (sk->sk_kern_sock)
2367                 lockdep_set_class_and_name(&sk->sk_callback_lock,
2368                         af_kern_callback_keys + sk->sk_family,
2369                         af_family_kern_clock_key_strings[sk->sk_family]);
2370         else
2371                 lockdep_set_class_and_name(&sk->sk_callback_lock,
2372                         af_callback_keys + sk->sk_family,
2373                         af_family_clock_key_strings[sk->sk_family]);
2374 }
2375
2376 /**
2377  *      sk_clone_lock - clone a socket, and lock its clone
2378  *      @sk: the socket to clone
2379  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2380  *
2381  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2382  */
2383 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2384 {
2385         struct proto *prot = READ_ONCE(sk->sk_prot);
2386         struct sk_filter *filter;
2387         bool is_charged = true;
2388         struct sock *newsk;
2389
2390         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2391         if (!newsk)
2392                 goto out;
2393
2394         sock_copy(newsk, sk);
2395
2396         newsk->sk_prot_creator = prot;
2397
2398         /* SANITY */
2399         if (likely(newsk->sk_net_refcnt)) {
2400                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2401                 sock_inuse_add(sock_net(newsk), 1);
2402         } else {
2403                 /* Kernel sockets are not elevating the struct net refcount.
2404                  * Instead, use a tracker to more easily detect if a layer
2405                  * is not properly dismantling its kernel sockets at netns
2406                  * destroy time.
2407                  */
2408                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2409                                       false, priority);
2410         }
2411         sk_node_init(&newsk->sk_node);
2412         sock_lock_init(newsk);
2413         bh_lock_sock(newsk);
2414         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2415         newsk->sk_backlog.len = 0;
2416
2417         atomic_set(&newsk->sk_rmem_alloc, 0);
2418
2419         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2420         refcount_set(&newsk->sk_wmem_alloc, 1);
2421
2422         atomic_set(&newsk->sk_omem_alloc, 0);
2423         sk_init_common(newsk);
2424
2425         newsk->sk_dst_cache     = NULL;
2426         newsk->sk_dst_pending_confirm = 0;
2427         newsk->sk_wmem_queued   = 0;
2428         newsk->sk_forward_alloc = 0;
2429         newsk->sk_reserved_mem  = 0;
2430         atomic_set(&newsk->sk_drops, 0);
2431         newsk->sk_send_head     = NULL;
2432         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2433         atomic_set(&newsk->sk_zckey, 0);
2434
2435         sock_reset_flag(newsk, SOCK_DONE);
2436
2437         /* sk->sk_memcg will be populated at accept() time */
2438         newsk->sk_memcg = NULL;
2439
2440         cgroup_sk_clone(&newsk->sk_cgrp_data);
2441
2442         rcu_read_lock();
2443         filter = rcu_dereference(sk->sk_filter);
2444         if (filter != NULL)
2445                 /* though it's an empty new sock, the charging may fail
2446                  * if sysctl_optmem_max was changed between creation of
2447                  * original socket and cloning
2448                  */
2449                 is_charged = sk_filter_charge(newsk, filter);
2450         RCU_INIT_POINTER(newsk->sk_filter, filter);
2451         rcu_read_unlock();
2452
2453         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2454                 /* We need to make sure that we don't uncharge the new
2455                  * socket if we couldn't charge it in the first place
2456                  * as otherwise we uncharge the parent's filter.
2457                  */
2458                 if (!is_charged)
2459                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2460                 sk_free_unlock_clone(newsk);
2461                 newsk = NULL;
2462                 goto out;
2463         }
2464         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2465
2466         if (bpf_sk_storage_clone(sk, newsk)) {
2467                 sk_free_unlock_clone(newsk);
2468                 newsk = NULL;
2469                 goto out;
2470         }
2471
2472         /* Clear sk_user_data if parent had the pointer tagged
2473          * as not suitable for copying when cloning.
2474          */
2475         if (sk_user_data_is_nocopy(newsk))
2476                 newsk->sk_user_data = NULL;
2477
2478         newsk->sk_err      = 0;
2479         newsk->sk_err_soft = 0;
2480         newsk->sk_priority = 0;
2481         newsk->sk_incoming_cpu = raw_smp_processor_id();
2482
2483         /* Before updating sk_refcnt, we must commit prior changes to memory
2484          * (Documentation/RCU/rculist_nulls.rst for details)
2485          */
2486         smp_wmb();
2487         refcount_set(&newsk->sk_refcnt, 2);
2488
2489         sk_set_socket(newsk, NULL);
2490         sk_tx_queue_clear(newsk);
2491         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2492
2493         if (newsk->sk_prot->sockets_allocated)
2494                 sk_sockets_allocated_inc(newsk);
2495
2496         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2497                 net_enable_timestamp();
2498 out:
2499         return newsk;
2500 }
2501 EXPORT_SYMBOL_GPL(sk_clone_lock);
2502
2503 void sk_free_unlock_clone(struct sock *sk)
2504 {
2505         /* It is still raw copy of parent, so invalidate
2506          * destructor and make plain sk_free() */
2507         sk->sk_destruct = NULL;
2508         bh_unlock_sock(sk);
2509         sk_free(sk);
2510 }
2511 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2512
2513 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2514 {
2515         bool is_ipv6 = false;
2516         u32 max_size;
2517
2518 #if IS_ENABLED(CONFIG_IPV6)
2519         is_ipv6 = (sk->sk_family == AF_INET6 &&
2520                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2521 #endif
2522         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2523         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2524                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2525         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2526                 max_size = GSO_LEGACY_MAX_SIZE;
2527
2528         return max_size - (MAX_TCP_HEADER + 1);
2529 }
2530
2531 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2532 {
2533         u32 max_segs = 1;
2534
2535         sk->sk_route_caps = dst->dev->features;
2536         if (sk_is_tcp(sk))
2537                 sk->sk_route_caps |= NETIF_F_GSO;
2538         if (sk->sk_route_caps & NETIF_F_GSO)
2539                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2540         if (unlikely(sk->sk_gso_disabled))
2541                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2542         if (sk_can_gso(sk)) {
2543                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2544                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2545                 } else {
2546                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2547                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2548                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2549                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2550                 }
2551         }
2552         sk->sk_gso_max_segs = max_segs;
2553         sk_dst_set(sk, dst);
2554 }
2555 EXPORT_SYMBOL_GPL(sk_setup_caps);
2556
2557 /*
2558  *      Simple resource managers for sockets.
2559  */
2560
2561
2562 /*
2563  * Write buffer destructor automatically called from kfree_skb.
2564  */
2565 void sock_wfree(struct sk_buff *skb)
2566 {
2567         struct sock *sk = skb->sk;
2568         unsigned int len = skb->truesize;
2569         bool free;
2570
2571         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2572                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2573                     sk->sk_write_space == sock_def_write_space) {
2574                         rcu_read_lock();
2575                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2576                         sock_def_write_space_wfree(sk);
2577                         rcu_read_unlock();
2578                         if (unlikely(free))
2579                                 __sk_free(sk);
2580                         return;
2581                 }
2582
2583                 /*
2584                  * Keep a reference on sk_wmem_alloc, this will be released
2585                  * after sk_write_space() call
2586                  */
2587                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2588                 sk->sk_write_space(sk);
2589                 len = 1;
2590         }
2591         /*
2592          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2593          * could not do because of in-flight packets
2594          */
2595         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2596                 __sk_free(sk);
2597 }
2598 EXPORT_SYMBOL(sock_wfree);
2599
2600 /* This variant of sock_wfree() is used by TCP,
2601  * since it sets SOCK_USE_WRITE_QUEUE.
2602  */
2603 void __sock_wfree(struct sk_buff *skb)
2604 {
2605         struct sock *sk = skb->sk;
2606
2607         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2608                 __sk_free(sk);
2609 }
2610
2611 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2612 {
2613         skb_orphan(skb);
2614 #ifdef CONFIG_INET
2615         if (unlikely(!sk_fullsock(sk)))
2616                 return skb_set_owner_edemux(skb, sk);
2617 #endif
2618         skb->sk = sk;
2619         skb->destructor = sock_wfree;
2620         skb_set_hash_from_sk(skb, sk);
2621         /*
2622          * We used to take a refcount on sk, but following operation
2623          * is enough to guarantee sk_free() won't free this sock until
2624          * all in-flight packets are completed
2625          */
2626         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2627 }
2628 EXPORT_SYMBOL(skb_set_owner_w);
2629
2630 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2631 {
2632         /* Drivers depend on in-order delivery for crypto offload,
2633          * partial orphan breaks out-of-order-OK logic.
2634          */
2635         if (skb_is_decrypted(skb))
2636                 return false;
2637
2638         return (skb->destructor == sock_wfree ||
2639                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2640 }
2641
2642 /* This helper is used by netem, as it can hold packets in its
2643  * delay queue. We want to allow the owner socket to send more
2644  * packets, as if they were already TX completed by a typical driver.
2645  * But we also want to keep skb->sk set because some packet schedulers
2646  * rely on it (sch_fq for example).
2647  */
2648 void skb_orphan_partial(struct sk_buff *skb)
2649 {
2650         if (skb_is_tcp_pure_ack(skb))
2651                 return;
2652
2653         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2654                 return;
2655
2656         skb_orphan(skb);
2657 }
2658 EXPORT_SYMBOL(skb_orphan_partial);
2659
2660 /*
2661  * Read buffer destructor automatically called from kfree_skb.
2662  */
2663 void sock_rfree(struct sk_buff *skb)
2664 {
2665         struct sock *sk = skb->sk;
2666         unsigned int len = skb->truesize;
2667
2668         atomic_sub(len, &sk->sk_rmem_alloc);
2669         sk_mem_uncharge(sk, len);
2670 }
2671 EXPORT_SYMBOL(sock_rfree);
2672
2673 /*
2674  * Buffer destructor for skbs that are not used directly in read or write
2675  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2676  */
2677 void sock_efree(struct sk_buff *skb)
2678 {
2679         sock_put(skb->sk);
2680 }
2681 EXPORT_SYMBOL(sock_efree);
2682
2683 /* Buffer destructor for prefetch/receive path where reference count may
2684  * not be held, e.g. for listen sockets.
2685  */
2686 #ifdef CONFIG_INET
2687 void sock_pfree(struct sk_buff *skb)
2688 {
2689         struct sock *sk = skb->sk;
2690
2691         if (!sk_is_refcounted(sk))
2692                 return;
2693
2694         if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2695                 inet_reqsk(sk)->rsk_listener = NULL;
2696                 reqsk_free(inet_reqsk(sk));
2697                 return;
2698         }
2699
2700         sock_gen_put(sk);
2701 }
2702 EXPORT_SYMBOL(sock_pfree);
2703 #endif /* CONFIG_INET */
2704
2705 kuid_t sock_i_uid(struct sock *sk)
2706 {
2707         kuid_t uid;
2708
2709         read_lock_bh(&sk->sk_callback_lock);
2710         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2711         read_unlock_bh(&sk->sk_callback_lock);
2712         return uid;
2713 }
2714 EXPORT_SYMBOL(sock_i_uid);
2715
2716 unsigned long __sock_i_ino(struct sock *sk)
2717 {
2718         unsigned long ino;
2719
2720         read_lock(&sk->sk_callback_lock);
2721         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2722         read_unlock(&sk->sk_callback_lock);
2723         return ino;
2724 }
2725 EXPORT_SYMBOL(__sock_i_ino);
2726
2727 unsigned long sock_i_ino(struct sock *sk)
2728 {
2729         unsigned long ino;
2730
2731         local_bh_disable();
2732         ino = __sock_i_ino(sk);
2733         local_bh_enable();
2734         return ino;
2735 }
2736 EXPORT_SYMBOL(sock_i_ino);
2737
2738 /*
2739  * Allocate a skb from the socket's send buffer.
2740  */
2741 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2742                              gfp_t priority)
2743 {
2744         if (force ||
2745             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2746                 struct sk_buff *skb = alloc_skb(size, priority);
2747
2748                 if (skb) {
2749                         skb_set_owner_w(skb, sk);
2750                         return skb;
2751                 }
2752         }
2753         return NULL;
2754 }
2755 EXPORT_SYMBOL(sock_wmalloc);
2756
2757 static void sock_ofree(struct sk_buff *skb)
2758 {
2759         struct sock *sk = skb->sk;
2760
2761         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2762 }
2763
2764 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2765                              gfp_t priority)
2766 {
2767         struct sk_buff *skb;
2768
2769         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2770         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2771             READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2772                 return NULL;
2773
2774         skb = alloc_skb(size, priority);
2775         if (!skb)
2776                 return NULL;
2777
2778         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2779         skb->sk = sk;
2780         skb->destructor = sock_ofree;
2781         return skb;
2782 }
2783
2784 /*
2785  * Allocate a memory block from the socket's option memory buffer.
2786  */
2787 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2788 {
2789         int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2790
2791         if ((unsigned int)size <= optmem_max &&
2792             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2793                 void *mem;
2794                 /* First do the add, to avoid the race if kmalloc
2795                  * might sleep.
2796                  */
2797                 atomic_add(size, &sk->sk_omem_alloc);
2798                 mem = kmalloc(size, priority);
2799                 if (mem)
2800                         return mem;
2801                 atomic_sub(size, &sk->sk_omem_alloc);
2802         }
2803         return NULL;
2804 }
2805 EXPORT_SYMBOL(sock_kmalloc);
2806
2807 /* Free an option memory block. Note, we actually want the inline
2808  * here as this allows gcc to detect the nullify and fold away the
2809  * condition entirely.
2810  */
2811 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2812                                   const bool nullify)
2813 {
2814         if (WARN_ON_ONCE(!mem))
2815                 return;
2816         if (nullify)
2817                 kfree_sensitive(mem);
2818         else
2819                 kfree(mem);
2820         atomic_sub(size, &sk->sk_omem_alloc);
2821 }
2822
2823 void sock_kfree_s(struct sock *sk, void *mem, int size)
2824 {
2825         __sock_kfree_s(sk, mem, size, false);
2826 }
2827 EXPORT_SYMBOL(sock_kfree_s);
2828
2829 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2830 {
2831         __sock_kfree_s(sk, mem, size, true);
2832 }
2833 EXPORT_SYMBOL(sock_kzfree_s);
2834
2835 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2836    I think, these locks should be removed for datagram sockets.
2837  */
2838 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2839 {
2840         DEFINE_WAIT(wait);
2841
2842         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2843         for (;;) {
2844                 if (!timeo)
2845                         break;
2846                 if (signal_pending(current))
2847                         break;
2848                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2849                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2850                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2851                         break;
2852                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2853                         break;
2854                 if (READ_ONCE(sk->sk_err))
2855                         break;
2856                 timeo = schedule_timeout(timeo);
2857         }
2858         finish_wait(sk_sleep(sk), &wait);
2859         return timeo;
2860 }
2861
2862
2863 /*
2864  *      Generic send/receive buffer handlers
2865  */
2866
2867 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2868                                      unsigned long data_len, int noblock,
2869                                      int *errcode, int max_page_order)
2870 {
2871         struct sk_buff *skb;
2872         long timeo;
2873         int err;
2874
2875         timeo = sock_sndtimeo(sk, noblock);
2876         for (;;) {
2877                 err = sock_error(sk);
2878                 if (err != 0)
2879                         goto failure;
2880
2881                 err = -EPIPE;
2882                 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2883                         goto failure;
2884
2885                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2886                         break;
2887
2888                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2889                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2890                 err = -EAGAIN;
2891                 if (!timeo)
2892                         goto failure;
2893                 if (signal_pending(current))
2894                         goto interrupted;
2895                 timeo = sock_wait_for_wmem(sk, timeo);
2896         }
2897         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2898                                    errcode, sk->sk_allocation);
2899         if (skb)
2900                 skb_set_owner_w(skb, sk);
2901         return skb;
2902
2903 interrupted:
2904         err = sock_intr_errno(timeo);
2905 failure:
2906         *errcode = err;
2907         return NULL;
2908 }
2909 EXPORT_SYMBOL(sock_alloc_send_pskb);
2910
2911 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2912                      struct sockcm_cookie *sockc)
2913 {
2914         u32 tsflags;
2915
2916         BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
2917
2918         switch (cmsg->cmsg_type) {
2919         case SO_MARK:
2920                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2921                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2922                         return -EPERM;
2923                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2924                         return -EINVAL;
2925                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2926                 break;
2927         case SO_TIMESTAMPING_OLD:
2928         case SO_TIMESTAMPING_NEW:
2929                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2930                         return -EINVAL;
2931
2932                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2933                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2934                         return -EINVAL;
2935
2936                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2937                 sockc->tsflags |= tsflags;
2938                 break;
2939         case SCM_TXTIME:
2940                 if (!sock_flag(sk, SOCK_TXTIME))
2941                         return -EINVAL;
2942                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2943                         return -EINVAL;
2944                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2945                 break;
2946         case SCM_TS_OPT_ID:
2947                 if (sk_is_tcp(sk))
2948                         return -EINVAL;
2949                 tsflags = READ_ONCE(sk->sk_tsflags);
2950                 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
2951                         return -EINVAL;
2952                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2953                         return -EINVAL;
2954                 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
2955                 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
2956                 break;
2957         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2958         case SCM_RIGHTS:
2959         case SCM_CREDENTIALS:
2960                 break;
2961         case SO_PRIORITY:
2962                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2963                         return -EINVAL;
2964                 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
2965                         return -EPERM;
2966                 sockc->priority = *(u32 *)CMSG_DATA(cmsg);
2967                 break;
2968         default:
2969                 return -EINVAL;
2970         }
2971         return 0;
2972 }
2973 EXPORT_SYMBOL(__sock_cmsg_send);
2974
2975 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2976                    struct sockcm_cookie *sockc)
2977 {
2978         struct cmsghdr *cmsg;
2979         int ret;
2980
2981         for_each_cmsghdr(cmsg, msg) {
2982                 if (!CMSG_OK(msg, cmsg))
2983                         return -EINVAL;
2984                 if (cmsg->cmsg_level != SOL_SOCKET)
2985                         continue;
2986                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2987                 if (ret)
2988                         return ret;
2989         }
2990         return 0;
2991 }
2992 EXPORT_SYMBOL(sock_cmsg_send);
2993
2994 static void sk_enter_memory_pressure(struct sock *sk)
2995 {
2996         if (!sk->sk_prot->enter_memory_pressure)
2997                 return;
2998
2999         sk->sk_prot->enter_memory_pressure(sk);
3000 }
3001
3002 static void sk_leave_memory_pressure(struct sock *sk)
3003 {
3004         if (sk->sk_prot->leave_memory_pressure) {
3005                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3006                                      tcp_leave_memory_pressure, sk);
3007         } else {
3008                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3009
3010                 if (memory_pressure && READ_ONCE(*memory_pressure))
3011                         WRITE_ONCE(*memory_pressure, 0);
3012         }
3013 }
3014
3015 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3016
3017 /**
3018  * skb_page_frag_refill - check that a page_frag contains enough room
3019  * @sz: minimum size of the fragment we want to get
3020  * @pfrag: pointer to page_frag
3021  * @gfp: priority for memory allocation
3022  *
3023  * Note: While this allocator tries to use high order pages, there is
3024  * no guarantee that allocations succeed. Therefore, @sz MUST be
3025  * less or equal than PAGE_SIZE.
3026  */
3027 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3028 {
3029         if (pfrag->page) {
3030                 if (page_ref_count(pfrag->page) == 1) {
3031                         pfrag->offset = 0;
3032                         return true;
3033                 }
3034                 if (pfrag->offset + sz <= pfrag->size)
3035                         return true;
3036                 put_page(pfrag->page);
3037         }
3038
3039         pfrag->offset = 0;
3040         if (SKB_FRAG_PAGE_ORDER &&
3041             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3042                 /* Avoid direct reclaim but allow kswapd to wake */
3043                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3044                                           __GFP_COMP | __GFP_NOWARN |
3045                                           __GFP_NORETRY,
3046                                           SKB_FRAG_PAGE_ORDER);
3047                 if (likely(pfrag->page)) {
3048                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3049                         return true;
3050                 }
3051         }
3052         pfrag->page = alloc_page(gfp);
3053         if (likely(pfrag->page)) {
3054                 pfrag->size = PAGE_SIZE;
3055                 return true;
3056         }
3057         return false;
3058 }
3059 EXPORT_SYMBOL(skb_page_frag_refill);
3060
3061 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3062 {
3063         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3064                 return true;
3065
3066         sk_enter_memory_pressure(sk);
3067         sk_stream_moderate_sndbuf(sk);
3068         return false;
3069 }
3070 EXPORT_SYMBOL(sk_page_frag_refill);
3071
3072 void __lock_sock(struct sock *sk)
3073         __releases(&sk->sk_lock.slock)
3074         __acquires(&sk->sk_lock.slock)
3075 {
3076         DEFINE_WAIT(wait);
3077
3078         for (;;) {
3079                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3080                                         TASK_UNINTERRUPTIBLE);
3081                 spin_unlock_bh(&sk->sk_lock.slock);
3082                 schedule();
3083                 spin_lock_bh(&sk->sk_lock.slock);
3084                 if (!sock_owned_by_user(sk))
3085                         break;
3086         }
3087         finish_wait(&sk->sk_lock.wq, &wait);
3088 }
3089
3090 void __release_sock(struct sock *sk)
3091         __releases(&sk->sk_lock.slock)
3092         __acquires(&sk->sk_lock.slock)
3093 {
3094         struct sk_buff *skb, *next;
3095
3096         while ((skb = sk->sk_backlog.head) != NULL) {
3097                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3098
3099                 spin_unlock_bh(&sk->sk_lock.slock);
3100
3101                 do {
3102                         next = skb->next;
3103                         prefetch(next);
3104                         DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3105                         skb_mark_not_on_list(skb);
3106                         sk_backlog_rcv(sk, skb);
3107
3108                         cond_resched();
3109
3110                         skb = next;
3111                 } while (skb != NULL);
3112
3113                 spin_lock_bh(&sk->sk_lock.slock);
3114         }
3115
3116         /*
3117          * Doing the zeroing here guarantee we can not loop forever
3118          * while a wild producer attempts to flood us.
3119          */
3120         sk->sk_backlog.len = 0;
3121 }
3122
3123 void __sk_flush_backlog(struct sock *sk)
3124 {
3125         spin_lock_bh(&sk->sk_lock.slock);
3126         __release_sock(sk);
3127
3128         if (sk->sk_prot->release_cb)
3129                 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3130                                      tcp_release_cb, sk);
3131
3132         spin_unlock_bh(&sk->sk_lock.slock);
3133 }
3134 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3135
3136 /**
3137  * sk_wait_data - wait for data to arrive at sk_receive_queue
3138  * @sk:    sock to wait on
3139  * @timeo: for how long
3140  * @skb:   last skb seen on sk_receive_queue
3141  *
3142  * Now socket state including sk->sk_err is changed only under lock,
3143  * hence we may omit checks after joining wait queue.
3144  * We check receive queue before schedule() only as optimization;
3145  * it is very likely that release_sock() added new data.
3146  */
3147 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3148 {
3149         DEFINE_WAIT_FUNC(wait, woken_wake_function);
3150         int rc;
3151
3152         add_wait_queue(sk_sleep(sk), &wait);
3153         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3154         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3155         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3156         remove_wait_queue(sk_sleep(sk), &wait);
3157         return rc;
3158 }
3159 EXPORT_SYMBOL(sk_wait_data);
3160
3161 /**
3162  *      __sk_mem_raise_allocated - increase memory_allocated
3163  *      @sk: socket
3164  *      @size: memory size to allocate
3165  *      @amt: pages to allocate
3166  *      @kind: allocation type
3167  *
3168  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3169  *
3170  *      Unlike the globally shared limits among the sockets under same protocol,
3171  *      consuming the budget of a memcg won't have direct effect on other ones.
3172  *      So be optimistic about memcg's tolerance, and leave the callers to decide
3173  *      whether or not to raise allocated through sk_under_memory_pressure() or
3174  *      its variants.
3175  */
3176 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3177 {
3178         struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3179         struct proto *prot = sk->sk_prot;
3180         bool charged = false;
3181         long allocated;
3182
3183         sk_memory_allocated_add(sk, amt);
3184         allocated = sk_memory_allocated(sk);
3185
3186         if (memcg) {
3187                 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3188                         goto suppress_allocation;
3189                 charged = true;
3190         }
3191
3192         /* Under limit. */
3193         if (allocated <= sk_prot_mem_limits(sk, 0)) {
3194                 sk_leave_memory_pressure(sk);
3195                 return 1;
3196         }
3197
3198         /* Under pressure. */
3199         if (allocated > sk_prot_mem_limits(sk, 1))
3200                 sk_enter_memory_pressure(sk);
3201
3202         /* Over hard limit. */
3203         if (allocated > sk_prot_mem_limits(sk, 2))
3204                 goto suppress_allocation;
3205
3206         /* Guarantee minimum buffer size under pressure (either global
3207          * or memcg) to make sure features described in RFC 7323 (TCP
3208          * Extensions for High Performance) work properly.
3209          *
3210          * This rule does NOT stand when exceeds global or memcg's hard
3211          * limit, or else a DoS attack can be taken place by spawning
3212          * lots of sockets whose usage are under minimum buffer size.
3213          */
3214         if (kind == SK_MEM_RECV) {
3215                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3216                         return 1;
3217
3218         } else { /* SK_MEM_SEND */
3219                 int wmem0 = sk_get_wmem0(sk, prot);
3220
3221                 if (sk->sk_type == SOCK_STREAM) {
3222                         if (sk->sk_wmem_queued < wmem0)
3223                                 return 1;
3224                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3225                                 return 1;
3226                 }
3227         }
3228
3229         if (sk_has_memory_pressure(sk)) {
3230                 u64 alloc;
3231
3232                 /* The following 'average' heuristic is within the
3233                  * scope of global accounting, so it only makes
3234                  * sense for global memory pressure.
3235                  */
3236                 if (!sk_under_global_memory_pressure(sk))
3237                         return 1;
3238
3239                 /* Try to be fair among all the sockets under global
3240                  * pressure by allowing the ones that below average
3241                  * usage to raise.
3242                  */
3243                 alloc = sk_sockets_allocated_read_positive(sk);
3244                 if (sk_prot_mem_limits(sk, 2) > alloc *
3245                     sk_mem_pages(sk->sk_wmem_queued +
3246                                  atomic_read(&sk->sk_rmem_alloc) +
3247                                  sk->sk_forward_alloc))
3248                         return 1;
3249         }
3250
3251 suppress_allocation:
3252
3253         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3254                 sk_stream_moderate_sndbuf(sk);
3255
3256                 /* Fail only if socket is _under_ its sndbuf.
3257                  * In this case we cannot block, so that we have to fail.
3258                  */
3259                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3260                         /* Force charge with __GFP_NOFAIL */
3261                         if (memcg && !charged) {
3262                                 mem_cgroup_charge_skmem(memcg, amt,
3263                                         gfp_memcg_charge() | __GFP_NOFAIL);
3264                         }
3265                         return 1;
3266                 }
3267         }
3268
3269         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3270                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3271
3272         sk_memory_allocated_sub(sk, amt);
3273
3274         if (charged)
3275                 mem_cgroup_uncharge_skmem(memcg, amt);
3276
3277         return 0;
3278 }
3279
3280 /**
3281  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3282  *      @sk: socket
3283  *      @size: memory size to allocate
3284  *      @kind: allocation type
3285  *
3286  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3287  *      rmem allocation. This function assumes that protocols which have
3288  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3289  */
3290 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3291 {
3292         int ret, amt = sk_mem_pages(size);
3293
3294         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3295         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3296         if (!ret)
3297                 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3298         return ret;
3299 }
3300 EXPORT_SYMBOL(__sk_mem_schedule);
3301
3302 /**
3303  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3304  *      @sk: socket
3305  *      @amount: number of quanta
3306  *
3307  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3308  */
3309 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3310 {
3311         sk_memory_allocated_sub(sk, amount);
3312
3313         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3314                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3315
3316         if (sk_under_global_memory_pressure(sk) &&
3317             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3318                 sk_leave_memory_pressure(sk);
3319 }
3320
3321 /**
3322  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3323  *      @sk: socket
3324  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3325  */
3326 void __sk_mem_reclaim(struct sock *sk, int amount)
3327 {
3328         amount >>= PAGE_SHIFT;
3329         sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3330         __sk_mem_reduce_allocated(sk, amount);
3331 }
3332 EXPORT_SYMBOL(__sk_mem_reclaim);
3333
3334 int sk_set_peek_off(struct sock *sk, int val)
3335 {
3336         WRITE_ONCE(sk->sk_peek_off, val);
3337         return 0;
3338 }
3339 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3340
3341 /*
3342  * Set of default routines for initialising struct proto_ops when
3343  * the protocol does not support a particular function. In certain
3344  * cases where it makes no sense for a protocol to have a "do nothing"
3345  * function, some default processing is provided.
3346  */
3347
3348 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3349 {
3350         return -EOPNOTSUPP;
3351 }
3352 EXPORT_SYMBOL(sock_no_bind);
3353
3354 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3355                     int len, int flags)
3356 {
3357         return -EOPNOTSUPP;
3358 }
3359 EXPORT_SYMBOL(sock_no_connect);
3360
3361 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3362 {
3363         return -EOPNOTSUPP;
3364 }
3365 EXPORT_SYMBOL(sock_no_socketpair);
3366
3367 int sock_no_accept(struct socket *sock, struct socket *newsock,
3368                    struct proto_accept_arg *arg)
3369 {
3370         return -EOPNOTSUPP;
3371 }
3372 EXPORT_SYMBOL(sock_no_accept);
3373
3374 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3375                     int peer)
3376 {
3377         return -EOPNOTSUPP;
3378 }
3379 EXPORT_SYMBOL(sock_no_getname);
3380
3381 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3382 {
3383         return -EOPNOTSUPP;
3384 }
3385 EXPORT_SYMBOL(sock_no_ioctl);
3386
3387 int sock_no_listen(struct socket *sock, int backlog)
3388 {
3389         return -EOPNOTSUPP;
3390 }
3391 EXPORT_SYMBOL(sock_no_listen);
3392
3393 int sock_no_shutdown(struct socket *sock, int how)
3394 {
3395         return -EOPNOTSUPP;
3396 }
3397 EXPORT_SYMBOL(sock_no_shutdown);
3398
3399 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3400 {
3401         return -EOPNOTSUPP;
3402 }
3403 EXPORT_SYMBOL(sock_no_sendmsg);
3404
3405 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3406 {
3407         return -EOPNOTSUPP;
3408 }
3409 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3410
3411 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3412                     int flags)
3413 {
3414         return -EOPNOTSUPP;
3415 }
3416 EXPORT_SYMBOL(sock_no_recvmsg);
3417
3418 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3419 {
3420         /* Mirror missing mmap method error code */
3421         return -ENODEV;
3422 }
3423 EXPORT_SYMBOL(sock_no_mmap);
3424
3425 /*
3426  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3427  * various sock-based usage counts.
3428  */
3429 void __receive_sock(struct file *file)
3430 {
3431         struct socket *sock;
3432
3433         sock = sock_from_file(file);
3434         if (sock) {
3435                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3436                 sock_update_classid(&sock->sk->sk_cgrp_data);
3437         }
3438 }
3439
3440 /*
3441  *      Default Socket Callbacks
3442  */
3443
3444 static void sock_def_wakeup(struct sock *sk)
3445 {
3446         struct socket_wq *wq;
3447
3448         rcu_read_lock();
3449         wq = rcu_dereference(sk->sk_wq);
3450         if (skwq_has_sleeper(wq))
3451                 wake_up_interruptible_all(&wq->wait);
3452         rcu_read_unlock();
3453 }
3454
3455 static void sock_def_error_report(struct sock *sk)
3456 {
3457         struct socket_wq *wq;
3458
3459         rcu_read_lock();
3460         wq = rcu_dereference(sk->sk_wq);
3461         if (skwq_has_sleeper(wq))
3462                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3463         sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3464         rcu_read_unlock();
3465 }
3466
3467 void sock_def_readable(struct sock *sk)
3468 {
3469         struct socket_wq *wq;
3470
3471         trace_sk_data_ready(sk);
3472
3473         rcu_read_lock();
3474         wq = rcu_dereference(sk->sk_wq);
3475         if (skwq_has_sleeper(wq))
3476                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3477                                                 EPOLLRDNORM | EPOLLRDBAND);
3478         sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3479         rcu_read_unlock();
3480 }
3481
3482 static void sock_def_write_space(struct sock *sk)
3483 {
3484         struct socket_wq *wq;
3485
3486         rcu_read_lock();
3487
3488         /* Do not wake up a writer until he can make "significant"
3489          * progress.  --DaveM
3490          */
3491         if (sock_writeable(sk)) {
3492                 wq = rcu_dereference(sk->sk_wq);
3493                 if (skwq_has_sleeper(wq))
3494                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3495                                                 EPOLLWRNORM | EPOLLWRBAND);
3496
3497                 /* Should agree with poll, otherwise some programs break */
3498                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3499         }
3500
3501         rcu_read_unlock();
3502 }
3503
3504 /* An optimised version of sock_def_write_space(), should only be called
3505  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3506  * ->sk_wmem_alloc.
3507  */
3508 static void sock_def_write_space_wfree(struct sock *sk)
3509 {
3510         /* Do not wake up a writer until he can make "significant"
3511          * progress.  --DaveM
3512          */
3513         if (sock_writeable(sk)) {
3514                 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3515
3516                 /* rely on refcount_sub from sock_wfree() */
3517                 smp_mb__after_atomic();
3518                 if (wq && waitqueue_active(&wq->wait))
3519                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3520                                                 EPOLLWRNORM | EPOLLWRBAND);
3521
3522                 /* Should agree with poll, otherwise some programs break */
3523                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3524         }
3525 }
3526
3527 static void sock_def_destruct(struct sock *sk)
3528 {
3529 }
3530
3531 void sk_send_sigurg(struct sock *sk)
3532 {
3533         if (sk->sk_socket && sk->sk_socket->file)
3534                 if (send_sigurg(sk->sk_socket->file))
3535                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3536 }
3537 EXPORT_SYMBOL(sk_send_sigurg);
3538
3539 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3540                     unsigned long expires)
3541 {
3542         if (!mod_timer(timer, expires))
3543                 sock_hold(sk);
3544 }
3545 EXPORT_SYMBOL(sk_reset_timer);
3546
3547 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3548 {
3549         if (del_timer(timer))
3550                 __sock_put(sk);
3551 }
3552 EXPORT_SYMBOL(sk_stop_timer);
3553
3554 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3555 {
3556         if (del_timer_sync(timer))
3557                 __sock_put(sk);
3558 }
3559 EXPORT_SYMBOL(sk_stop_timer_sync);
3560
3561 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3562 {
3563         sk_init_common(sk);
3564         sk->sk_send_head        =       NULL;
3565
3566         timer_setup(&sk->sk_timer, NULL, 0);
3567
3568         sk->sk_allocation       =       GFP_KERNEL;
3569         sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
3570         sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
3571         sk->sk_state            =       TCP_CLOSE;
3572         sk->sk_use_task_frag    =       true;
3573         sk_set_socket(sk, sock);
3574
3575         sock_set_flag(sk, SOCK_ZAPPED);
3576
3577         if (sock) {
3578                 sk->sk_type     =       sock->type;
3579                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3580                 sock->sk        =       sk;
3581         } else {
3582                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3583         }
3584         sk->sk_uid      =       uid;
3585
3586         sk->sk_state_change     =       sock_def_wakeup;
3587         sk->sk_data_ready       =       sock_def_readable;
3588         sk->sk_write_space      =       sock_def_write_space;
3589         sk->sk_error_report     =       sock_def_error_report;
3590         sk->sk_destruct         =       sock_def_destruct;
3591
3592         sk->sk_frag.page        =       NULL;
3593         sk->sk_frag.offset      =       0;
3594         sk->sk_peek_off         =       -1;
3595
3596         sk->sk_peer_pid         =       NULL;
3597         sk->sk_peer_cred        =       NULL;
3598         spin_lock_init(&sk->sk_peer_lock);
3599
3600         sk->sk_write_pending    =       0;
3601         sk->sk_rcvlowat         =       1;
3602         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3603         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3604
3605         sk->sk_stamp = SK_DEFAULT_STAMP;
3606 #if BITS_PER_LONG==32
3607         seqlock_init(&sk->sk_stamp_seq);
3608 #endif
3609         atomic_set(&sk->sk_zckey, 0);
3610
3611 #ifdef CONFIG_NET_RX_BUSY_POLL
3612         sk->sk_napi_id          =       0;
3613         sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
3614 #endif
3615
3616         sk->sk_max_pacing_rate = ~0UL;
3617         sk->sk_pacing_rate = ~0UL;
3618         WRITE_ONCE(sk->sk_pacing_shift, 10);
3619         sk->sk_incoming_cpu = -1;
3620
3621         sk_rx_queue_clear(sk);
3622         /*
3623          * Before updating sk_refcnt, we must commit prior changes to memory
3624          * (Documentation/RCU/rculist_nulls.rst for details)
3625          */
3626         smp_wmb();
3627         refcount_set(&sk->sk_refcnt, 1);
3628         atomic_set(&sk->sk_drops, 0);
3629 }
3630 EXPORT_SYMBOL(sock_init_data_uid);
3631
3632 void sock_init_data(struct socket *sock, struct sock *sk)
3633 {
3634         kuid_t uid = sock ?
3635                 SOCK_INODE(sock)->i_uid :
3636                 make_kuid(sock_net(sk)->user_ns, 0);
3637
3638         sock_init_data_uid(sock, sk, uid);
3639 }
3640 EXPORT_SYMBOL(sock_init_data);
3641
3642 void lock_sock_nested(struct sock *sk, int subclass)
3643 {
3644         /* The sk_lock has mutex_lock() semantics here. */
3645         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3646
3647         might_sleep();
3648         spin_lock_bh(&sk->sk_lock.slock);
3649         if (sock_owned_by_user_nocheck(sk))
3650                 __lock_sock(sk);
3651         sk->sk_lock.owned = 1;
3652         spin_unlock_bh(&sk->sk_lock.slock);
3653 }
3654 EXPORT_SYMBOL(lock_sock_nested);
3655
3656 void release_sock(struct sock *sk)
3657 {
3658         spin_lock_bh(&sk->sk_lock.slock);
3659         if (sk->sk_backlog.tail)
3660                 __release_sock(sk);
3661
3662         if (sk->sk_prot->release_cb)
3663                 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3664                                      tcp_release_cb, sk);
3665
3666         sock_release_ownership(sk);
3667         if (waitqueue_active(&sk->sk_lock.wq))
3668                 wake_up(&sk->sk_lock.wq);
3669         spin_unlock_bh(&sk->sk_lock.slock);
3670 }
3671 EXPORT_SYMBOL(release_sock);
3672
3673 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3674 {
3675         might_sleep();
3676         spin_lock_bh(&sk->sk_lock.slock);
3677
3678         if (!sock_owned_by_user_nocheck(sk)) {
3679                 /*
3680                  * Fast path return with bottom halves disabled and
3681                  * sock::sk_lock.slock held.
3682                  *
3683                  * The 'mutex' is not contended and holding
3684                  * sock::sk_lock.slock prevents all other lockers to
3685                  * proceed so the corresponding unlock_sock_fast() can
3686                  * avoid the slow path of release_sock() completely and
3687                  * just release slock.
3688                  *
3689                  * From a semantical POV this is equivalent to 'acquiring'
3690                  * the 'mutex', hence the corresponding lockdep
3691                  * mutex_release() has to happen in the fast path of
3692                  * unlock_sock_fast().
3693                  */
3694                 return false;
3695         }
3696
3697         __lock_sock(sk);
3698         sk->sk_lock.owned = 1;
3699         __acquire(&sk->sk_lock.slock);
3700         spin_unlock_bh(&sk->sk_lock.slock);
3701         return true;
3702 }
3703 EXPORT_SYMBOL(__lock_sock_fast);
3704
3705 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3706                    bool timeval, bool time32)
3707 {
3708         struct sock *sk = sock->sk;
3709         struct timespec64 ts;
3710
3711         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3712         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3713         if (ts.tv_sec == -1)
3714                 return -ENOENT;
3715         if (ts.tv_sec == 0) {
3716                 ktime_t kt = ktime_get_real();
3717                 sock_write_timestamp(sk, kt);
3718                 ts = ktime_to_timespec64(kt);
3719         }
3720
3721         if (timeval)
3722                 ts.tv_nsec /= 1000;
3723
3724 #ifdef CONFIG_COMPAT_32BIT_TIME
3725         if (time32)
3726                 return put_old_timespec32(&ts, userstamp);
3727 #endif
3728 #ifdef CONFIG_SPARC64
3729         /* beware of padding in sparc64 timeval */
3730         if (timeval && !in_compat_syscall()) {
3731                 struct __kernel_old_timeval __user tv = {
3732                         .tv_sec = ts.tv_sec,
3733                         .tv_usec = ts.tv_nsec,
3734                 };
3735                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3736                         return -EFAULT;
3737                 return 0;
3738         }
3739 #endif
3740         return put_timespec64(&ts, userstamp);
3741 }
3742 EXPORT_SYMBOL(sock_gettstamp);
3743
3744 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3745 {
3746         if (!sock_flag(sk, flag)) {
3747                 unsigned long previous_flags = sk->sk_flags;
3748
3749                 sock_set_flag(sk, flag);
3750                 /*
3751                  * we just set one of the two flags which require net
3752                  * time stamping, but time stamping might have been on
3753                  * already because of the other one
3754                  */
3755                 if (sock_needs_netstamp(sk) &&
3756                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3757                         net_enable_timestamp();
3758         }
3759 }
3760
3761 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3762                        int level, int type)
3763 {
3764         struct sock_exterr_skb *serr;
3765         struct sk_buff *skb;
3766         int copied, err;
3767
3768         err = -EAGAIN;
3769         skb = sock_dequeue_err_skb(sk);
3770         if (skb == NULL)
3771                 goto out;
3772
3773         copied = skb->len;
3774         if (copied > len) {
3775                 msg->msg_flags |= MSG_TRUNC;
3776                 copied = len;
3777         }
3778         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3779         if (err)
3780                 goto out_free_skb;
3781
3782         sock_recv_timestamp(msg, sk, skb);
3783
3784         serr = SKB_EXT_ERR(skb);
3785         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3786
3787         msg->msg_flags |= MSG_ERRQUEUE;
3788         err = copied;
3789
3790 out_free_skb:
3791         kfree_skb(skb);
3792 out:
3793         return err;
3794 }
3795 EXPORT_SYMBOL(sock_recv_errqueue);
3796
3797 /*
3798  *      Get a socket option on an socket.
3799  *
3800  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3801  *      asynchronous errors should be reported by getsockopt. We assume
3802  *      this means if you specify SO_ERROR (otherwise what is the point of it).
3803  */
3804 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3805                            char __user *optval, int __user *optlen)
3806 {
3807         struct sock *sk = sock->sk;
3808
3809         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3810         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3811 }
3812 EXPORT_SYMBOL(sock_common_getsockopt);
3813
3814 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3815                         int flags)
3816 {
3817         struct sock *sk = sock->sk;
3818         int addr_len = 0;
3819         int err;
3820
3821         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3822         if (err >= 0)
3823                 msg->msg_namelen = addr_len;
3824         return err;
3825 }
3826 EXPORT_SYMBOL(sock_common_recvmsg);
3827
3828 /*
3829  *      Set socket options on an inet socket.
3830  */
3831 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3832                            sockptr_t optval, unsigned int optlen)
3833 {
3834         struct sock *sk = sock->sk;
3835
3836         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3837         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3838 }
3839 EXPORT_SYMBOL(sock_common_setsockopt);
3840
3841 void sk_common_release(struct sock *sk)
3842 {
3843         if (sk->sk_prot->destroy)
3844                 sk->sk_prot->destroy(sk);
3845
3846         /*
3847          * Observation: when sk_common_release is called, processes have
3848          * no access to socket. But net still has.
3849          * Step one, detach it from networking:
3850          *
3851          * A. Remove from hash tables.
3852          */
3853
3854         sk->sk_prot->unhash(sk);
3855
3856         /*
3857          * In this point socket cannot receive new packets, but it is possible
3858          * that some packets are in flight because some CPU runs receiver and
3859          * did hash table lookup before we unhashed socket. They will achieve
3860          * receive queue and will be purged by socket destructor.
3861          *
3862          * Also we still have packets pending on receive queue and probably,
3863          * our own packets waiting in device queues. sock_destroy will drain
3864          * receive queue, but transmitted packets will delay socket destruction
3865          * until the last reference will be released.
3866          */
3867
3868         sock_orphan(sk);
3869
3870         xfrm_sk_free_policy(sk);
3871
3872         sock_put(sk);
3873 }
3874 EXPORT_SYMBOL(sk_common_release);
3875
3876 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3877 {
3878         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3879
3880         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3881         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3882         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3883         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3884         mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3885         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3886         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3887         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3888         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3889 }
3890
3891 #ifdef CONFIG_PROC_FS
3892 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3893
3894 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3895 {
3896         int cpu, idx = prot->inuse_idx;
3897         int res = 0;
3898
3899         for_each_possible_cpu(cpu)
3900                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3901
3902         return res >= 0 ? res : 0;
3903 }
3904 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3905
3906 int sock_inuse_get(struct net *net)
3907 {
3908         int cpu, res = 0;
3909
3910         for_each_possible_cpu(cpu)
3911                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3912
3913         return res;
3914 }
3915
3916 EXPORT_SYMBOL_GPL(sock_inuse_get);
3917
3918 static int __net_init sock_inuse_init_net(struct net *net)
3919 {
3920         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3921         if (net->core.prot_inuse == NULL)
3922                 return -ENOMEM;
3923         return 0;
3924 }
3925
3926 static void __net_exit sock_inuse_exit_net(struct net *net)
3927 {
3928         free_percpu(net->core.prot_inuse);
3929 }
3930
3931 static struct pernet_operations net_inuse_ops = {
3932         .init = sock_inuse_init_net,
3933         .exit = sock_inuse_exit_net,
3934 };
3935
3936 static __init int net_inuse_init(void)
3937 {
3938         if (register_pernet_subsys(&net_inuse_ops))
3939                 panic("Cannot initialize net inuse counters");
3940
3941         return 0;
3942 }
3943
3944 core_initcall(net_inuse_init);
3945
3946 static int assign_proto_idx(struct proto *prot)
3947 {
3948         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3949
3950         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3951                 pr_err("PROTO_INUSE_NR exhausted\n");
3952                 return -ENOSPC;
3953         }
3954
3955         set_bit(prot->inuse_idx, proto_inuse_idx);
3956         return 0;
3957 }
3958
3959 static void release_proto_idx(struct proto *prot)
3960 {
3961         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3962                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3963 }
3964 #else
3965 static inline int assign_proto_idx(struct proto *prot)
3966 {
3967         return 0;
3968 }
3969
3970 static inline void release_proto_idx(struct proto *prot)
3971 {
3972 }
3973
3974 #endif
3975
3976 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3977 {
3978         if (!twsk_prot)
3979                 return;
3980         kfree(twsk_prot->twsk_slab_name);
3981         twsk_prot->twsk_slab_name = NULL;
3982         kmem_cache_destroy(twsk_prot->twsk_slab);
3983         twsk_prot->twsk_slab = NULL;
3984 }
3985
3986 static int tw_prot_init(const struct proto *prot)
3987 {
3988         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3989
3990         if (!twsk_prot)
3991                 return 0;
3992
3993         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3994                                               prot->name);
3995         if (!twsk_prot->twsk_slab_name)
3996                 return -ENOMEM;
3997
3998         twsk_prot->twsk_slab =
3999                 kmem_cache_create(twsk_prot->twsk_slab_name,
4000                                   twsk_prot->twsk_obj_size, 0,
4001                                   SLAB_ACCOUNT | prot->slab_flags,
4002                                   NULL);
4003         if (!twsk_prot->twsk_slab) {
4004                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4005                         prot->name);
4006                 return -ENOMEM;
4007         }
4008
4009         return 0;
4010 }
4011
4012 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4013 {
4014         if (!rsk_prot)
4015                 return;
4016         kfree(rsk_prot->slab_name);
4017         rsk_prot->slab_name = NULL;
4018         kmem_cache_destroy(rsk_prot->slab);
4019         rsk_prot->slab = NULL;
4020 }
4021
4022 static int req_prot_init(const struct proto *prot)
4023 {
4024         struct request_sock_ops *rsk_prot = prot->rsk_prot;
4025
4026         if (!rsk_prot)
4027                 return 0;
4028
4029         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4030                                         prot->name);
4031         if (!rsk_prot->slab_name)
4032                 return -ENOMEM;
4033
4034         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4035                                            rsk_prot->obj_size, 0,
4036                                            SLAB_ACCOUNT | prot->slab_flags,
4037                                            NULL);
4038
4039         if (!rsk_prot->slab) {
4040                 pr_crit("%s: Can't create request sock SLAB cache!\n",
4041                         prot->name);
4042                 return -ENOMEM;
4043         }
4044         return 0;
4045 }
4046
4047 int proto_register(struct proto *prot, int alloc_slab)
4048 {
4049         int ret = -ENOBUFS;
4050
4051         if (prot->memory_allocated && !prot->sysctl_mem) {
4052                 pr_err("%s: missing sysctl_mem\n", prot->name);
4053                 return -EINVAL;
4054         }
4055         if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4056                 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4057                 return -EINVAL;
4058         }
4059         if (alloc_slab) {
4060                 prot->slab = kmem_cache_create_usercopy(prot->name,
4061                                         prot->obj_size, 0,
4062                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4063                                         prot->slab_flags,
4064                                         prot->useroffset, prot->usersize,
4065                                         NULL);
4066
4067                 if (prot->slab == NULL) {
4068                         pr_crit("%s: Can't create sock SLAB cache!\n",
4069                                 prot->name);
4070                         goto out;
4071                 }
4072
4073                 if (req_prot_init(prot))
4074                         goto out_free_request_sock_slab;
4075
4076                 if (tw_prot_init(prot))
4077                         goto out_free_timewait_sock_slab;
4078         }
4079
4080         mutex_lock(&proto_list_mutex);
4081         ret = assign_proto_idx(prot);
4082         if (ret) {
4083                 mutex_unlock(&proto_list_mutex);
4084                 goto out_free_timewait_sock_slab;
4085         }
4086         list_add(&prot->node, &proto_list);
4087         mutex_unlock(&proto_list_mutex);
4088         return ret;
4089
4090 out_free_timewait_sock_slab:
4091         if (alloc_slab)
4092                 tw_prot_cleanup(prot->twsk_prot);
4093 out_free_request_sock_slab:
4094         if (alloc_slab) {
4095                 req_prot_cleanup(prot->rsk_prot);
4096
4097                 kmem_cache_destroy(prot->slab);
4098                 prot->slab = NULL;
4099         }
4100 out:
4101         return ret;
4102 }
4103 EXPORT_SYMBOL(proto_register);
4104
4105 void proto_unregister(struct proto *prot)
4106 {
4107         mutex_lock(&proto_list_mutex);
4108         release_proto_idx(prot);
4109         list_del(&prot->node);
4110         mutex_unlock(&proto_list_mutex);
4111
4112         kmem_cache_destroy(prot->slab);
4113         prot->slab = NULL;
4114
4115         req_prot_cleanup(prot->rsk_prot);
4116         tw_prot_cleanup(prot->twsk_prot);
4117 }
4118 EXPORT_SYMBOL(proto_unregister);
4119
4120 int sock_load_diag_module(int family, int protocol)
4121 {
4122         if (!protocol) {
4123                 if (!sock_is_registered(family))
4124                         return -ENOENT;
4125
4126                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4127                                       NETLINK_SOCK_DIAG, family);
4128         }
4129
4130 #ifdef CONFIG_INET
4131         if (family == AF_INET &&
4132             protocol != IPPROTO_RAW &&
4133             protocol < MAX_INET_PROTOS &&
4134             !rcu_access_pointer(inet_protos[protocol]))
4135                 return -ENOENT;
4136 #endif
4137
4138         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4139                               NETLINK_SOCK_DIAG, family, protocol);
4140 }
4141 EXPORT_SYMBOL(sock_load_diag_module);
4142
4143 #ifdef CONFIG_PROC_FS
4144 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4145         __acquires(proto_list_mutex)
4146 {
4147         mutex_lock(&proto_list_mutex);
4148         return seq_list_start_head(&proto_list, *pos);
4149 }
4150
4151 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4152 {
4153         return seq_list_next(v, &proto_list, pos);
4154 }
4155
4156 static void proto_seq_stop(struct seq_file *seq, void *v)
4157         __releases(proto_list_mutex)
4158 {
4159         mutex_unlock(&proto_list_mutex);
4160 }
4161
4162 static char proto_method_implemented(const void *method)
4163 {
4164         return method == NULL ? 'n' : 'y';
4165 }
4166 static long sock_prot_memory_allocated(struct proto *proto)
4167 {
4168         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4169 }
4170
4171 static const char *sock_prot_memory_pressure(struct proto *proto)
4172 {
4173         return proto->memory_pressure != NULL ?
4174         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4175 }
4176
4177 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4178 {
4179
4180         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4181                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4182                    proto->name,
4183                    proto->obj_size,
4184                    sock_prot_inuse_get(seq_file_net(seq), proto),
4185                    sock_prot_memory_allocated(proto),
4186                    sock_prot_memory_pressure(proto),
4187                    proto->max_header,
4188                    proto->slab == NULL ? "no" : "yes",
4189                    module_name(proto->owner),
4190                    proto_method_implemented(proto->close),
4191                    proto_method_implemented(proto->connect),
4192                    proto_method_implemented(proto->disconnect),
4193                    proto_method_implemented(proto->accept),
4194                    proto_method_implemented(proto->ioctl),
4195                    proto_method_implemented(proto->init),
4196                    proto_method_implemented(proto->destroy),
4197                    proto_method_implemented(proto->shutdown),
4198                    proto_method_implemented(proto->setsockopt),
4199                    proto_method_implemented(proto->getsockopt),
4200                    proto_method_implemented(proto->sendmsg),
4201                    proto_method_implemented(proto->recvmsg),
4202                    proto_method_implemented(proto->bind),
4203                    proto_method_implemented(proto->backlog_rcv),
4204                    proto_method_implemented(proto->hash),
4205                    proto_method_implemented(proto->unhash),
4206                    proto_method_implemented(proto->get_port),
4207                    proto_method_implemented(proto->enter_memory_pressure));
4208 }
4209
4210 static int proto_seq_show(struct seq_file *seq, void *v)
4211 {
4212         if (v == &proto_list)
4213                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4214                            "protocol",
4215                            "size",
4216                            "sockets",
4217                            "memory",
4218                            "press",
4219                            "maxhdr",
4220                            "slab",
4221                            "module",
4222                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4223         else
4224                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4225         return 0;
4226 }
4227
4228 static const struct seq_operations proto_seq_ops = {
4229         .start  = proto_seq_start,
4230         .next   = proto_seq_next,
4231         .stop   = proto_seq_stop,
4232         .show   = proto_seq_show,
4233 };
4234
4235 static __net_init int proto_init_net(struct net *net)
4236 {
4237         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4238                         sizeof(struct seq_net_private)))
4239                 return -ENOMEM;
4240
4241         return 0;
4242 }
4243
4244 static __net_exit void proto_exit_net(struct net *net)
4245 {
4246         remove_proc_entry("protocols", net->proc_net);
4247 }
4248
4249
4250 static __net_initdata struct pernet_operations proto_net_ops = {
4251         .init = proto_init_net,
4252         .exit = proto_exit_net,
4253 };
4254
4255 static int __init proto_init(void)
4256 {
4257         return register_pernet_subsys(&proto_net_ops);
4258 }
4259
4260 subsys_initcall(proto_init);
4261
4262 #endif /* PROC_FS */
4263
4264 #ifdef CONFIG_NET_RX_BUSY_POLL
4265 bool sk_busy_loop_end(void *p, unsigned long start_time)
4266 {
4267         struct sock *sk = p;
4268
4269         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4270                 return true;
4271
4272         if (sk_is_udp(sk) &&
4273             !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4274                 return true;
4275
4276         return sk_busy_loop_timeout(sk, start_time);
4277 }
4278 EXPORT_SYMBOL(sk_busy_loop_end);
4279 #endif /* CONFIG_NET_RX_BUSY_POLL */
4280
4281 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4282 {
4283         if (!sk->sk_prot->bind_add)
4284                 return -EOPNOTSUPP;
4285         return sk->sk_prot->bind_add(sk, addr, addr_len);
4286 }
4287 EXPORT_SYMBOL(sock_bind_add);
4288
4289 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4290 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4291                      void __user *arg, void *karg, size_t size)
4292 {
4293         int ret;
4294
4295         if (copy_from_user(karg, arg, size))
4296                 return -EFAULT;
4297
4298         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4299         if (ret)
4300                 return ret;
4301
4302         if (copy_to_user(arg, karg, size))
4303                 return -EFAULT;
4304
4305         return 0;
4306 }
4307 EXPORT_SYMBOL(sock_ioctl_inout);
4308
4309 /* This is the most common ioctl prep function, where the result (4 bytes) is
4310  * copied back to userspace if the ioctl() returns successfully. No input is
4311  * copied from userspace as input argument.
4312  */
4313 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4314 {
4315         int ret, karg = 0;
4316
4317         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4318         if (ret)
4319                 return ret;
4320
4321         return put_user(karg, (int __user *)arg);
4322 }
4323
4324 /* A wrapper around sock ioctls, which copies the data from userspace
4325  * (depending on the protocol/ioctl), and copies back the result to userspace.
4326  * The main motivation for this function is to pass kernel memory to the
4327  * protocol ioctl callbacks, instead of userspace memory.
4328  */
4329 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4330 {
4331         int rc = 1;
4332
4333         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4334                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4335         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4336                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4337         else if (sk_is_phonet(sk))
4338                 rc = phonet_sk_ioctl(sk, cmd, arg);
4339
4340         /* If ioctl was processed, returns its value */
4341         if (rc <= 0)
4342                 return rc;
4343
4344         /* Otherwise call the default handler */
4345         return sock_ioctl_out(sk, cmd, arg);
4346 }
4347 EXPORT_SYMBOL(sk_ioctl);
4348
4349 static int __init sock_struct_check(void)
4350 {
4351         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4352         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4353         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4354         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4355         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4356
4357         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4358         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4359         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4360         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4361         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4362         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4363         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4364         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4365         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4366
4367         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4368         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4369         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4370
4371         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4372         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4373         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4374         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4375
4376         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4377         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4378         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4379         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4380         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4381         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4382         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4383         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4384         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4385         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4386         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4387         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4388         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4389         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4390         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4391         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4392
4393         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4394         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4395         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4396         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4397         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4398         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4399         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4400         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4401         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4402         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4403         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4404         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4405         CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4406         return 0;
4407 }
4408
4409 core_initcall(sock_struct_check);