// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 *	Alan Cox	:	Numerous verify_area() problems
 *	Alan Cox	:	Connecting on a connecting socket
 *				now returns an error for tcp.
 *	Alan Cox	:	sock->protocol is set correctly.
 *				and is not sometimes left as 0.
 *	Alan Cox	:	connect handles icmp errors on a
 *				connect properly. Unfortunately there
 *				is a restart syscall nasty there. I
 *				can't match BSD without hacking the C
 *				library. Ideas urgently sought!
 *	Alan Cox	:	Disallow bind() to addresses that are
 *				not ours - especially broadcast ones!!
 *	Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *	Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *				instead they leave that for the DESTROY timer.
 *	Alan Cox	:	Clean up error flag in accept
 *	Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *				was buggy. Put a remove_sock() in the handler
 *				for memory when we hit 0. Also altered the timer
 *				code. The ACK stuff can wait and needs major
 *	Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *				and fixed timer/inet_bh race.
 *	Alan Cox	:	Added zapped flag for TCP
 *	Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *	Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *	Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *	Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *	Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *	Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *	C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *	Alan Cox	:	Fixed connect() taking signals I think.
 *	Alan Cox	:	SO_LINGER supported
 *	Alan Cox	:	Error reporting fixes
 *	Anonymous	:	inet_create tidied up (sk->reuse setting)
 *	Alan Cox	:	inet sockets don't set sk->type!
 *	Alan Cox	:	Split socket option code
 *	Alan Cox	:	Callbacks
 *	Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *	Alex		:	Removed restriction on inet fioctl
 *	Alan Cox	:	Splitting INET from NET core
 *	Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *	Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *	Alan Cox	:	Split IP from generic code
 *	Alan Cox	:	New kfree_skbmem()
 *	Alan Cox	:	Make SO_DEBUG superuser only.
 *	Alan Cox	:	Allow anyone to clear SO_DEBUG
 *	Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *	Alan Cox	:	Allocator for a socket is settable.
 *	Alan Cox	:	SO_ERROR includes soft errors.
 *	Alan Cox	:	Allow NULL arguments on some SO_ opts
 *	Alan Cox	:	Generic socket allocation to make hooks
 *				easier (suggested by Craig Metz).
 *	Michael Pall	:	SO_ERROR returns positive errno again
 *	Steve Whitehouse:	Added default destructor to free
 *				protocol private data.
 *	Steve Whitehouse:	Added various other default routines
 *				common to several socket families.
 *	Chris Evans	:	Call suser() check last on F_SETOWN
 *	Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *	Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *	Andi Kleen	:	Fix write_space callback
 *	Chris Evans	:	Security fixes - signedness again
 *	Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
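
/*
 * Example (illustrative sketch, not taken from an in-tree caller): a
 * protocol-level setsockopt() handler would typically gate a privileged,
 * per-netns option on sk_net_capable() rather than capable(), so the check
 * is made against the socket's own network namespace.  The function name
 * below is hypothetical:
 *
 *	static int demo_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		WRITE_ONCE(sk->sk_priority, val);
 *		return 0;
 *	}
 */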
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
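
/*
 * Example (illustrative sketch, not a copy of an in-tree user): a subsystem
 * doing memory-reclaim-related I/O over a kernel socket (swap over a network
 * block device, for instance) would typically mark the socket for the period
 * it is used for reclaim and clear the flag when that use ends.  The function
 * names are hypothetical:
 *
 *	static void demo_swap_enable(struct socket *sock)
 *	{
 *		sk_set_memalloc(sock->sk);	// may dip into emergency reserves
 *	}
 *
 *	static void demo_swap_disable(struct socket *sock)
 *	{
 *		sk_clear_memalloc(sock->sk);	// reclaim reserves, obey rmem limits
 *	}
 */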
325 int __sk_backlog_rcv(struct sock
*sk
, struct sk_buff
*skb
)
328 unsigned int noreclaim_flag
;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk
, SOCK_MEMALLOC
));
333 noreclaim_flag
= memalloc_noreclaim_save();
334 ret
= INDIRECT_CALL_INET(sk
->sk_backlog_rcv
,
338 memalloc_noreclaim_restore(noreclaim_flag
);
342 EXPORT_SYMBOL(__sk_backlog_rcv
);
344 void sk_error_report(struct sock
*sk
)
346 sk
->sk_error_report(sk
);
348 switch (sk
->sk_family
) {
352 trace_inet_sk_error_report(sk
);
358 EXPORT_SYMBOL(sk_error_report
);
360 int sock_get_timeout(long timeo
, void *optval
, bool old_timeval
)
362 struct __kernel_sock_timeval tv
;
364 if (timeo
== MAX_SCHEDULE_TIMEOUT
) {
368 tv
.tv_sec
= timeo
/ HZ
;
369 tv
.tv_usec
= ((timeo
% HZ
) * USEC_PER_SEC
) / HZ
;
372 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
373 struct old_timeval32 tv32
= { tv
.tv_sec
, tv
.tv_usec
};
374 *(struct old_timeval32
*)optval
= tv32
;
379 struct __kernel_old_timeval old_tv
;
380 old_tv
.tv_sec
= tv
.tv_sec
;
381 old_tv
.tv_usec
= tv
.tv_usec
;
382 *(struct __kernel_old_timeval
*)optval
= old_tv
;
383 return sizeof(old_tv
);
386 *(struct __kernel_sock_timeval
*)optval
= tv
;
389 EXPORT_SYMBOL(sock_get_timeout
);
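
/*
 * Example (userspace view, illustrative): sock_get_timeout() backs
 * getsockopt(SO_RCVTIMEO)/getsockopt(SO_SNDTIMEO); the "_OLD" variants
 * correspond to the classic struct timeval layout:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *		printf("rcv timeout: %ld.%06ld s\n",
 *		       (long)tv.tv_sec, (long)tv.tv_usec);
 */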
391 int sock_copy_user_timeval(struct __kernel_sock_timeval
*tv
,
392 sockptr_t optval
, int optlen
, bool old_timeval
)
394 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
395 struct old_timeval32 tv32
;
397 if (optlen
< sizeof(tv32
))
400 if (copy_from_sockptr(&tv32
, optval
, sizeof(tv32
)))
402 tv
->tv_sec
= tv32
.tv_sec
;
403 tv
->tv_usec
= tv32
.tv_usec
;
404 } else if (old_timeval
) {
405 struct __kernel_old_timeval old_tv
;
407 if (optlen
< sizeof(old_tv
))
409 if (copy_from_sockptr(&old_tv
, optval
, sizeof(old_tv
)))
411 tv
->tv_sec
= old_tv
.tv_sec
;
412 tv
->tv_usec
= old_tv
.tv_usec
;
414 if (optlen
< sizeof(*tv
))
416 if (copy_from_sockptr(tv
, optval
, sizeof(*tv
)))
422 EXPORT_SYMBOL(sock_copy_user_timeval
);
424 static int sock_set_timeout(long *timeo_p
, sockptr_t optval
, int optlen
,
427 struct __kernel_sock_timeval tv
;
428 int err
= sock_copy_user_timeval(&tv
, optval
, optlen
, old_timeval
);
434 if (tv
.tv_usec
< 0 || tv
.tv_usec
>= USEC_PER_SEC
)
438 static int warned __read_mostly
;
440 WRITE_ONCE(*timeo_p
, 0);
441 if (warned
< 10 && net_ratelimit()) {
443 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 __func__
, current
->comm
, task_pid_nr(current
));
448 val
= MAX_SCHEDULE_TIMEOUT
;
449 if ((tv
.tv_sec
|| tv
.tv_usec
) &&
450 (tv
.tv_sec
< (MAX_SCHEDULE_TIMEOUT
/ HZ
- 1)))
451 val
= tv
.tv_sec
* HZ
+ DIV_ROUND_UP((unsigned long)tv
.tv_usec
,
453 WRITE_ONCE(*timeo_p
, val
);
457 static bool sock_needs_netstamp(const struct sock
*sk
)
459 switch (sk
->sk_family
) {
468 static void sock_disable_timestamp(struct sock
*sk
, unsigned long flags
)
470 if (sk
->sk_flags
& flags
) {
471 sk
->sk_flags
&= ~flags
;
472 if (sock_needs_netstamp(sk
) &&
473 !(sk
->sk_flags
& SK_FLAGS_TIMESTAMP
))
474 net_disable_timestamp();
479 int __sock_queue_rcv_skb(struct sock
*sk
, struct sk_buff
*skb
)
482 struct sk_buff_head
*list
= &sk
->sk_receive_queue
;
484 if (atomic_read(&sk
->sk_rmem_alloc
) >= READ_ONCE(sk
->sk_rcvbuf
)) {
485 atomic_inc(&sk
->sk_drops
);
486 trace_sock_rcvqueue_full(sk
, skb
);
490 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
)) {
491 atomic_inc(&sk
->sk_drops
);
496 skb_set_owner_r(skb
, sk
);
498 /* we escape from rcu protected region, make sure we dont leak
503 spin_lock_irqsave(&list
->lock
, flags
);
504 sock_skb_set_dropcount(sk
, skb
);
505 __skb_queue_tail(list
, skb
);
506 spin_unlock_irqrestore(&list
->lock
, flags
);
508 if (!sock_flag(sk
, SOCK_DEAD
))
509 sk
->sk_data_ready(sk
);
512 EXPORT_SYMBOL(__sock_queue_rcv_skb
);
514 int sock_queue_rcv_skb_reason(struct sock
*sk
, struct sk_buff
*skb
,
515 enum skb_drop_reason
*reason
)
517 enum skb_drop_reason drop_reason
;
520 err
= sk_filter(sk
, skb
);
522 drop_reason
= SKB_DROP_REASON_SOCKET_FILTER
;
525 err
= __sock_queue_rcv_skb(sk
, skb
);
528 drop_reason
= SKB_DROP_REASON_SOCKET_RCVBUFF
;
531 drop_reason
= SKB_DROP_REASON_PROTO_MEM
;
534 drop_reason
= SKB_NOT_DROPPED_YET
;
539 *reason
= drop_reason
;
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason
);
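
/*
 * Example (illustrative sketch): a protocol receive handler would typically
 * let sock_queue_rcv_skb_reason() classify the failure and feed the reason
 * to kfree_skb_reason() so that drop monitoring stays meaningful.  The
 * function name below is hypothetical:
 *
 *	static int demo_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */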
544 int __sk_receive_skb(struct sock
*sk
, struct sk_buff
*skb
,
545 const int nested
, unsigned int trim_cap
, bool refcounted
)
547 int rc
= NET_RX_SUCCESS
;
549 if (sk_filter_trim_cap(sk
, skb
, trim_cap
))
550 goto discard_and_relse
;
554 if (sk_rcvqueues_full(sk
, READ_ONCE(sk
->sk_rcvbuf
))) {
555 atomic_inc(&sk
->sk_drops
);
556 goto discard_and_relse
;
559 bh_lock_sock_nested(sk
);
562 if (!sock_owned_by_user(sk
)) {
564 * trylock + unlock semantics:
566 mutex_acquire(&sk
->sk_lock
.dep_map
, 0, 1, _RET_IP_
);
568 rc
= sk_backlog_rcv(sk
, skb
);
570 mutex_release(&sk
->sk_lock
.dep_map
, _RET_IP_
);
571 } else if (sk_add_backlog(sk
, skb
, READ_ONCE(sk
->sk_rcvbuf
))) {
573 atomic_inc(&sk
->sk_drops
);
574 goto discard_and_relse
;
586 EXPORT_SYMBOL(__sk_receive_skb
);
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ip6_dst_check(struct dst_entry
*,
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ipv4_dst_check(struct dst_entry
*,
592 struct dst_entry
*__sk_dst_check(struct sock
*sk
, u32 cookie
)
594 struct dst_entry
*dst
= __sk_dst_get(sk
);
596 if (dst
&& dst
->obsolete
&&
597 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
598 dst
, cookie
) == NULL
) {
599 sk_tx_queue_clear(sk
);
600 WRITE_ONCE(sk
->sk_dst_pending_confirm
, 0);
601 RCU_INIT_POINTER(sk
->sk_dst_cache
, NULL
);
608 EXPORT_SYMBOL(__sk_dst_check
);
610 struct dst_entry
*sk_dst_check(struct sock
*sk
, u32 cookie
)
612 struct dst_entry
*dst
= sk_dst_get(sk
);
614 if (dst
&& dst
->obsolete
&&
615 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
616 dst
, cookie
) == NULL
) {
624 EXPORT_SYMBOL(sk_dst_check
);
626 static int sock_bindtoindex_locked(struct sock
*sk
, int ifindex
)
628 int ret
= -ENOPROTOOPT
;
629 #ifdef CONFIG_NETDEVICES
630 struct net
*net
= sock_net(sk
);
634 if (sk
->sk_bound_dev_if
&& !ns_capable(net
->user_ns
, CAP_NET_RAW
))
641 /* Paired with all READ_ONCE() done locklessly. */
642 WRITE_ONCE(sk
->sk_bound_dev_if
, ifindex
);
644 if (sk
->sk_prot
->rehash
)
645 sk
->sk_prot
->rehash(sk
);
656 int sock_bindtoindex(struct sock
*sk
, int ifindex
, bool lock_sk
)
662 ret
= sock_bindtoindex_locked(sk
, ifindex
);
668 EXPORT_SYMBOL(sock_bindtoindex
);
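
/*
 * Example (userspace view, illustrative): SO_BINDTOIFINDEX ends up in
 * sock_bindtoindex_locked() above, just like SO_BINDTODEVICE does once the
 * interface name has been resolved to an ifindex.  "eth0" is a placeholder:
 *
 *	int ifindex = if_nametoindex("eth0");
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX,
 *		       &ifindex, sizeof(ifindex)) < 0)
 *		perror("setsockopt(SO_BINDTOIFINDEX)");
 */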
670 static int sock_setbindtodevice(struct sock
*sk
, sockptr_t optval
, int optlen
)
672 int ret
= -ENOPROTOOPT
;
673 #ifdef CONFIG_NETDEVICES
674 struct net
*net
= sock_net(sk
);
675 char devname
[IFNAMSIZ
];
682 /* Bind this socket to a particular device like "eth0",
683 * as specified in the passed interface name. If the
684 * name is "" or the option length is zero the socket
687 if (optlen
> IFNAMSIZ
- 1)
688 optlen
= IFNAMSIZ
- 1;
689 memset(devname
, 0, sizeof(devname
));
692 if (copy_from_sockptr(devname
, optval
, optlen
))
696 if (devname
[0] != '\0') {
697 struct net_device
*dev
;
700 dev
= dev_get_by_name_rcu(net
, devname
);
702 index
= dev
->ifindex
;
709 sockopt_lock_sock(sk
);
710 ret
= sock_bindtoindex_locked(sk
, index
);
711 sockopt_release_sock(sk
);
718 static int sock_getbindtodevice(struct sock
*sk
, sockptr_t optval
,
719 sockptr_t optlen
, int len
)
721 int ret
= -ENOPROTOOPT
;
722 #ifdef CONFIG_NETDEVICES
723 int bound_dev_if
= READ_ONCE(sk
->sk_bound_dev_if
);
724 struct net
*net
= sock_net(sk
);
725 char devname
[IFNAMSIZ
];
727 if (bound_dev_if
== 0) {
736 ret
= netdev_get_name(net
, devname
, bound_dev_if
);
740 len
= strlen(devname
) + 1;
743 if (copy_to_sockptr(optval
, devname
, len
))
748 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
759 bool sk_mc_loop(const struct sock
*sk
)
761 if (dev_recursion_level())
765 /* IPV6_ADDRFORM can change sk->sk_family under us. */
766 switch (READ_ONCE(sk
->sk_family
)) {
768 return inet_test_bit(MC_LOOP
, sk
);
769 #if IS_ENABLED(CONFIG_IPV6)
771 return inet6_test_bit(MC6_LOOP
, sk
);
777 EXPORT_SYMBOL(sk_mc_loop
);
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);
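
/*
 * Example (illustrative sketch): kernel users typically call these helpers
 * directly when setting up an internal socket, instead of going through
 * sock_setsockopt() with user pointers.  The function name is hypothetical:
 *
 *	static void demo_setup_kernel_sock(struct socket *sock)
 *	{
 *		sock_set_reuseaddr(sock->sk);
 *		sock_no_linger(sock->sk);
 *		sock_set_sndtimeo(sock->sk, 5);	// 5 second send timeout
 *	}
 */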
821 static void __sock_set_timestamps(struct sock
*sk
, bool val
, bool new, bool ns
)
823 sock_valbool_flag(sk
, SOCK_RCVTSTAMP
, val
);
824 sock_valbool_flag(sk
, SOCK_RCVTSTAMPNS
, val
&& ns
);
826 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, new);
827 sock_enable_timestamp(sk
, SOCK_TIMESTAMP
);
831 void sock_enable_timestamps(struct sock
*sk
)
834 __sock_set_timestamps(sk
, true, false, true);
837 EXPORT_SYMBOL(sock_enable_timestamps
);
839 void sock_set_timestamp(struct sock
*sk
, int optname
, bool valbool
)
842 case SO_TIMESTAMP_OLD
:
843 __sock_set_timestamps(sk
, valbool
, false, false);
845 case SO_TIMESTAMP_NEW
:
846 __sock_set_timestamps(sk
, valbool
, true, false);
848 case SO_TIMESTAMPNS_OLD
:
849 __sock_set_timestamps(sk
, valbool
, false, true);
851 case SO_TIMESTAMPNS_NEW
:
852 __sock_set_timestamps(sk
, valbool
, true, true);
857 static int sock_timestamping_bind_phc(struct sock
*sk
, int phc_index
)
859 struct net
*net
= sock_net(sk
);
860 struct net_device
*dev
= NULL
;
865 if (sk
->sk_bound_dev_if
)
866 dev
= dev_get_by_index(net
, sk
->sk_bound_dev_if
);
869 pr_err("%s: sock not bind to device\n", __func__
);
873 num
= ethtool_get_phc_vclocks(dev
, &vclock_index
);
876 for (i
= 0; i
< num
; i
++) {
877 if (*(vclock_index
+ i
) == phc_index
) {
889 WRITE_ONCE(sk
->sk_bind_phc
, phc_index
);
894 int sock_set_timestamping(struct sock
*sk
, int optname
,
895 struct so_timestamping timestamping
)
897 int val
= timestamping
.flags
;
900 if (val
& ~SOF_TIMESTAMPING_MASK
)
903 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
&&
904 !(val
& SOF_TIMESTAMPING_OPT_ID
))
907 if (val
& SOF_TIMESTAMPING_OPT_ID
&&
908 !(sk
->sk_tsflags
& SOF_TIMESTAMPING_OPT_ID
)) {
910 if ((1 << sk
->sk_state
) &
911 (TCPF_CLOSE
| TCPF_LISTEN
))
913 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
)
914 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->write_seq
);
916 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->snd_una
);
918 atomic_set(&sk
->sk_tskey
, 0);
922 if (val
& SOF_TIMESTAMPING_OPT_STATS
&&
923 !(val
& SOF_TIMESTAMPING_OPT_TSONLY
))
926 if (val
& SOF_TIMESTAMPING_BIND_PHC
) {
927 ret
= sock_timestamping_bind_phc(sk
, timestamping
.bind_phc
);
932 WRITE_ONCE(sk
->sk_tsflags
, val
);
933 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, optname
== SO_TIMESTAMPING_NEW
);
935 if (val
& SOF_TIMESTAMPING_RX_SOFTWARE
)
936 sock_enable_timestamp(sk
,
937 SOCK_TIMESTAMPING_RX_SOFTWARE
);
939 sock_disable_timestamp(sk
,
940 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE
));
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
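
/*
 * Example (userspace view, illustrative): SO_KEEPALIVE is the uapi
 * counterpart of sock_set_keepalive() and takes a plain boolean value:
 *
 *	int one = 1;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)) < 0)
 *		perror("setsockopt(SO_KEEPALIVE)");
 */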
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.  Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
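
/*
 * Example (userspace view, illustrative): because of the doubling described
 * above, the value read back with getsockopt() is twice the requested one,
 * subject to sysctl_rmem_max and SOCK_MIN_RCVBUF clamping:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	// eff is typically 131072 here, i.e. 2 * req
 */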
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
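
/*
 * Example (userspace view, illustrative): SO_MARK requires CAP_NET_RAW or
 * CAP_NET_ADMIN (see the SO_MARK handling in sk_setsockopt() below).  The
 * mark value itself is policy-defined; 0x2a is just a placeholder:
 *
 *	unsigned int mark = 0x2a;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
 *		perror("setsockopt(SO_MARK)");
 */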
999 static void sock_release_reserved_memory(struct sock
*sk
, int bytes
)
1001 /* Round down bytes to multiple of pages */
1002 bytes
= round_down(bytes
, PAGE_SIZE
);
1004 WARN_ON(bytes
> sk
->sk_reserved_mem
);
1005 WRITE_ONCE(sk
->sk_reserved_mem
, sk
->sk_reserved_mem
- bytes
);
1009 static int sock_reserve_memory(struct sock
*sk
, int bytes
)
1015 if (!mem_cgroup_sockets_enabled
|| !sk
->sk_memcg
|| !sk_has_account(sk
))
1021 pages
= sk_mem_pages(bytes
);
1023 /* pre-charge to memcg */
1024 charged
= mem_cgroup_charge_skmem(sk
->sk_memcg
, pages
,
1025 GFP_KERNEL
| __GFP_RETRY_MAYFAIL
);
1029 /* pre-charge to forward_alloc */
1030 sk_memory_allocated_add(sk
, pages
);
1031 allocated
= sk_memory_allocated(sk
);
1032 /* If the system goes into memory pressure with this
1033 * precharge, give up and return error.
1035 if (allocated
> sk_prot_mem_limits(sk
, 1)) {
1036 sk_memory_allocated_sub(sk
, pages
);
1037 mem_cgroup_uncharge_skmem(sk
->sk_memcg
, pages
);
1040 sk_forward_alloc_add(sk
, pages
<< PAGE_SHIFT
);
1042 WRITE_ONCE(sk
->sk_reserved_mem
,
1043 sk
->sk_reserved_mem
+ (pages
<< PAGE_SHIFT
));
1048 #ifdef CONFIG_PAGE_POOL
1050 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1051 * in 1 syscall. The limit exists to limit the amount of memory the kernel
1052 * allocates to copy these tokens, and to prevent looping over the frags for
1055 #define MAX_DONTNEED_TOKENS 128
1056 #define MAX_DONTNEED_FRAGS 1024
1058 static noinline_for_stack
int
1059 sock_devmem_dontneed(struct sock
*sk
, sockptr_t optval
, unsigned int optlen
)
1061 unsigned int num_tokens
, i
, j
, k
, netmem_num
= 0;
1062 struct dmabuf_token
*tokens
;
1063 int ret
= 0, num_frags
= 0;
1064 netmem_ref netmems
[16];
1069 if (optlen
% sizeof(*tokens
) ||
1070 optlen
> sizeof(*tokens
) * MAX_DONTNEED_TOKENS
)
1073 num_tokens
= optlen
/ sizeof(*tokens
);
1074 tokens
= kvmalloc_array(num_tokens
, sizeof(*tokens
), GFP_KERNEL
);
1078 if (copy_from_sockptr(tokens
, optval
, optlen
)) {
1083 xa_lock_bh(&sk
->sk_user_frags
);
1084 for (i
= 0; i
< num_tokens
; i
++) {
1085 for (j
= 0; j
< tokens
[i
].token_count
; j
++) {
1086 if (++num_frags
> MAX_DONTNEED_FRAGS
)
1087 goto frag_limit_reached
;
1089 netmem_ref netmem
= (__force netmem_ref
)__xa_erase(
1090 &sk
->sk_user_frags
, tokens
[i
].token_start
+ j
);
1092 if (!netmem
|| WARN_ON_ONCE(!netmem_is_net_iov(netmem
)))
1095 netmems
[netmem_num
++] = netmem
;
1096 if (netmem_num
== ARRAY_SIZE(netmems
)) {
1097 xa_unlock_bh(&sk
->sk_user_frags
);
1098 for (k
= 0; k
< netmem_num
; k
++)
1099 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1101 xa_lock_bh(&sk
->sk_user_frags
);
1108 xa_unlock_bh(&sk
->sk_user_frags
);
1109 for (k
= 0; k
< netmem_num
; k
++)
1110 WARN_ON_ONCE(!napi_pp_put_page(netmems
[k
]));
1117 void sockopt_lock_sock(struct sock
*sk
)
1119 /* When current->bpf_ctx is set, the setsockopt is called from
1120 * a bpf prog. bpf has ensured the sk lock has been
1121 * acquired before calling setsockopt().
1123 if (has_current_bpf_ctx())
1128 EXPORT_SYMBOL(sockopt_lock_sock
);
1130 void sockopt_release_sock(struct sock
*sk
)
1132 if (has_current_bpf_ctx())
1137 EXPORT_SYMBOL(sockopt_release_sock
);
1139 bool sockopt_ns_capable(struct user_namespace
*ns
, int cap
)
1141 return has_current_bpf_ctx() || ns_capable(ns
, cap
);
1143 EXPORT_SYMBOL(sockopt_ns_capable
);
1145 bool sockopt_capable(int cap
)
1147 return has_current_bpf_ctx() || capable(cap
);
1149 EXPORT_SYMBOL(sockopt_capable
);
1151 static int sockopt_validate_clockid(__kernel_clockid_t value
)
1154 case CLOCK_REALTIME
:
1155 case CLOCK_MONOTONIC
:
1163 * This is meant for all protocols to use and covers goings on
1164 * at the socket level. Everything here is generic.
1167 int sk_setsockopt(struct sock
*sk
, int level
, int optname
,
1168 sockptr_t optval
, unsigned int optlen
)
1170 struct so_timestamping timestamping
;
1171 struct socket
*sock
= sk
->sk_socket
;
1172 struct sock_txtime sk_txtime
;
1179 * Options without arguments
1182 if (optname
== SO_BINDTODEVICE
)
1183 return sock_setbindtodevice(sk
, optval
, optlen
);
1185 if (optlen
< sizeof(int))
1188 if (copy_from_sockptr(&val
, optval
, sizeof(val
)))
1191 valbool
= val
? 1 : 0;
1193 /* handle options which do not require locking the socket. */
1196 if ((val
>= 0 && val
<= 6) ||
1197 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) ||
1198 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1199 sock_set_priority(sk
, val
);
1204 assign_bit(SOCK_PASSSEC
, &sock
->flags
, valbool
);
1207 assign_bit(SOCK_PASSCRED
, &sock
->flags
, valbool
);
1210 assign_bit(SOCK_PASSPIDFD
, &sock
->flags
, valbool
);
1216 return -ENOPROTOOPT
;
1217 #ifdef CONFIG_NET_RX_BUSY_POLL
1221 WRITE_ONCE(sk
->sk_ll_usec
, val
);
1223 case SO_PREFER_BUSY_POLL
:
1224 if (valbool
&& !sockopt_capable(CAP_NET_ADMIN
))
1226 WRITE_ONCE(sk
->sk_prefer_busy_poll
, valbool
);
1228 case SO_BUSY_POLL_BUDGET
:
1229 if (val
> READ_ONCE(sk
->sk_busy_poll_budget
) &&
1230 !sockopt_capable(CAP_NET_ADMIN
))
1232 if (val
< 0 || val
> U16_MAX
)
1234 WRITE_ONCE(sk
->sk_busy_poll_budget
, val
);
1237 case SO_MAX_PACING_RATE
:
1239 unsigned long ulval
= (val
== ~0U) ? ~0UL : (unsigned int)val
;
1240 unsigned long pacing_rate
;
1242 if (sizeof(ulval
) != sizeof(val
) &&
1243 optlen
>= sizeof(ulval
) &&
1244 copy_from_sockptr(&ulval
, optval
, sizeof(ulval
))) {
1248 cmpxchg(&sk
->sk_pacing_status
,
1251 /* Pairs with READ_ONCE() from sk_getsockopt() */
1252 WRITE_ONCE(sk
->sk_max_pacing_rate
, ulval
);
1253 pacing_rate
= READ_ONCE(sk
->sk_pacing_rate
);
1254 if (ulval
< pacing_rate
)
1255 WRITE_ONCE(sk
->sk_pacing_rate
, ulval
);
1259 if (val
< -1 || val
> 1)
1261 if ((u8
)val
== SOCK_TXREHASH_DEFAULT
)
1262 val
= READ_ONCE(sock_net(sk
)->core
.sysctl_txrehash
);
1263 /* Paired with READ_ONCE() in tcp_rtx_synack()
1264 * and sk_getsockopt().
1266 WRITE_ONCE(sk
->sk_txrehash
, (u8
)val
);
1270 int (*set_peek_off
)(struct sock
*sk
, int val
);
1272 set_peek_off
= READ_ONCE(sock
->ops
)->set_peek_off
;
1274 ret
= set_peek_off(sk
, val
);
1279 #ifdef CONFIG_PAGE_POOL
1280 case SO_DEVMEM_DONTNEED
:
1281 return sock_devmem_dontneed(sk
, optval
, optlen
);
1285 sockopt_lock_sock(sk
);
1289 if (val
&& !sockopt_capable(CAP_NET_ADMIN
))
1292 sock_valbool_flag(sk
, SOCK_DBG
, valbool
);
1295 sk
->sk_reuse
= (valbool
? SK_CAN_REUSE
: SK_NO_REUSE
);
1298 sk
->sk_reuseport
= valbool
;
1301 sock_valbool_flag(sk
, SOCK_LOCALROUTE
, valbool
);
1305 sock_valbool_flag(sk
, SOCK_BROADCAST
, valbool
);
1308 /* Don't error on this BSD doesn't and if you think
1309 * about it this is right. Otherwise apps have to
1310 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1311 * are treated in BSD as hints
1313 val
= min_t(u32
, val
, READ_ONCE(sysctl_wmem_max
));
1315 /* Ensure val * 2 fits into an int, to prevent max_t()
1316 * from treating it as a negative value.
1318 val
= min_t(int, val
, INT_MAX
/ 2);
1319 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
1320 WRITE_ONCE(sk
->sk_sndbuf
,
1321 max_t(int, val
* 2, SOCK_MIN_SNDBUF
));
1322 /* Wake up sending tasks if we upped the value. */
1323 sk
->sk_write_space(sk
);
1326 case SO_SNDBUFFORCE
:
1327 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1332 /* No negative values (to prevent underflow, as val will be
1340 /* Don't error on this BSD doesn't and if you think
1341 * about it this is right. Otherwise apps have to
1342 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1343 * are treated in BSD as hints
1345 __sock_set_rcvbuf(sk
, min_t(u32
, val
, READ_ONCE(sysctl_rmem_max
)));
1348 case SO_RCVBUFFORCE
:
1349 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1354 /* No negative values (to prevent underflow, as val will be
1357 __sock_set_rcvbuf(sk
, max(val
, 0));
1361 if (sk
->sk_prot
->keepalive
)
1362 sk
->sk_prot
->keepalive(sk
, valbool
);
1363 sock_valbool_flag(sk
, SOCK_KEEPOPEN
, valbool
);
1367 sock_valbool_flag(sk
, SOCK_URGINLINE
, valbool
);
1371 sk
->sk_no_check_tx
= valbool
;
1375 if (optlen
< sizeof(ling
)) {
1376 ret
= -EINVAL
; /* 1003.1g */
1379 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
))) {
1383 if (!ling
.l_onoff
) {
1384 sock_reset_flag(sk
, SOCK_LINGER
);
1386 unsigned long t_sec
= ling
.l_linger
;
1388 if (t_sec
>= MAX_SCHEDULE_TIMEOUT
/ HZ
)
1389 WRITE_ONCE(sk
->sk_lingertime
, MAX_SCHEDULE_TIMEOUT
);
1391 WRITE_ONCE(sk
->sk_lingertime
, t_sec
* HZ
);
1392 sock_set_flag(sk
, SOCK_LINGER
);
1399 case SO_TIMESTAMP_OLD
:
1400 case SO_TIMESTAMP_NEW
:
1401 case SO_TIMESTAMPNS_OLD
:
1402 case SO_TIMESTAMPNS_NEW
:
1403 sock_set_timestamp(sk
, optname
, valbool
);
1406 case SO_TIMESTAMPING_NEW
:
1407 case SO_TIMESTAMPING_OLD
:
1408 if (optlen
== sizeof(timestamping
)) {
1409 if (copy_from_sockptr(×tamping
, optval
,
1410 sizeof(timestamping
))) {
1415 memset(×tamping
, 0, sizeof(timestamping
));
1416 timestamping
.flags
= val
;
1418 ret
= sock_set_timestamping(sk
, optname
, timestamping
);
1423 int (*set_rcvlowat
)(struct sock
*sk
, int val
) = NULL
;
1428 set_rcvlowat
= READ_ONCE(sock
->ops
)->set_rcvlowat
;
1430 ret
= set_rcvlowat(sk
, val
);
1432 WRITE_ONCE(sk
->sk_rcvlowat
, val
? : 1);
1435 case SO_RCVTIMEO_OLD
:
1436 case SO_RCVTIMEO_NEW
:
1437 ret
= sock_set_timeout(&sk
->sk_rcvtimeo
, optval
,
1438 optlen
, optname
== SO_RCVTIMEO_OLD
);
1441 case SO_SNDTIMEO_OLD
:
1442 case SO_SNDTIMEO_NEW
:
1443 ret
= sock_set_timeout(&sk
->sk_sndtimeo
, optval
,
1444 optlen
, optname
== SO_SNDTIMEO_OLD
);
1447 case SO_ATTACH_FILTER
: {
1448 struct sock_fprog fprog
;
1450 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1452 ret
= sk_attach_filter(&fprog
, sk
);
1457 if (optlen
== sizeof(u32
)) {
1461 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1464 ret
= sk_attach_bpf(ufd
, sk
);
1468 case SO_ATTACH_REUSEPORT_CBPF
: {
1469 struct sock_fprog fprog
;
1471 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1473 ret
= sk_reuseport_attach_filter(&fprog
, sk
);
1476 case SO_ATTACH_REUSEPORT_EBPF
:
1478 if (optlen
== sizeof(u32
)) {
1482 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1485 ret
= sk_reuseport_attach_bpf(ufd
, sk
);
1489 case SO_DETACH_REUSEPORT_BPF
:
1490 ret
= reuseport_detach_prog(sk
);
1493 case SO_DETACH_FILTER
:
1494 ret
= sk_detach_filter(sk
);
1497 case SO_LOCK_FILTER
:
1498 if (sock_flag(sk
, SOCK_FILTER_LOCKED
) && !valbool
)
1501 sock_valbool_flag(sk
, SOCK_FILTER_LOCKED
, valbool
);
1505 if (!sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) &&
1506 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1511 __sock_set_mark(sk
, val
);
1514 sock_valbool_flag(sk
, SOCK_RCVMARK
, valbool
);
1518 sock_valbool_flag(sk
, SOCK_RXQ_OVFL
, valbool
);
1521 case SO_WIFI_STATUS
:
1522 sock_valbool_flag(sk
, SOCK_WIFI_STATUS
, valbool
);
1526 sock_valbool_flag(sk
, SOCK_NOFCS
, valbool
);
1529 case SO_SELECT_ERR_QUEUE
:
1530 sock_valbool_flag(sk
, SOCK_SELECT_ERR_QUEUE
, valbool
);
1534 case SO_INCOMING_CPU
:
1535 reuseport_update_incoming_cpu(sk
, val
);
1540 dst_negative_advice(sk
);
1544 if (sk
->sk_family
== PF_INET
|| sk
->sk_family
== PF_INET6
) {
1545 if (!(sk_is_tcp(sk
) ||
1546 (sk
->sk_type
== SOCK_DGRAM
&&
1547 sk
->sk_protocol
== IPPROTO_UDP
)))
1549 } else if (sk
->sk_family
!= PF_RDS
) {
1553 if (val
< 0 || val
> 1)
1556 sock_valbool_flag(sk
, SOCK_ZEROCOPY
, valbool
);
1561 if (optlen
!= sizeof(struct sock_txtime
)) {
1564 } else if (copy_from_sockptr(&sk_txtime
, optval
,
1565 sizeof(struct sock_txtime
))) {
1568 } else if (sk_txtime
.flags
& ~SOF_TXTIME_FLAGS_MASK
) {
1572 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1573 * scheduler has enough safe guards.
1575 if (sk_txtime
.clockid
!= CLOCK_MONOTONIC
&&
1576 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1581 ret
= sockopt_validate_clockid(sk_txtime
.clockid
);
1585 sock_valbool_flag(sk
, SOCK_TXTIME
, true);
1586 sk
->sk_clockid
= sk_txtime
.clockid
;
1587 sk
->sk_txtime_deadline_mode
=
1588 !!(sk_txtime
.flags
& SOF_TXTIME_DEADLINE_MODE
);
1589 sk
->sk_txtime_report_errors
=
1590 !!(sk_txtime
.flags
& SOF_TXTIME_REPORT_ERRORS
);
1593 case SO_BINDTOIFINDEX
:
1594 ret
= sock_bindtoindex_locked(sk
, val
);
1598 if (val
& ~SOCK_BUF_LOCK_MASK
) {
1602 sk
->sk_userlocks
= val
| (sk
->sk_userlocks
&
1603 ~SOCK_BUF_LOCK_MASK
);
1606 case SO_RESERVE_MEM
:
1615 delta
= val
- sk
->sk_reserved_mem
;
1617 sock_release_reserved_memory(sk
, -delta
);
1619 ret
= sock_reserve_memory(sk
, delta
);
1627 sockopt_release_sock(sk
);
1631 int sock_setsockopt(struct socket
*sock
, int level
, int optname
,
1632 sockptr_t optval
, unsigned int optlen
)
1634 return sk_setsockopt(sock
->sk
, level
, optname
,
1637 EXPORT_SYMBOL(sock_setsockopt
);
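
/*
 * Example (userspace view, illustrative): SO_LINGER, handled in
 * sk_setsockopt() above, takes a struct linger rather than a plain int:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };	// 5 seconds
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
 *		perror("setsockopt(SO_LINGER)");
 */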
1639 static const struct cred
*sk_get_peer_cred(struct sock
*sk
)
1641 const struct cred
*cred
;
1643 spin_lock(&sk
->sk_peer_lock
);
1644 cred
= get_cred(sk
->sk_peer_cred
);
1645 spin_unlock(&sk
->sk_peer_lock
);
1650 static void cred_to_ucred(struct pid
*pid
, const struct cred
*cred
,
1651 struct ucred
*ucred
)
1653 ucred
->pid
= pid_vnr(pid
);
1654 ucred
->uid
= ucred
->gid
= -1;
1656 struct user_namespace
*current_ns
= current_user_ns();
1658 ucred
->uid
= from_kuid_munged(current_ns
, cred
->euid
);
1659 ucred
->gid
= from_kgid_munged(current_ns
, cred
->egid
);
1663 static int groups_to_user(sockptr_t dst
, const struct group_info
*src
)
1665 struct user_namespace
*user_ns
= current_user_ns();
1668 for (i
= 0; i
< src
->ngroups
; i
++) {
1669 gid_t gid
= from_kgid_munged(user_ns
, src
->gid
[i
]);
1671 if (copy_to_sockptr_offset(dst
, i
* sizeof(gid
), &gid
, sizeof(gid
)))
1678 int sk_getsockopt(struct sock
*sk
, int level
, int optname
,
1679 sockptr_t optval
, sockptr_t optlen
)
1681 struct socket
*sock
= sk
->sk_socket
;
1686 unsigned long ulval
;
1688 struct old_timeval32 tm32
;
1689 struct __kernel_old_timeval tm
;
1690 struct __kernel_sock_timeval stm
;
1691 struct sock_txtime txtime
;
1692 struct so_timestamping timestamping
;
1695 int lv
= sizeof(int);
1698 if (copy_from_sockptr(&len
, optlen
, sizeof(int)))
1703 memset(&v
, 0, sizeof(v
));
1707 v
.val
= sock_flag(sk
, SOCK_DBG
);
1711 v
.val
= sock_flag(sk
, SOCK_LOCALROUTE
);
1715 v
.val
= sock_flag(sk
, SOCK_BROADCAST
);
1719 v
.val
= READ_ONCE(sk
->sk_sndbuf
);
1723 v
.val
= READ_ONCE(sk
->sk_rcvbuf
);
1727 v
.val
= sk
->sk_reuse
;
1731 v
.val
= sk
->sk_reuseport
;
1735 v
.val
= sock_flag(sk
, SOCK_KEEPOPEN
);
1739 v
.val
= sk
->sk_type
;
1743 v
.val
= sk
->sk_protocol
;
1747 v
.val
= sk
->sk_family
;
1751 v
.val
= -sock_error(sk
);
1753 v
.val
= xchg(&sk
->sk_err_soft
, 0);
1757 v
.val
= sock_flag(sk
, SOCK_URGINLINE
);
1761 v
.val
= sk
->sk_no_check_tx
;
1765 v
.val
= READ_ONCE(sk
->sk_priority
);
1769 lv
= sizeof(v
.ling
);
1770 v
.ling
.l_onoff
= sock_flag(sk
, SOCK_LINGER
);
1771 v
.ling
.l_linger
= READ_ONCE(sk
->sk_lingertime
) / HZ
;
1777 case SO_TIMESTAMP_OLD
:
1778 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) &&
1779 !sock_flag(sk
, SOCK_TSTAMP_NEW
) &&
1780 !sock_flag(sk
, SOCK_RCVTSTAMPNS
);
1783 case SO_TIMESTAMPNS_OLD
:
1784 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && !sock_flag(sk
, SOCK_TSTAMP_NEW
);
1787 case SO_TIMESTAMP_NEW
:
1788 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1791 case SO_TIMESTAMPNS_NEW
:
1792 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1795 case SO_TIMESTAMPING_OLD
:
1796 case SO_TIMESTAMPING_NEW
:
1797 lv
= sizeof(v
.timestamping
);
1798 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1799 * returning the flags when they were set through the same option.
1800 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1802 if (optname
== SO_TIMESTAMPING_OLD
|| sock_flag(sk
, SOCK_TSTAMP_NEW
)) {
1803 v
.timestamping
.flags
= READ_ONCE(sk
->sk_tsflags
);
1804 v
.timestamping
.bind_phc
= READ_ONCE(sk
->sk_bind_phc
);
1808 case SO_RCVTIMEO_OLD
:
1809 case SO_RCVTIMEO_NEW
:
1810 lv
= sock_get_timeout(READ_ONCE(sk
->sk_rcvtimeo
), &v
,
1811 SO_RCVTIMEO_OLD
== optname
);
1814 case SO_SNDTIMEO_OLD
:
1815 case SO_SNDTIMEO_NEW
:
1816 lv
= sock_get_timeout(READ_ONCE(sk
->sk_sndtimeo
), &v
,
1817 SO_SNDTIMEO_OLD
== optname
);
1821 v
.val
= READ_ONCE(sk
->sk_rcvlowat
);
1829 v
.val
= !!test_bit(SOCK_PASSCRED
, &sock
->flags
);
1833 v
.val
= !!test_bit(SOCK_PASSPIDFD
, &sock
->flags
);
1838 struct ucred peercred
;
1839 if (len
> sizeof(peercred
))
1840 len
= sizeof(peercred
);
1842 spin_lock(&sk
->sk_peer_lock
);
1843 cred_to_ucred(sk
->sk_peer_pid
, sk
->sk_peer_cred
, &peercred
);
1844 spin_unlock(&sk
->sk_peer_lock
);
1846 if (copy_to_sockptr(optval
, &peercred
, len
))
1853 struct pid
*peer_pid
;
1854 struct file
*pidfd_file
= NULL
;
1857 if (len
> sizeof(pidfd
))
1858 len
= sizeof(pidfd
);
1860 spin_lock(&sk
->sk_peer_lock
);
1861 peer_pid
= get_pid(sk
->sk_peer_pid
);
1862 spin_unlock(&sk
->sk_peer_lock
);
1867 pidfd
= pidfd_prepare(peer_pid
, 0, &pidfd_file
);
1872 if (copy_to_sockptr(optval
, &pidfd
, len
) ||
1873 copy_to_sockptr(optlen
, &len
, sizeof(int))) {
1874 put_unused_fd(pidfd
);
1880 fd_install(pidfd
, pidfd_file
);
1886 const struct cred
*cred
;
1889 cred
= sk_get_peer_cred(sk
);
1893 n
= cred
->group_info
->ngroups
;
1894 if (len
< n
* sizeof(gid_t
)) {
1895 len
= n
* sizeof(gid_t
);
1897 return copy_to_sockptr(optlen
, &len
, sizeof(int)) ? -EFAULT
: -ERANGE
;
1899 len
= n
* sizeof(gid_t
);
1901 ret
= groups_to_user(optval
, cred
->group_info
);
1910 struct sockaddr_storage address
;
1912 lv
= READ_ONCE(sock
->ops
)->getname(sock
, (struct sockaddr
*)&address
, 2);
1917 if (copy_to_sockptr(optval
, &address
, len
))
1922 /* Dubious BSD thing... Probably nobody even uses it, but
1923 * the UNIX standard wants it for whatever reason... -DaveM
1926 v
.val
= sk
->sk_state
== TCP_LISTEN
;
1930 v
.val
= !!test_bit(SOCK_PASSSEC
, &sock
->flags
);
1934 return security_socket_getpeersec_stream(sock
,
1935 optval
, optlen
, len
);
1938 v
.val
= READ_ONCE(sk
->sk_mark
);
1942 v
.val
= sock_flag(sk
, SOCK_RCVMARK
);
1946 v
.val
= sock_flag(sk
, SOCK_RXQ_OVFL
);
1949 case SO_WIFI_STATUS
:
1950 v
.val
= sock_flag(sk
, SOCK_WIFI_STATUS
);
1954 if (!READ_ONCE(sock
->ops
)->set_peek_off
)
1957 v
.val
= READ_ONCE(sk
->sk_peek_off
);
1960 v
.val
= sock_flag(sk
, SOCK_NOFCS
);
1963 case SO_BINDTODEVICE
:
1964 return sock_getbindtodevice(sk
, optval
, optlen
, len
);
1967 len
= sk_get_filter(sk
, optval
, len
);
1973 case SO_LOCK_FILTER
:
1974 v
.val
= sock_flag(sk
, SOCK_FILTER_LOCKED
);
1977 case SO_BPF_EXTENSIONS
:
1978 v
.val
= bpf_tell_extensions();
1981 case SO_SELECT_ERR_QUEUE
:
1982 v
.val
= sock_flag(sk
, SOCK_SELECT_ERR_QUEUE
);
1985 #ifdef CONFIG_NET_RX_BUSY_POLL
1987 v
.val
= READ_ONCE(sk
->sk_ll_usec
);
1989 case SO_PREFER_BUSY_POLL
:
1990 v
.val
= READ_ONCE(sk
->sk_prefer_busy_poll
);
1994 case SO_MAX_PACING_RATE
:
1995 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1996 if (sizeof(v
.ulval
) != sizeof(v
.val
) && len
>= sizeof(v
.ulval
)) {
1997 lv
= sizeof(v
.ulval
);
1998 v
.ulval
= READ_ONCE(sk
->sk_max_pacing_rate
);
2001 v
.val
= min_t(unsigned long, ~0U,
2002 READ_ONCE(sk
->sk_max_pacing_rate
));
2006 case SO_INCOMING_CPU
:
2007 v
.val
= READ_ONCE(sk
->sk_incoming_cpu
);
2012 u32 meminfo
[SK_MEMINFO_VARS
];
2014 sk_get_meminfo(sk
, meminfo
);
2016 len
= min_t(unsigned int, len
, sizeof(meminfo
));
2017 if (copy_to_sockptr(optval
, &meminfo
, len
))
2023 #ifdef CONFIG_NET_RX_BUSY_POLL
2024 case SO_INCOMING_NAPI_ID
:
2025 v
.val
= READ_ONCE(sk
->sk_napi_id
);
2027 /* aggregate non-NAPI IDs down to 0 */
2028 if (v
.val
< MIN_NAPI_ID
)
2038 v
.val64
= sock_gen_cookie(sk
);
2042 v
.val
= sock_flag(sk
, SOCK_ZEROCOPY
);
2046 lv
= sizeof(v
.txtime
);
2047 v
.txtime
.clockid
= sk
->sk_clockid
;
2048 v
.txtime
.flags
|= sk
->sk_txtime_deadline_mode
?
2049 SOF_TXTIME_DEADLINE_MODE
: 0;
2050 v
.txtime
.flags
|= sk
->sk_txtime_report_errors
?
2051 SOF_TXTIME_REPORT_ERRORS
: 0;
2054 case SO_BINDTOIFINDEX
:
2055 v
.val
= READ_ONCE(sk
->sk_bound_dev_if
);
2058 case SO_NETNS_COOKIE
:
2062 v
.val64
= sock_net(sk
)->net_cookie
;
2066 v
.val
= sk
->sk_userlocks
& SOCK_BUF_LOCK_MASK
;
2069 case SO_RESERVE_MEM
:
2070 v
.val
= READ_ONCE(sk
->sk_reserved_mem
);
2074 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2075 v
.val
= READ_ONCE(sk
->sk_txrehash
);
2079 /* We implement the SO_SNDLOWAT etc to not be settable
2082 return -ENOPROTOOPT
;
2087 if (copy_to_sockptr(optval
, &v
, len
))
2090 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
2096 * Initialize an sk_lock.
2098 * (We also register the sk_lock with the lock validator.)
2100 static inline void sock_lock_init(struct sock
*sk
)
2102 if (sk
->sk_kern_sock
)
2103 sock_lock_init_class_and_name(
2105 af_family_kern_slock_key_strings
[sk
->sk_family
],
2106 af_family_kern_slock_keys
+ sk
->sk_family
,
2107 af_family_kern_key_strings
[sk
->sk_family
],
2108 af_family_kern_keys
+ sk
->sk_family
);
2110 sock_lock_init_class_and_name(
2112 af_family_slock_key_strings
[sk
->sk_family
],
2113 af_family_slock_keys
+ sk
->sk_family
,
2114 af_family_key_strings
[sk
->sk_family
],
2115 af_family_keys
+ sk
->sk_family
);
2119 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2120 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2121 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2123 static void sock_copy(struct sock
*nsk
, const struct sock
*osk
)
2125 const struct proto
*prot
= READ_ONCE(osk
->sk_prot
);
2126 #ifdef CONFIG_SECURITY_NETWORK
2127 void *sptr
= nsk
->sk_security
;
2130 /* If we move sk_tx_queue_mapping out of the private section,
2131 * we must check if sk_tx_queue_clear() is called after
2132 * sock_copy() in sk_clone_lock().
2134 BUILD_BUG_ON(offsetof(struct sock
, sk_tx_queue_mapping
) <
2135 offsetof(struct sock
, sk_dontcopy_begin
) ||
2136 offsetof(struct sock
, sk_tx_queue_mapping
) >=
2137 offsetof(struct sock
, sk_dontcopy_end
));
2139 memcpy(nsk
, osk
, offsetof(struct sock
, sk_dontcopy_begin
));
2141 unsafe_memcpy(&nsk
->sk_dontcopy_end
, &osk
->sk_dontcopy_end
,
2142 prot
->obj_size
- offsetof(struct sock
, sk_dontcopy_end
),
2143 /* alloc is larger than struct, see sk_prot_alloc() */);
2145 #ifdef CONFIG_SECURITY_NETWORK
2146 nsk
->sk_security
= sptr
;
2147 security_sk_clone(osk
, nsk
);
2151 static struct sock
*sk_prot_alloc(struct proto
*prot
, gfp_t priority
,
2155 struct kmem_cache
*slab
;
2159 sk
= kmem_cache_alloc(slab
, priority
& ~__GFP_ZERO
);
2162 if (want_init_on_alloc(priority
))
2163 sk_prot_clear_nulls(sk
, prot
->obj_size
);
2165 sk
= kmalloc(prot
->obj_size
, priority
);
2168 if (security_sk_alloc(sk
, family
, priority
))
2171 if (!try_module_get(prot
->owner
))
2178 security_sk_free(sk
);
2181 kmem_cache_free(slab
, sk
);
2187 static void sk_prot_free(struct proto
*prot
, struct sock
*sk
)
2189 struct kmem_cache
*slab
;
2190 struct module
*owner
;
2192 owner
= prot
->owner
;
2195 cgroup_sk_free(&sk
->sk_cgrp_data
);
2196 mem_cgroup_sk_free(sk
);
2197 security_sk_free(sk
);
2199 kmem_cache_free(slab
, sk
);
2206 * sk_alloc - All socket objects are allocated here
2207 * @net: the applicable net namespace
2208 * @family: protocol family
2209 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2210 * @prot: struct proto associated with this new sock instance
2211 * @kern: is this to be a kernel socket?
2213 struct sock
*sk_alloc(struct net
*net
, int family
, gfp_t priority
,
2214 struct proto
*prot
, int kern
)
2218 sk
= sk_prot_alloc(prot
, priority
| __GFP_ZERO
, family
);
2220 sk
->sk_family
= family
;
2222 * See comment in struct sock definition to understand
2223 * why we need sk_prot_creator -acme
2225 sk
->sk_prot
= sk
->sk_prot_creator
= prot
;
2226 sk
->sk_kern_sock
= kern
;
2228 sk
->sk_net_refcnt
= kern
? 0 : 1;
2229 if (likely(sk
->sk_net_refcnt
)) {
2230 get_net_track(net
, &sk
->ns_tracker
, priority
);
2231 sock_inuse_add(net
, 1);
2233 __netns_tracker_alloc(net
, &sk
->ns_tracker
,
2237 sock_net_set(sk
, net
);
2238 refcount_set(&sk
->sk_wmem_alloc
, 1);
2240 mem_cgroup_sk_alloc(sk
);
2241 cgroup_sk_alloc(&sk
->sk_cgrp_data
);
2242 sock_update_classid(&sk
->sk_cgrp_data
);
2243 sock_update_netprioidx(&sk
->sk_cgrp_data
);
2244 sk_tx_queue_clear(sk
);
2249 EXPORT_SYMBOL(sk_alloc
);
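
/*
 * Example (illustrative sketch): a protocol's ->create() handler is the
 * typical caller of sk_alloc(), followed by sock_init_data().  The names
 * below are hypothetical and error handling is abbreviated:
 *
 *	static int demo_create(struct net *net, struct socket *sock,
 *			       int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &demo_proto, kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */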
2251 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2252 * grace period. This is the case for UDP sockets and TCP listeners.
2254 static void __sk_destruct(struct rcu_head
*head
)
2256 struct sock
*sk
= container_of(head
, struct sock
, sk_rcu
);
2257 struct sk_filter
*filter
;
2259 if (sk
->sk_destruct
)
2260 sk
->sk_destruct(sk
);
2262 filter
= rcu_dereference_check(sk
->sk_filter
,
2263 refcount_read(&sk
->sk_wmem_alloc
) == 0);
2265 sk_filter_uncharge(sk
, filter
);
2266 RCU_INIT_POINTER(sk
->sk_filter
, NULL
);
2269 sock_disable_timestamp(sk
, SK_FLAGS_TIMESTAMP
);
2271 #ifdef CONFIG_BPF_SYSCALL
2272 bpf_sk_storage_free(sk
);
2275 if (atomic_read(&sk
->sk_omem_alloc
))
2276 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2277 __func__
, atomic_read(&sk
->sk_omem_alloc
));
2279 if (sk
->sk_frag
.page
) {
2280 put_page(sk
->sk_frag
.page
);
2281 sk
->sk_frag
.page
= NULL
;
2284 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2285 put_cred(sk
->sk_peer_cred
);
2286 put_pid(sk
->sk_peer_pid
);
2288 if (likely(sk
->sk_net_refcnt
))
2289 put_net_track(sock_net(sk
), &sk
->ns_tracker
);
2291 __netns_tracker_free(sock_net(sk
), &sk
->ns_tracker
, false);
2293 sk_prot_free(sk
->sk_prot_creator
, sk
);
2296 void sk_destruct(struct sock
*sk
)
2298 bool use_call_rcu
= sock_flag(sk
, SOCK_RCU_FREE
);
2300 if (rcu_access_pointer(sk
->sk_reuseport_cb
)) {
2301 reuseport_detach_sock(sk
);
2302 use_call_rcu
= true;
2306 call_rcu(&sk
->sk_rcu
, __sk_destruct
);
2308 __sk_destruct(&sk
->sk_rcu
);
2311 static void __sk_free(struct sock
*sk
)
2313 if (likely(sk
->sk_net_refcnt
))
2314 sock_inuse_add(sock_net(sk
), -1);
2316 if (unlikely(sk
->sk_net_refcnt
&& sock_diag_has_destroy_listeners(sk
)))
2317 sock_diag_broadcast_destroy(sk
);
2322 void sk_free(struct sock
*sk
)
2325 * We subtract one from sk_wmem_alloc and can know if
2326 * some packets are still in some tx queue.
2327 * If not null, sock_wfree() will call __sk_free(sk) later
2329 if (refcount_dec_and_test(&sk
->sk_wmem_alloc
))
2332 EXPORT_SYMBOL(sk_free
);
2334 static void sk_init_common(struct sock
*sk
)
2336 skb_queue_head_init(&sk
->sk_receive_queue
);
2337 skb_queue_head_init(&sk
->sk_write_queue
);
2338 skb_queue_head_init(&sk
->sk_error_queue
);
2340 rwlock_init(&sk
->sk_callback_lock
);
2341 lockdep_set_class_and_name(&sk
->sk_receive_queue
.lock
,
2342 af_rlock_keys
+ sk
->sk_family
,
2343 af_family_rlock_key_strings
[sk
->sk_family
]);
2344 lockdep_set_class_and_name(&sk
->sk_write_queue
.lock
,
2345 af_wlock_keys
+ sk
->sk_family
,
2346 af_family_wlock_key_strings
[sk
->sk_family
]);
2347 lockdep_set_class_and_name(&sk
->sk_error_queue
.lock
,
2348 af_elock_keys
+ sk
->sk_family
,
2349 af_family_elock_key_strings
[sk
->sk_family
]);
2350 if (sk
->sk_kern_sock
)
2351 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2352 af_kern_callback_keys
+ sk
->sk_family
,
2353 af_family_kern_clock_key_strings
[sk
->sk_family
]);
2355 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2356 af_callback_keys
+ sk
->sk_family
,
2357 af_family_clock_key_strings
[sk
->sk_family
]);
2361 * sk_clone_lock - clone a socket, and lock its clone
2362 * @sk: the socket to clone
2363 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2365 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2367 struct sock
*sk_clone_lock(const struct sock
*sk
, const gfp_t priority
)
2369 struct proto
*prot
= READ_ONCE(sk
->sk_prot
);
2370 struct sk_filter
*filter
;
2371 bool is_charged
= true;
2374 newsk
= sk_prot_alloc(prot
, priority
, sk
->sk_family
);
2378 sock_copy(newsk
, sk
);
2380 newsk
->sk_prot_creator
= prot
;
2383 if (likely(newsk
->sk_net_refcnt
)) {
2384 get_net_track(sock_net(newsk
), &newsk
->ns_tracker
, priority
);
2385 sock_inuse_add(sock_net(newsk
), 1);
2387 /* Kernel sockets are not elevating the struct net refcount.
2388 * Instead, use a tracker to more easily detect if a layer
2389 * is not properly dismantling its kernel sockets at netns
2392 __netns_tracker_alloc(sock_net(newsk
), &newsk
->ns_tracker
,
2395 sk_node_init(&newsk
->sk_node
);
2396 sock_lock_init(newsk
);
2397 bh_lock_sock(newsk
);
2398 newsk
->sk_backlog
.head
= newsk
->sk_backlog
.tail
= NULL
;
2399 newsk
->sk_backlog
.len
= 0;
2401 atomic_set(&newsk
->sk_rmem_alloc
, 0);
2403 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2404 refcount_set(&newsk
->sk_wmem_alloc
, 1);
2406 atomic_set(&newsk
->sk_omem_alloc
, 0);
2407 sk_init_common(newsk
);
2409 newsk
->sk_dst_cache
= NULL
;
2410 newsk
->sk_dst_pending_confirm
= 0;
2411 newsk
->sk_wmem_queued
= 0;
2412 newsk
->sk_forward_alloc
= 0;
2413 newsk
->sk_reserved_mem
= 0;
2414 atomic_set(&newsk
->sk_drops
, 0);
2415 newsk
->sk_send_head
= NULL
;
2416 newsk
->sk_userlocks
= sk
->sk_userlocks
& ~SOCK_BINDPORT_LOCK
;
2417 atomic_set(&newsk
->sk_zckey
, 0);
2419 sock_reset_flag(newsk
, SOCK_DONE
);
2421 /* sk->sk_memcg will be populated at accept() time */
2422 newsk
->sk_memcg
= NULL
;
2424 cgroup_sk_clone(&newsk
->sk_cgrp_data
);
2427 filter
= rcu_dereference(sk
->sk_filter
);
2429 /* though it's an empty new sock, the charging may fail
2430 * if sysctl_optmem_max was changed between creation of
2431 * original socket and cloning
2433 is_charged
= sk_filter_charge(newsk
, filter
);
2434 RCU_INIT_POINTER(newsk
->sk_filter
, filter
);
2437 if (unlikely(!is_charged
|| xfrm_sk_clone_policy(newsk
, sk
))) {
2438 /* We need to make sure that we don't uncharge the new
2439 * socket if we couldn't charge it in the first place
2440 * as otherwise we uncharge the parent's filter.
2443 RCU_INIT_POINTER(newsk
->sk_filter
, NULL
);
2444 sk_free_unlock_clone(newsk
);
2448 RCU_INIT_POINTER(newsk
->sk_reuseport_cb
, NULL
);
2450 if (bpf_sk_storage_clone(sk
, newsk
)) {
2451 sk_free_unlock_clone(newsk
);
2456 /* Clear sk_user_data if parent had the pointer tagged
2457 * as not suitable for copying when cloning.
2459 if (sk_user_data_is_nocopy(newsk
))
2460 newsk
->sk_user_data
= NULL
;
2463 newsk
->sk_err_soft
= 0;
2464 newsk
->sk_priority
= 0;
2465 newsk
->sk_incoming_cpu
= raw_smp_processor_id();
2467 /* Before updating sk_refcnt, we must commit prior changes to memory
2468 * (Documentation/RCU/rculist_nulls.rst for details)
2471 refcount_set(&newsk
->sk_refcnt
, 2);
2473 sk_set_socket(newsk
, NULL
);
2474 sk_tx_queue_clear(newsk
);
2475 RCU_INIT_POINTER(newsk
->sk_wq
, NULL
);
2477 if (newsk
->sk_prot
->sockets_allocated
)
2478 sk_sockets_allocated_inc(newsk
);
2480 if (sock_needs_netstamp(sk
) && newsk
->sk_flags
& SK_FLAGS_TIMESTAMP
)
2481 net_enable_timestamp();
2485 EXPORT_SYMBOL_GPL(sk_clone_lock
);
2487 void sk_free_unlock_clone(struct sock
*sk
)
2489 /* It is still raw copy of parent, so invalidate
2490 * destructor and make plain sk_free() */
2491 sk
->sk_destruct
= NULL
;
2495 EXPORT_SYMBOL_GPL(sk_free_unlock_clone
);
2497 static u32
sk_dst_gso_max_size(struct sock
*sk
, struct dst_entry
*dst
)
2499 bool is_ipv6
= false;
2502 #if IS_ENABLED(CONFIG_IPV6)
2503 is_ipv6
= (sk
->sk_family
== AF_INET6
&&
2504 !ipv6_addr_v4mapped(&sk
->sk_v6_rcv_saddr
));
2506 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2507 max_size
= is_ipv6
? READ_ONCE(dst
->dev
->gso_max_size
) :
2508 READ_ONCE(dst
->dev
->gso_ipv4_max_size
);
2509 if (max_size
> GSO_LEGACY_MAX_SIZE
&& !sk_is_tcp(sk
))
2510 max_size
= GSO_LEGACY_MAX_SIZE
;
2512 return max_size
- (MAX_TCP_HEADER
+ 1);
2515 void sk_setup_caps(struct sock
*sk
, struct dst_entry
*dst
)
2519 sk
->sk_route_caps
= dst
->dev
->features
;
2521 sk
->sk_route_caps
|= NETIF_F_GSO
;
2522 if (sk
->sk_route_caps
& NETIF_F_GSO
)
2523 sk
->sk_route_caps
|= NETIF_F_GSO_SOFTWARE
;
2524 if (unlikely(sk
->sk_gso_disabled
))
2525 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2526 if (sk_can_gso(sk
)) {
2527 if (dst
->header_len
&& !xfrm_dst_offload_ok(dst
)) {
2528 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2530 sk
->sk_route_caps
|= NETIF_F_SG
| NETIF_F_HW_CSUM
;
2531 sk
->sk_gso_max_size
= sk_dst_gso_max_size(sk
, dst
);
2532 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2533 max_segs
= max_t(u32
, READ_ONCE(dst
->dev
->gso_max_segs
), 1);
2536 sk
->sk_gso_max_segs
= max_segs
;
2537 sk_dst_set(sk
, dst
);
2539 EXPORT_SYMBOL_GPL(sk_setup_caps
);
/*
 * Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	if (unlikely(!sk_fullsock(sk)))
		return skb_set_owner_edemux(skb, sk);

	skb->sk = sk;
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
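/* Illustrative pairing (not part of the original file): a protocol that
 * charges an skb to the socket's write budget with skb_set_owner_w()
 * relies on sock_wfree() running when the skb is finally freed to return
 * the truesize to sk_wmem_alloc and wake writers. A minimal transmit
 * helper under that assumption might look like:
 *
 *	skb = alloc_skb(len, sk->sk_allocation);
 *	if (!skb)
 *		return -ENOBUFS;
 *	skb_set_owner_w(skb, sk);	// charges skb->truesize to sk_wmem_alloc
 *	... build headers and payload ...
 *	dev_queue_xmit(skb);		// sock_wfree() uncharges at kfree_skb() time
 */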
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb_is_decrypted(skb))
		return false;

	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
		inet_reqsk(sk)->rsk_listener = NULL;
		reqsk_free(inet_reqsk(sk));
		return;
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long __sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force ||
	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

	if ((unsigned int)size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
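/* Usage sketch (illustrative, not part of the original file): option
 * memory obtained with sock_kmalloc() is bounded by optmem_max and must
 * be released with sock_kfree_s() (or sock_kzfree_s() for sensitive data)
 * using the same size, so sk_omem_alloc stays balanced:
 *
 *	struct ip_options_rcu *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt) + optlen);
 */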
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
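/* Illustrative caller (not part of the original file): datagram protocols
 * usually reach this helper asking for linear data only (data_len == 0,
 * max_page_order == 0), blocking according to MSG_DONTWAIT:
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + tlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;	// -EAGAIN, -EPIPE or the pending sock_error()
 */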
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	case SCM_TS_OPT_ID:
		tsflags = READ_ONCE(sk->sk_tsflags);
		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
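/* Illustrative sendmsg-path usage (not part of the original file): the
 * caller seeds a sockcm_cookie from the socket defaults and then lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages before
 * building packets:
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);	// per-socket mark/tsflags defaults
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */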
static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
				     tcp_leave_memory_pressure, sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
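/* Illustrative use (not part of the original file): a sendmsg
 * implementation that copies user data into the per-socket page_frag
 * typically loops like this, treating a refill failure as memory
 * pressure (the helper has already moderated the send buffer):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 */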
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
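/* Illustrative recvmsg-path usage (not part of the original file):
 * sk_wait_data() is called with the socket lock held; it releases and
 * re-acquires the lock via sk_wait_event(), so the caller re-checks the
 * receive queue afterwards:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */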
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT stand when usage exceeds the global or memcg
	 * hard limit, or else a DoS attack could be mounted by spawning
	 * lots of sockets whose usage stays under the minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones below average usage
		 * to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
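/* Worked example (illustrative, not part of the original file): callers
 * account in whole pages. Charging a 3000 byte skb on a 4K PAGE_SIZE
 * system gives sk_mem_pages(3000) == 1, so sk_forward_alloc is bumped by
 * 1 << PAGE_SHIFT == 4096 bytes up front and rolled back only if
 * __sk_mem_raise_allocated() refuses the page. A typical receive-side
 * check built on top of this (a sketch, not a prescribed pattern):
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;	// ends up in __sk_mem_schedule(..., SK_MEM_RECV)
 */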
/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_global_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}

/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= PAGE_SHIFT;
	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
	struct socket *sock;

	sock = sock_from_file(file);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}
}
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if (sock_writeable(sk)) {
		struct socket_wq *wq = rcu_dereference(sk->sk_wq);

		/* rely on refcount_sub from sock_wfree() */
		smp_mb__after_atomic();
		if (wq && waitqueue_active(&wq->wait))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
}

static void sock_def_destruct(struct sock *sk)
{
}
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(sk->sk_socket->file))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_use_task_frag	=	true;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
	}
	sk->sk_uid	=	uid;

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	spin_lock_init(&sk->sk_peer_lock);

	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id	=	0;
	sk->sk_ll_usec	=	READ_ONCE(sysctl_net_busy_read);
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	kuid_t uid = sock ?
		SOCK_INODE(sock)->i_uid :
		make_kuid(sock_net(sk)->user_ns, 0);

	sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
	/* The sk_lock has mutex_lock() semantics here. */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sock_owned_by_user_nocheck(sk))
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
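/* Canonical locking pattern (illustrative, not part of the original
 * file): process context takes the "mutex-like" owner bit with
 * lock_sock() and drops it with release_sock(), which also flushes any
 * packets that softirq context parked on the backlog in the meantime:
 *
 *	lock_sock(sk);
 *	... modify socket state, call sk->sk_prot operations ...
 *	release_sock(sk);	// runs __release_sock() and release_cb
 */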
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sock_owned_by_user_nocheck(sk)) {
		/*
		 * Fast path return with bottom halves disabled and
		 * sock::sk_lock.slock held.
		 *
		 * The 'mutex' is not contended and holding
		 * sock::sk_lock.slock prevents all other lockers from
		 * proceeding, so the corresponding unlock_sock_fast() can
		 * avoid the slow path of release_sock() completely and
		 * just release slock.
		 *
		 * From a semantical POV this is equivalent to 'acquiring'
		 * the 'mutex', hence the corresponding lockdep
		 * mutex_release() has to happen in the fast path of
		 * unlock_sock_fast().
		 */
		return false;
	}

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	__acquire(&sk->sk_lock.slock);
	spin_unlock_bh(&sk->sk_lock.slock);
	return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();

		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what is the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one: detach it from networking.
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
#ifdef CONFIG_PROC_FS
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;
	return 0;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

#endif
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
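/* Registration sketch (illustrative, not part of the original file): a
 * protocol module registers its struct proto once at init time and
 * unregisters it on exit; with alloc_slab == 1 a kmem_cache sized by
 * .obj_size is created for its sockets. "foo_proto" and "struct foo_sock"
 * are hypothetical names used only for this example:
 *
 *	static struct proto foo_proto = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	err = proto_register(&foo_proto, 1);
 *	...
 *	proto_unregister(&foo_proto);
 */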
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
*seq
, struct proto
*proto
)
4157 seq_printf(seq
, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4158 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4161 sock_prot_inuse_get(seq_file_net(seq
), proto
),
4162 sock_prot_memory_allocated(proto
),
4163 sock_prot_memory_pressure(proto
),
4165 proto
->slab
== NULL
? "no" : "yes",
4166 module_name(proto
->owner
),
4167 proto_method_implemented(proto
->close
),
4168 proto_method_implemented(proto
->connect
),
4169 proto_method_implemented(proto
->disconnect
),
4170 proto_method_implemented(proto
->accept
),
4171 proto_method_implemented(proto
->ioctl
),
4172 proto_method_implemented(proto
->init
),
4173 proto_method_implemented(proto
->destroy
),
4174 proto_method_implemented(proto
->shutdown
),
4175 proto_method_implemented(proto
->setsockopt
),
4176 proto_method_implemented(proto
->getsockopt
),
4177 proto_method_implemented(proto
->sendmsg
),
4178 proto_method_implemented(proto
->recvmsg
),
4179 proto_method_implemented(proto
->bind
),
4180 proto_method_implemented(proto
->backlog_rcv
),
4181 proto_method_implemented(proto
->hash
),
4182 proto_method_implemented(proto
->unhash
),
4183 proto_method_implemented(proto
->get_port
),
4184 proto_method_implemented(proto
->enter_memory_pressure
));
4187 static int proto_seq_show(struct seq_file
*seq
, void *v
)
4189 if (v
== &proto_list
)
4190 seq_printf(seq
, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4199 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4201 proto_seq_printf(seq
, list_entry(v
, struct proto
, node
));
4205 static const struct seq_operations proto_seq_ops
= {
4206 .start
= proto_seq_start
,
4207 .next
= proto_seq_next
,
4208 .stop
= proto_seq_stop
,
4209 .show
= proto_seq_show
,
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
/* Copy 'size' bytes from userspace and return 'size' back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);
/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace as input argument.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}
/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl), and copies back the result to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If ioctl was processed, returns its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);