net/core/sock.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
83 * To Fix:
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
123 #include <linux/uaccess.h>
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
141 #include <trace/events/sock.h>
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
149 static void sock_inuse_add(struct net *net, int val);
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
157 * Test to see if the opener of the socket had the capability @cap when
158 * the socket was created and if the current process has it in the user
159 * namespace @user_ns.
161 bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
167 EXPORT_SYMBOL(sk_ns_capable);
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
172 * @cap: The global capability to use
174 * Test to see if the opener of the socket had the capability @cap when
175 * the socket was created and if the current process has it in all user
176 * namespaces.
178 bool sk_capable(const struct sock *sk, int cap)
180 return sk_ns_capable(sk, &init_user_ns, cap);
182 EXPORT_SYMBOL(sk_capable);
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
189 * Test to see if the opener of the socket had the capability @cap when the
190 * socket was created and if the current process has it over the network
191 * namespace the socket is a member of.
193 bool sk_net_capable(const struct sock *sk, int cap)
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197 EXPORT_SYMBOL(sk_net_capable);
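/*
 * Illustrative sketch (not part of this file): a protocol that wants to
 * gate a privileged operation on the opener's credentials would typically
 * use one of the helpers above, e.g.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * The hypothetical caller and error path are only for illustration; the
 * exported helpers above are the real interface.
 */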
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
215 #define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MAX"
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 _sock_locks("sk_lock-")
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 _sock_locks("slock-")
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 _sock_locks("clock-")
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 _sock_locks("k-sk_lock-")
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-slock-")
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 _sock_locks("k-clock-")
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 _sock_locks("rlock-")
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 _sock_locks("wlock-")
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 _sock_locks("elock-")
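/*
 * For readability, the build-time expansion above works like this for the
 * first table: _sock_locks("sk_lock-") expands (via string literal
 * concatenation) to
 *
 *	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 *	"sk_lock-AF_XDP", "sk_lock-AF_MAX"
 *
 * so every address family gets a pre-built lock class name and no string
 * construction is needed at socket creation time.
 */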
263 * sk_callback_lock and sk queues locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
284 int sysctl_tstamp_allow_data __read_mostly = 1;
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 * sk_set_memalloc - sets %SOCK_MEMALLOC
291 * @sk: socket to set it on
293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294 * It's the responsibility of the admin to adjust min_free_kbytes
295 * to meet the requirements
297 void sk_set_memalloc(struct sock *sk)
299 sock_set_flag(sk, SOCK_MEMALLOC);
300 sk->sk_allocation |= __GFP_MEMALLOC;
301 static_branch_inc(&memalloc_socks_key);
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
305 void sk_clear_memalloc(struct sock *sk)
307 sock_reset_flag(sk, SOCK_MEMALLOC);
308 sk->sk_allocation &= ~__GFP_MEMALLOC;
309 static_branch_dec(&memalloc_socks_key);
312 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 * it has rmem allocations due to the last swapfile being deactivated
315 * but there is a risk that the socket is unusable due to exceeding
316 * the rmem limits. Reclaim the reserves and obey rmem limits again.
318 sk_mem_reclaim(sk);
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
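/*
 * Illustrative sketch (assumed caller, not from this file): a transport
 * used for swapping over the network would mark its socket before it can
 * be needed for memory reclaim, and clear the flag when the swap device
 * goes away:
 *
 *	sk_set_memalloc(sock->sk);	// may dip into emergency reserves
 *	...
 *	sk_clear_memalloc(sock->sk);	// back to normal accounting
 *
 * The surrounding transport code is hypothetical; only the two helpers
 * above are real.
 */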
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
324 int ret;
325 unsigned int noreclaim_flag;
327 /* these should have been dropped before queueing */
328 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
330 noreclaim_flag = memalloc_noreclaim_save();
331 ret = sk->sk_backlog_rcv(sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
334 return ret;
336 EXPORT_SYMBOL(__sk_backlog_rcv);
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
340 struct timeval tv;
342 if (optlen < sizeof(tv))
343 return -EINVAL;
344 if (copy_from_user(&tv, optval, sizeof(tv)))
345 return -EFAULT;
346 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 return -EDOM;
349 if (tv.tv_sec < 0) {
350 static int warned __read_mostly;
352 *timeo_p = 0;
353 if (warned < 10 && net_ratelimit()) {
354 warned++;
355 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 __func__, current->comm, task_pid_nr(current));
358 return 0;
360 *timeo_p = MAX_SCHEDULE_TIMEOUT;
361 if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 return 0;
363 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 return 0;
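/*
 * Userspace view of the conversion above (illustrative, not kernel code):
 * the timeout arrives as a struct timeval and ends up in jiffies in
 * sk->sk_rcvtimeo or sk->sk_sndtimeo:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A negative tv_sec is clamped to an immediate timeout (with a rate
 * limited warning), while an all-zero timeval means "block forever"
 * (MAX_SCHEDULE_TIMEOUT).
 */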
368 static void sock_warn_obsolete_bsdism(const char *name)
370 static int warned;
371 static char warncomm[TASK_COMM_LEN];
372 if (strcmp(warncomm, current->comm) && warned < 5) {
373 strcpy(warncomm, current->comm);
374 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 warncomm, name);
376 warned++;
380 static bool sock_needs_netstamp(const struct sock *sk)
382 switch (sk->sk_family) {
383 case AF_UNSPEC:
384 case AF_UNIX:
385 return false;
386 default:
387 return true;
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
393 if (sk->sk_flags & flags) {
394 sk->sk_flags &= ~flags;
395 if (sock_needs_netstamp(sk) &&
396 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 net_disable_timestamp();
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
404 unsigned long flags;
405 struct sk_buff_head *list = &sk->sk_receive_queue;
407 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 atomic_inc(&sk->sk_drops);
409 trace_sock_rcvqueue_full(sk, skb);
410 return -ENOMEM;
413 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 atomic_inc(&sk->sk_drops);
415 return -ENOBUFS;
418 skb->dev = NULL;
419 skb_set_owner_r(skb, sk);
421 /* we escape from the RCU protected region, make sure we don't leak
422 * a non-refcounted dst
424 skb_dst_force(skb);
426 spin_lock_irqsave(&list->lock, flags);
427 sock_skb_set_dropcount(sk, skb);
428 __skb_queue_tail(list, skb);
429 spin_unlock_irqrestore(&list->lock, flags);
431 if (!sock_flag(sk, SOCK_DEAD))
432 sk->sk_data_ready(sk);
433 return 0;
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439 int err;
441 err = sk_filter(sk, skb);
442 if (err)
443 return err;
445 return __sock_queue_rcv_skb(sk, skb);
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 const int nested, unsigned int trim_cap, bool refcounted)
452 int rc = NET_RX_SUCCESS;
454 if (sk_filter_trim_cap(sk, skb, trim_cap))
455 goto discard_and_relse;
457 skb->dev = NULL;
459 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 atomic_inc(&sk->sk_drops);
461 goto discard_and_relse;
463 if (nested)
464 bh_lock_sock_nested(sk);
465 else
466 bh_lock_sock(sk);
467 if (!sock_owned_by_user(sk)) {
469 * trylock + unlock semantics:
471 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
473 rc = sk_backlog_rcv(sk, skb);
475 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 bh_unlock_sock(sk);
478 atomic_inc(&sk->sk_drops);
479 goto discard_and_relse;
482 bh_unlock_sock(sk);
483 out:
484 if (refcounted)
485 sock_put(sk);
486 return rc;
487 discard_and_relse:
488 kfree_skb(skb);
489 goto out;
491 EXPORT_SYMBOL(__sk_receive_skb);
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
495 struct dst_entry *dst = __sk_dst_get(sk);
497 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 sk_tx_queue_clear(sk);
499 sk->sk_dst_pending_confirm = 0;
500 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 dst_release(dst);
502 return NULL;
505 return dst;
507 EXPORT_SYMBOL(__sk_dst_check);
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
511 struct dst_entry *dst = sk_dst_get(sk);
513 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 sk_dst_reset(sk);
515 dst_release(dst);
516 return NULL;
519 return dst;
521 EXPORT_SYMBOL(sk_dst_check);
523 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
524 int optlen)
526 int ret = -ENOPROTOOPT;
527 #ifdef CONFIG_NETDEVICES
528 struct net *net = sock_net(sk);
529 char devname[IFNAMSIZ];
530 int index;
532 /* Sorry... */
533 ret = -EPERM;
534 if (!ns_capable(net->user_ns, CAP_NET_RAW))
535 goto out;
537 ret = -EINVAL;
538 if (optlen < 0)
539 goto out;
541 /* Bind this socket to a particular device like "eth0",
542 * as specified in the passed interface name. If the
543 * name is "" or the option length is zero the socket
544 * is not bound.
546 if (optlen > IFNAMSIZ - 1)
547 optlen = IFNAMSIZ - 1;
548 memset(devname, 0, sizeof(devname));
550 ret = -EFAULT;
551 if (copy_from_user(devname, optval, optlen))
552 goto out;
554 index = 0;
555 if (devname[0] != '\0') {
556 struct net_device *dev;
558 rcu_read_lock();
559 dev = dev_get_by_name_rcu(net, devname);
560 if (dev)
561 index = dev->ifindex;
562 rcu_read_unlock();
563 ret = -ENODEV;
564 if (!dev)
565 goto out;
568 lock_sock(sk);
569 sk->sk_bound_dev_if = index;
570 sk_dst_reset(sk);
571 release_sock(sk);
573 ret = 0;
575 out:
576 #endif
578 return ret;
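/*
 * Userspace view (illustrative): binding a socket to "eth0" and later
 * un-binding it with an empty name, following the rules in the comment
 * above; CAP_NET_RAW in the socket's network namespace is required,
 * otherwise the call fails with EPERM:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */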
581 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
582 int __user *optlen, int len)
584 int ret = -ENOPROTOOPT;
585 #ifdef CONFIG_NETDEVICES
586 struct net *net = sock_net(sk);
587 char devname[IFNAMSIZ];
589 if (sk->sk_bound_dev_if == 0) {
590 len = 0;
591 goto zero;
594 ret = -EINVAL;
595 if (len < IFNAMSIZ)
596 goto out;
598 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
599 if (ret)
600 goto out;
602 len = strlen(devname) + 1;
604 ret = -EFAULT;
605 if (copy_to_user(optval, devname, len))
606 goto out;
608 zero:
609 ret = -EFAULT;
610 if (put_user(len, optlen))
611 goto out;
613 ret = 0;
615 out:
616 #endif
618 return ret;
621 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
623 if (valbool)
624 sock_set_flag(sk, bit);
625 else
626 sock_reset_flag(sk, bit);
629 bool sk_mc_loop(struct sock *sk)
631 if (dev_recursion_level())
632 return false;
633 if (!sk)
634 return true;
635 switch (sk->sk_family) {
636 case AF_INET:
637 return inet_sk(sk)->mc_loop;
638 #if IS_ENABLED(CONFIG_IPV6)
639 case AF_INET6:
640 return inet6_sk(sk)->mc_loop;
641 #endif
643 WARN_ON(1);
644 return true;
646 EXPORT_SYMBOL(sk_mc_loop);
649 * This is meant for all protocols to use and covers goings on
650 * at the socket level. Everything here is generic.
653 int sock_setsockopt(struct socket *sock, int level, int optname,
654 char __user *optval, unsigned int optlen)
656 struct sock_txtime sk_txtime;
657 struct sock *sk = sock->sk;
658 int val;
659 int valbool;
660 struct linger ling;
661 int ret = 0;
664 * Options without arguments
667 if (optname == SO_BINDTODEVICE)
668 return sock_setbindtodevice(sk, optval, optlen);
670 if (optlen < sizeof(int))
671 return -EINVAL;
673 if (get_user(val, (int __user *)optval))
674 return -EFAULT;
676 valbool = val ? 1 : 0;
678 lock_sock(sk);
680 switch (optname) {
681 case SO_DEBUG:
682 if (val && !capable(CAP_NET_ADMIN))
683 ret = -EACCES;
684 else
685 sock_valbool_flag(sk, SOCK_DBG, valbool);
686 break;
687 case SO_REUSEADDR:
688 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
689 break;
690 case SO_REUSEPORT:
691 sk->sk_reuseport = valbool;
692 break;
693 case SO_TYPE:
694 case SO_PROTOCOL:
695 case SO_DOMAIN:
696 case SO_ERROR:
697 ret = -ENOPROTOOPT;
698 break;
699 case SO_DONTROUTE:
700 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
701 sk_dst_reset(sk);
702 break;
703 case SO_BROADCAST:
704 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
705 break;
706 case SO_SNDBUF:
707 /* Don't error on this; BSD doesn't, and if you think
708 * about it, this is right. Otherwise apps have to
709 * play 'guess the biggest size' games. RCVBUF/SNDBUF
710 * are treated in BSD as hints.
712 val = min_t(u32, val, sysctl_wmem_max);
713 set_sndbuf:
714 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
715 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
716 /* Wake up sending tasks if we upped the value. */
717 sk->sk_write_space(sk);
718 break;
720 case SO_SNDBUFFORCE:
721 if (!capable(CAP_NET_ADMIN)) {
722 ret = -EPERM;
723 break;
725 goto set_sndbuf;
727 case SO_RCVBUF:
728 /* Don't error on this; BSD doesn't, and if you think
729 * about it, this is right. Otherwise apps have to
730 * play 'guess the biggest size' games. RCVBUF/SNDBUF
731 * are treated in BSD as hints.
733 val = min_t(u32, val, sysctl_rmem_max);
734 set_rcvbuf:
735 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
737 * We double it on the way in to account for
738 * "struct sk_buff" etc. overhead. Applications
739 * assume that the SO_RCVBUF setting they make will
740 * allow that much actual data to be received on that
741 * socket.
743 * Applications are unaware that "struct sk_buff" and
744 * other overheads allocate from the receive buffer
745 * during socket buffer allocation.
747 * And after considering the possible alternatives,
748 * returning the value we actually used in getsockopt
749 * is the most desirable behavior.
751 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
752 break;
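/*
 * Illustrative userspace consequence of the doubling described above
 * (assuming the requested size is within sysctl_rmem_max):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *	// out is now 131072: the doubled, overhead-inclusive value
 *	// the kernel actually uses
 */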
754 case SO_RCVBUFFORCE:
755 if (!capable(CAP_NET_ADMIN)) {
756 ret = -EPERM;
757 break;
759 goto set_rcvbuf;
761 case SO_KEEPALIVE:
762 if (sk->sk_prot->keepalive)
763 sk->sk_prot->keepalive(sk, valbool);
764 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
765 break;
767 case SO_OOBINLINE:
768 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
769 break;
771 case SO_NO_CHECK:
772 sk->sk_no_check_tx = valbool;
773 break;
775 case SO_PRIORITY:
776 if ((val >= 0 && val <= 6) ||
777 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
778 sk->sk_priority = val;
779 else
780 ret = -EPERM;
781 break;
783 case SO_LINGER:
784 if (optlen < sizeof(ling)) {
785 ret = -EINVAL; /* 1003.1g */
786 break;
788 if (copy_from_user(&ling, optval, sizeof(ling))) {
789 ret = -EFAULT;
790 break;
792 if (!ling.l_onoff)
793 sock_reset_flag(sk, SOCK_LINGER);
794 else {
795 #if (BITS_PER_LONG == 32)
796 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
797 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
798 else
799 #endif
800 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
801 sock_set_flag(sk, SOCK_LINGER);
803 break;
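/*
 * Userspace sketch of the linger handling above: l_linger is given in
 * seconds and converted to jiffies (capped on 32-bit builds), while
 * l_onoff == 0 simply clears SOCK_LINGER again:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 */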
805 case SO_BSDCOMPAT:
806 sock_warn_obsolete_bsdism("setsockopt");
807 break;
809 case SO_PASSCRED:
810 if (valbool)
811 set_bit(SOCK_PASSCRED, &sock->flags);
812 else
813 clear_bit(SOCK_PASSCRED, &sock->flags);
814 break;
816 case SO_TIMESTAMP:
817 case SO_TIMESTAMPNS:
818 if (valbool) {
819 if (optname == SO_TIMESTAMP)
820 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
821 else
822 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
823 sock_set_flag(sk, SOCK_RCVTSTAMP);
824 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
825 } else {
826 sock_reset_flag(sk, SOCK_RCVTSTAMP);
827 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
829 break;
831 case SO_TIMESTAMPING:
832 if (val & ~SOF_TIMESTAMPING_MASK) {
833 ret = -EINVAL;
834 break;
837 if (val & SOF_TIMESTAMPING_OPT_ID &&
838 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
839 if (sk->sk_protocol == IPPROTO_TCP &&
840 sk->sk_type == SOCK_STREAM) {
841 if ((1 << sk->sk_state) &
842 (TCPF_CLOSE | TCPF_LISTEN)) {
843 ret = -EINVAL;
844 break;
846 sk->sk_tskey = tcp_sk(sk)->snd_una;
847 } else {
848 sk->sk_tskey = 0;
852 if (val & SOF_TIMESTAMPING_OPT_STATS &&
853 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
854 ret = -EINVAL;
855 break;
858 sk->sk_tsflags = val;
859 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
860 sock_enable_timestamp(sk,
861 SOCK_TIMESTAMPING_RX_SOFTWARE);
862 else
863 sock_disable_timestamp(sk,
864 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
865 break;
867 case SO_RCVLOWAT:
868 if (val < 0)
869 val = INT_MAX;
870 if (sock->ops->set_rcvlowat)
871 ret = sock->ops->set_rcvlowat(sk, val);
872 else
873 sk->sk_rcvlowat = val ? : 1;
874 break;
876 case SO_RCVTIMEO:
877 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
878 break;
880 case SO_SNDTIMEO:
881 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
882 break;
884 case SO_ATTACH_FILTER:
885 ret = -EINVAL;
886 if (optlen == sizeof(struct sock_fprog)) {
887 struct sock_fprog fprog;
889 ret = -EFAULT;
890 if (copy_from_user(&fprog, optval, sizeof(fprog)))
891 break;
893 ret = sk_attach_filter(&fprog, sk);
895 break;
897 case SO_ATTACH_BPF:
898 ret = -EINVAL;
899 if (optlen == sizeof(u32)) {
900 u32 ufd;
902 ret = -EFAULT;
903 if (copy_from_user(&ufd, optval, sizeof(ufd)))
904 break;
906 ret = sk_attach_bpf(ufd, sk);
908 break;
910 case SO_ATTACH_REUSEPORT_CBPF:
911 ret = -EINVAL;
912 if (optlen == sizeof(struct sock_fprog)) {
913 struct sock_fprog fprog;
915 ret = -EFAULT;
916 if (copy_from_user(&fprog, optval, sizeof(fprog)))
917 break;
919 ret = sk_reuseport_attach_filter(&fprog, sk);
921 break;
923 case SO_ATTACH_REUSEPORT_EBPF:
924 ret = -EINVAL;
925 if (optlen == sizeof(u32)) {
926 u32 ufd;
928 ret = -EFAULT;
929 if (copy_from_user(&ufd, optval, sizeof(ufd)))
930 break;
932 ret = sk_reuseport_attach_bpf(ufd, sk);
934 break;
936 case SO_DETACH_FILTER:
937 ret = sk_detach_filter(sk);
938 break;
940 case SO_LOCK_FILTER:
941 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
942 ret = -EPERM;
943 else
944 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
945 break;
947 case SO_PASSSEC:
948 if (valbool)
949 set_bit(SOCK_PASSSEC, &sock->flags);
950 else
951 clear_bit(SOCK_PASSSEC, &sock->flags);
952 break;
953 case SO_MARK:
954 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
955 ret = -EPERM;
956 else
957 sk->sk_mark = val;
958 break;
960 case SO_RXQ_OVFL:
961 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
962 break;
964 case SO_WIFI_STATUS:
965 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
966 break;
968 case SO_PEEK_OFF:
969 if (sock->ops->set_peek_off)
970 ret = sock->ops->set_peek_off(sk, val);
971 else
972 ret = -EOPNOTSUPP;
973 break;
975 case SO_NOFCS:
976 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
977 break;
979 case SO_SELECT_ERR_QUEUE:
980 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
981 break;
983 #ifdef CONFIG_NET_RX_BUSY_POLL
984 case SO_BUSY_POLL:
985 /* allow unprivileged users to decrease the value */
986 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
987 ret = -EPERM;
988 else {
989 if (val < 0)
990 ret = -EINVAL;
991 else
992 sk->sk_ll_usec = val;
994 break;
995 #endif
997 case SO_MAX_PACING_RATE:
998 if (val != ~0U)
999 cmpxchg(&sk->sk_pacing_status,
1000 SK_PACING_NONE,
1001 SK_PACING_NEEDED);
1002 sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1003 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 sk->sk_max_pacing_rate);
1005 break;
1007 case SO_INCOMING_CPU:
1008 sk->sk_incoming_cpu = val;
1009 break;
1011 case SO_CNX_ADVICE:
1012 if (val == 1)
1013 dst_negative_advice(sk);
1014 break;
1016 case SO_ZEROCOPY:
1017 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1018 if (sk->sk_protocol != IPPROTO_TCP)
1019 ret = -ENOTSUPP;
1020 } else if (sk->sk_family != PF_RDS) {
1021 ret = -ENOTSUPP;
1023 if (!ret) {
1024 if (val < 0 || val > 1)
1025 ret = -EINVAL;
1026 else
1027 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1029 break;
1031 case SO_TXTIME:
1032 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1033 ret = -EPERM;
1034 } else if (optlen != sizeof(struct sock_txtime)) {
1035 ret = -EINVAL;
1036 } else if (copy_from_user(&sk_txtime, optval,
1037 sizeof(struct sock_txtime))) {
1038 ret = -EFAULT;
1039 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1040 ret = -EINVAL;
1041 } else {
1042 sock_valbool_flag(sk, SOCK_TXTIME, true);
1043 sk->sk_clockid = sk_txtime.clockid;
1044 sk->sk_txtime_deadline_mode =
1045 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1046 sk->sk_txtime_report_errors =
1047 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1049 break;
1051 default:
1052 ret = -ENOPROTOOPT;
1053 break;
1055 release_sock(sk);
1056 return ret;
1058 EXPORT_SYMBOL(sock_setsockopt);
1061 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1062 struct ucred *ucred)
1064 ucred->pid = pid_vnr(pid);
1065 ucred->uid = ucred->gid = -1;
1066 if (cred) {
1067 struct user_namespace *current_ns = current_user_ns();
1069 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1070 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1074 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1076 struct user_namespace *user_ns = current_user_ns();
1077 int i;
1079 for (i = 0; i < src->ngroups; i++)
1080 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1081 return -EFAULT;
1083 return 0;
1086 int sock_getsockopt(struct socket *sock, int level, int optname,
1087 char __user *optval, int __user *optlen)
1089 struct sock *sk = sock->sk;
1091 union {
1092 int val;
1093 u64 val64;
1094 struct linger ling;
1095 struct timeval tm;
1096 struct sock_txtime txtime;
1097 } v;
1099 int lv = sizeof(int);
1100 int len;
1102 if (get_user(len, optlen))
1103 return -EFAULT;
1104 if (len < 0)
1105 return -EINVAL;
1107 memset(&v, 0, sizeof(v));
1109 switch (optname) {
1110 case SO_DEBUG:
1111 v.val = sock_flag(sk, SOCK_DBG);
1112 break;
1114 case SO_DONTROUTE:
1115 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1116 break;
1118 case SO_BROADCAST:
1119 v.val = sock_flag(sk, SOCK_BROADCAST);
1120 break;
1122 case SO_SNDBUF:
1123 v.val = sk->sk_sndbuf;
1124 break;
1126 case SO_RCVBUF:
1127 v.val = sk->sk_rcvbuf;
1128 break;
1130 case SO_REUSEADDR:
1131 v.val = sk->sk_reuse;
1132 break;
1134 case SO_REUSEPORT:
1135 v.val = sk->sk_reuseport;
1136 break;
1138 case SO_KEEPALIVE:
1139 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1140 break;
1142 case SO_TYPE:
1143 v.val = sk->sk_type;
1144 break;
1146 case SO_PROTOCOL:
1147 v.val = sk->sk_protocol;
1148 break;
1150 case SO_DOMAIN:
1151 v.val = sk->sk_family;
1152 break;
1154 case SO_ERROR:
1155 v.val = -sock_error(sk);
1156 if (v.val == 0)
1157 v.val = xchg(&sk->sk_err_soft, 0);
1158 break;
1160 case SO_OOBINLINE:
1161 v.val = sock_flag(sk, SOCK_URGINLINE);
1162 break;
1164 case SO_NO_CHECK:
1165 v.val = sk->sk_no_check_tx;
1166 break;
1168 case SO_PRIORITY:
1169 v.val = sk->sk_priority;
1170 break;
1172 case SO_LINGER:
1173 lv = sizeof(v.ling);
1174 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1175 v.ling.l_linger = sk->sk_lingertime / HZ;
1176 break;
1178 case SO_BSDCOMPAT:
1179 sock_warn_obsolete_bsdism("getsockopt");
1180 break;
1182 case SO_TIMESTAMP:
1183 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1184 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1185 break;
1187 case SO_TIMESTAMPNS:
1188 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1189 break;
1191 case SO_TIMESTAMPING:
1192 v.val = sk->sk_tsflags;
1193 break;
1195 case SO_RCVTIMEO:
1196 lv = sizeof(struct timeval);
1197 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1198 v.tm.tv_sec = 0;
1199 v.tm.tv_usec = 0;
1200 } else {
1201 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1202 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1204 break;
1206 case SO_SNDTIMEO:
1207 lv = sizeof(struct timeval);
1208 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1209 v.tm.tv_sec = 0;
1210 v.tm.tv_usec = 0;
1211 } else {
1212 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1213 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1215 break;
1217 case SO_RCVLOWAT:
1218 v.val = sk->sk_rcvlowat;
1219 break;
1221 case SO_SNDLOWAT:
1222 v.val = 1;
1223 break;
1225 case SO_PASSCRED:
1226 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1227 break;
1229 case SO_PEERCRED:
1231 struct ucred peercred;
1232 if (len > sizeof(peercred))
1233 len = sizeof(peercred);
1234 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1235 if (copy_to_user(optval, &peercred, len))
1236 return -EFAULT;
1237 goto lenout;
1240 case SO_PEERGROUPS:
1242 int ret, n;
1244 if (!sk->sk_peer_cred)
1245 return -ENODATA;
1247 n = sk->sk_peer_cred->group_info->ngroups;
1248 if (len < n * sizeof(gid_t)) {
1249 len = n * sizeof(gid_t);
1250 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1252 len = n * sizeof(gid_t);
1254 ret = groups_to_user((gid_t __user *)optval,
1255 sk->sk_peer_cred->group_info);
1256 if (ret)
1257 return ret;
1258 goto lenout;
1261 case SO_PEERNAME:
1263 char address[128];
1265 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1266 if (lv < 0)
1267 return -ENOTCONN;
1268 if (lv < len)
1269 return -EINVAL;
1270 if (copy_to_user(optval, address, len))
1271 return -EFAULT;
1272 goto lenout;
1275 /* Dubious BSD thing... Probably nobody even uses it, but
1276 * the UNIX standard wants it for whatever reason... -DaveM
1278 case SO_ACCEPTCONN:
1279 v.val = sk->sk_state == TCP_LISTEN;
1280 break;
1282 case SO_PASSSEC:
1283 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1286 case SO_PEERSEC:
1287 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1289 case SO_MARK:
1290 v.val = sk->sk_mark;
1291 break;
1293 case SO_RXQ_OVFL:
1294 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1295 break;
1297 case SO_WIFI_STATUS:
1298 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1299 break;
1301 case SO_PEEK_OFF:
1302 if (!sock->ops->set_peek_off)
1303 return -EOPNOTSUPP;
1305 v.val = sk->sk_peek_off;
1306 break;
1307 case SO_NOFCS:
1308 v.val = sock_flag(sk, SOCK_NOFCS);
1309 break;
1311 case SO_BINDTODEVICE:
1312 return sock_getbindtodevice(sk, optval, optlen, len);
1314 case SO_GET_FILTER:
1315 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1316 if (len < 0)
1317 return len;
1319 goto lenout;
1321 case SO_LOCK_FILTER:
1322 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1323 break;
1325 case SO_BPF_EXTENSIONS:
1326 v.val = bpf_tell_extensions();
1327 break;
1329 case SO_SELECT_ERR_QUEUE:
1330 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1331 break;
1333 #ifdef CONFIG_NET_RX_BUSY_POLL
1334 case SO_BUSY_POLL:
1335 v.val = sk->sk_ll_usec;
1336 break;
1337 #endif
1339 case SO_MAX_PACING_RATE:
1340 /* 32bit version */
1341 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1342 break;
1344 case SO_INCOMING_CPU:
1345 v.val = sk->sk_incoming_cpu;
1346 break;
1348 case SO_MEMINFO:
1350 u32 meminfo[SK_MEMINFO_VARS];
1352 if (get_user(len, optlen))
1353 return -EFAULT;
1355 sk_get_meminfo(sk, meminfo);
1357 len = min_t(unsigned int, len, sizeof(meminfo));
1358 if (copy_to_user(optval, &meminfo, len))
1359 return -EFAULT;
1361 goto lenout;
1364 #ifdef CONFIG_NET_RX_BUSY_POLL
1365 case SO_INCOMING_NAPI_ID:
1366 v.val = READ_ONCE(sk->sk_napi_id);
1368 /* aggregate non-NAPI IDs down to 0 */
1369 if (v.val < MIN_NAPI_ID)
1370 v.val = 0;
1372 break;
1373 #endif
1375 case SO_COOKIE:
1376 lv = sizeof(u64);
1377 if (len < lv)
1378 return -EINVAL;
1379 v.val64 = sock_gen_cookie(sk);
1380 break;
1382 case SO_ZEROCOPY:
1383 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1384 break;
1386 case SO_TXTIME:
1387 lv = sizeof(v.txtime);
1388 v.txtime.clockid = sk->sk_clockid;
1389 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1390 SOF_TXTIME_DEADLINE_MODE : 0;
1391 v.txtime.flags |= sk->sk_txtime_report_errors ?
1392 SOF_TXTIME_REPORT_ERRORS : 0;
1393 break;
1395 default:
1396 /* We implement the SO_SNDLOWAT etc to not be settable
1397 * (1003.1g 7).
1399 return -ENOPROTOOPT;
1402 if (len > lv)
1403 len = lv;
1404 if (copy_to_user(optval, &v, len))
1405 return -EFAULT;
1406 lenout:
1407 if (put_user(len, optlen))
1408 return -EFAULT;
1409 return 0;
1413 * Initialize an sk_lock.
1415 * (We also register the sk_lock with the lock validator.)
1417 static inline void sock_lock_init(struct sock *sk)
1419 if (sk->sk_kern_sock)
1420 sock_lock_init_class_and_name(
1422 af_family_kern_slock_key_strings[sk->sk_family],
1423 af_family_kern_slock_keys + sk->sk_family,
1424 af_family_kern_key_strings[sk->sk_family],
1425 af_family_kern_keys + sk->sk_family);
1426 else
1427 sock_lock_init_class_and_name(
1429 af_family_slock_key_strings[sk->sk_family],
1430 af_family_slock_keys + sk->sk_family,
1431 af_family_key_strings[sk->sk_family],
1432 af_family_keys + sk->sk_family);
1436 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1437 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1438 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1440 static void sock_copy(struct sock *nsk, const struct sock *osk)
1442 #ifdef CONFIG_SECURITY_NETWORK
1443 void *sptr = nsk->sk_security;
1444 #endif
1445 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1447 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1448 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1450 #ifdef CONFIG_SECURITY_NETWORK
1451 nsk->sk_security = sptr;
1452 security_sk_clone(osk, nsk);
1453 #endif
1456 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1457 int family)
1459 struct sock *sk;
1460 struct kmem_cache *slab;
1462 slab = prot->slab;
1463 if (slab != NULL) {
1464 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1465 if (!sk)
1466 return sk;
1467 if (priority & __GFP_ZERO)
1468 sk_prot_clear_nulls(sk, prot->obj_size);
1469 } else
1470 sk = kmalloc(prot->obj_size, priority);
1472 if (sk != NULL) {
1473 if (security_sk_alloc(sk, family, priority))
1474 goto out_free;
1476 if (!try_module_get(prot->owner))
1477 goto out_free_sec;
1478 sk_tx_queue_clear(sk);
1481 return sk;
1483 out_free_sec:
1484 security_sk_free(sk);
1485 out_free:
1486 if (slab != NULL)
1487 kmem_cache_free(slab, sk);
1488 else
1489 kfree(sk);
1490 return NULL;
1493 static void sk_prot_free(struct proto *prot, struct sock *sk)
1495 struct kmem_cache *slab;
1496 struct module *owner;
1498 owner = prot->owner;
1499 slab = prot->slab;
1501 cgroup_sk_free(&sk->sk_cgrp_data);
1502 mem_cgroup_sk_free(sk);
1503 security_sk_free(sk);
1504 if (slab != NULL)
1505 kmem_cache_free(slab, sk);
1506 else
1507 kfree(sk);
1508 module_put(owner);
1512 * sk_alloc - All socket objects are allocated here
1513 * @net: the applicable net namespace
1514 * @family: protocol family
1515 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1516 * @prot: struct proto associated with this new sock instance
1517 * @kern: is this to be a kernel socket?
1519 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1520 struct proto *prot, int kern)
1522 struct sock *sk;
1524 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1525 if (sk) {
1526 sk->sk_family = family;
1528 * See comment in struct sock definition to understand
1529 * why we need sk_prot_creator -acme
1531 sk->sk_prot = sk->sk_prot_creator = prot;
1532 sk->sk_kern_sock = kern;
1533 sock_lock_init(sk);
1534 sk->sk_net_refcnt = kern ? 0 : 1;
1535 if (likely(sk->sk_net_refcnt)) {
1536 get_net(net);
1537 sock_inuse_add(net, 1);
1540 sock_net_set(sk, net);
1541 refcount_set(&sk->sk_wmem_alloc, 1);
1543 mem_cgroup_sk_alloc(sk);
1544 cgroup_sk_alloc(&sk->sk_cgrp_data);
1545 sock_update_classid(&sk->sk_cgrp_data);
1546 sock_update_netprioidx(&sk->sk_cgrp_data);
1549 return sk;
1551 EXPORT_SYMBOL(sk_alloc);
1553 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1554 * grace period. This is the case for UDP sockets and TCP listeners.
1556 static void __sk_destruct(struct rcu_head *head)
1558 struct sock *sk = container_of(head, struct sock, sk_rcu);
1559 struct sk_filter *filter;
1561 if (sk->sk_destruct)
1562 sk->sk_destruct(sk);
1564 filter = rcu_dereference_check(sk->sk_filter,
1565 refcount_read(&sk->sk_wmem_alloc) == 0);
1566 if (filter) {
1567 sk_filter_uncharge(sk, filter);
1568 RCU_INIT_POINTER(sk->sk_filter, NULL);
1570 if (rcu_access_pointer(sk->sk_reuseport_cb))
1571 reuseport_detach_sock(sk);
1573 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1575 if (atomic_read(&sk->sk_omem_alloc))
1576 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1577 __func__, atomic_read(&sk->sk_omem_alloc));
1579 if (sk->sk_frag.page) {
1580 put_page(sk->sk_frag.page);
1581 sk->sk_frag.page = NULL;
1584 if (sk->sk_peer_cred)
1585 put_cred(sk->sk_peer_cred);
1586 put_pid(sk->sk_peer_pid);
1587 if (likely(sk->sk_net_refcnt))
1588 put_net(sock_net(sk));
1589 sk_prot_free(sk->sk_prot_creator, sk);
1592 void sk_destruct(struct sock *sk)
1594 if (sock_flag(sk, SOCK_RCU_FREE))
1595 call_rcu(&sk->sk_rcu, __sk_destruct);
1596 else
1597 __sk_destruct(&sk->sk_rcu);
1600 static void __sk_free(struct sock *sk)
1602 if (likely(sk->sk_net_refcnt))
1603 sock_inuse_add(sock_net(sk), -1);
1605 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1606 sock_diag_broadcast_destroy(sk);
1607 else
1608 sk_destruct(sk);
1611 void sk_free(struct sock *sk)
1614 * We subtract one from sk_wmem_alloc so we can tell whether
1615 * some packets are still in some tx queue.
1616 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1618 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1619 __sk_free(sk);
1621 EXPORT_SYMBOL(sk_free);
1623 static void sk_init_common(struct sock *sk)
1625 skb_queue_head_init(&sk->sk_receive_queue);
1626 skb_queue_head_init(&sk->sk_write_queue);
1627 skb_queue_head_init(&sk->sk_error_queue);
1629 rwlock_init(&sk->sk_callback_lock);
1630 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1631 af_rlock_keys + sk->sk_family,
1632 af_family_rlock_key_strings[sk->sk_family]);
1633 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1634 af_wlock_keys + sk->sk_family,
1635 af_family_wlock_key_strings[sk->sk_family]);
1636 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1637 af_elock_keys + sk->sk_family,
1638 af_family_elock_key_strings[sk->sk_family]);
1639 lockdep_set_class_and_name(&sk->sk_callback_lock,
1640 af_callback_keys + sk->sk_family,
1641 af_family_clock_key_strings[sk->sk_family]);
1645 * sk_clone_lock - clone a socket, and lock its clone
1646 * @sk: the socket to clone
1647 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1649 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1651 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1653 struct sock *newsk;
1654 bool is_charged = true;
1656 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1657 if (newsk != NULL) {
1658 struct sk_filter *filter;
1660 sock_copy(newsk, sk);
1662 newsk->sk_prot_creator = sk->sk_prot;
1664 /* SANITY */
1665 if (likely(newsk->sk_net_refcnt))
1666 get_net(sock_net(newsk));
1667 sk_node_init(&newsk->sk_node);
1668 sock_lock_init(newsk);
1669 bh_lock_sock(newsk);
1670 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1671 newsk->sk_backlog.len = 0;
1673 atomic_set(&newsk->sk_rmem_alloc, 0);
1675 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1677 refcount_set(&newsk->sk_wmem_alloc, 1);
1678 atomic_set(&newsk->sk_omem_alloc, 0);
1679 sk_init_common(newsk);
1681 newsk->sk_dst_cache = NULL;
1682 newsk->sk_dst_pending_confirm = 0;
1683 newsk->sk_wmem_queued = 0;
1684 newsk->sk_forward_alloc = 0;
1685 atomic_set(&newsk->sk_drops, 0);
1686 newsk->sk_send_head = NULL;
1687 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1688 atomic_set(&newsk->sk_zckey, 0);
1690 sock_reset_flag(newsk, SOCK_DONE);
1691 mem_cgroup_sk_alloc(newsk);
1692 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1694 rcu_read_lock();
1695 filter = rcu_dereference(sk->sk_filter);
1696 if (filter != NULL)
1697 /* though it's an empty new sock, the charging may fail
1698 * if sysctl_optmem_max was changed between the creation of
1699 * the original socket and the cloning
1701 is_charged = sk_filter_charge(newsk, filter);
1702 RCU_INIT_POINTER(newsk->sk_filter, filter);
1703 rcu_read_unlock();
1705 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1706 /* We need to make sure that we don't uncharge the new
1707 * socket if we couldn't charge it in the first place
1708 * as otherwise we uncharge the parent's filter.
1710 if (!is_charged)
1711 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1712 sk_free_unlock_clone(newsk);
1713 newsk = NULL;
1714 goto out;
1716 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1718 newsk->sk_err = 0;
1719 newsk->sk_err_soft = 0;
1720 newsk->sk_priority = 0;
1721 newsk->sk_incoming_cpu = raw_smp_processor_id();
1722 atomic64_set(&newsk->sk_cookie, 0);
1723 if (likely(newsk->sk_net_refcnt))
1724 sock_inuse_add(sock_net(newsk), 1);
1727 * Before updating sk_refcnt, we must commit prior changes to memory
1728 * (Documentation/RCU/rculist_nulls.txt for details)
1730 smp_wmb();
1731 refcount_set(&newsk->sk_refcnt, 2);
1734 * Increment the counter in the same struct proto as the master
1735 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1736 * is the same as sk->sk_prot->socks, as this field was copied
1737 * with memcpy).
1739 * This _changes_ the previous behaviour, where
1740 * tcp_create_openreq_child always incremented the
1741 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1742 * to be taken into account in all callers. -acme
1744 sk_refcnt_debug_inc(newsk);
1745 sk_set_socket(newsk, NULL);
1746 newsk->sk_wq = NULL;
1748 if (newsk->sk_prot->sockets_allocated)
1749 sk_sockets_allocated_inc(newsk);
1751 if (sock_needs_netstamp(sk) &&
1752 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1753 net_enable_timestamp();
1755 out:
1756 return newsk;
1758 EXPORT_SYMBOL_GPL(sk_clone_lock);
1760 void sk_free_unlock_clone(struct sock *sk)
1762 /* It is still a raw copy of the parent, so invalidate
1763 * its destructor and do a plain sk_free() */
1764 sk->sk_destruct = NULL;
1765 bh_unlock_sock(sk);
1766 sk_free(sk);
1768 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1770 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1772 u32 max_segs = 1;
1774 sk_dst_set(sk, dst);
1775 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1776 if (sk->sk_route_caps & NETIF_F_GSO)
1777 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1778 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1779 if (sk_can_gso(sk)) {
1780 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1781 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1782 } else {
1783 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1784 sk->sk_gso_max_size = dst->dev->gso_max_size;
1785 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1788 sk->sk_gso_max_segs = max_segs;
1790 EXPORT_SYMBOL_GPL(sk_setup_caps);
1793 * Simple resource managers for sockets.
1798 * Write buffer destructor automatically called from kfree_skb.
1800 void sock_wfree(struct sk_buff *skb)
1802 struct sock *sk = skb->sk;
1803 unsigned int len = skb->truesize;
1805 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1807 * Keep a reference on sk_wmem_alloc; it will be released
1808 * after the sk_write_space() call.
1810 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1811 sk->sk_write_space(sk);
1812 len = 1;
1815 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1816 * could not do because of in-flight packets
1818 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1819 __sk_free(sk);
1821 EXPORT_SYMBOL(sock_wfree);
1823 /* This variant of sock_wfree() is used by TCP,
1824 * since it sets SOCK_USE_WRITE_QUEUE.
1826 void __sock_wfree(struct sk_buff *skb)
1828 struct sock *sk = skb->sk;
1830 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1831 __sk_free(sk);
1834 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1836 skb_orphan(skb);
1837 skb->sk = sk;
1838 #ifdef CONFIG_INET
1839 if (unlikely(!sk_fullsock(sk))) {
1840 skb->destructor = sock_edemux;
1841 sock_hold(sk);
1842 return;
1844 #endif
1845 skb->destructor = sock_wfree;
1846 skb_set_hash_from_sk(skb, sk);
1848 * We used to take a refcount on sk, but the following operation
1849 * is enough to guarantee sk_free() won't free this sock until
1850 * all in-flight packets are completed.
1852 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1854 EXPORT_SYMBOL(skb_set_owner_w);
1856 /* This helper is used by netem, as it can hold packets in its
1857 * delay queue. We want to allow the owner socket to send more
1858 * packets, as if they were already TX completed by a typical driver.
1859 * But we also want to keep skb->sk set because some packet schedulers
1860 * rely on it (sch_fq for example).
1862 void skb_orphan_partial(struct sk_buff *skb)
1864 if (skb_is_tcp_pure_ack(skb))
1865 return;
1867 if (skb->destructor == sock_wfree
1868 #ifdef CONFIG_INET
1869 || skb->destructor == tcp_wfree
1870 #endif
1872 struct sock *sk = skb->sk;
1874 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1875 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1876 skb->destructor = sock_efree;
1878 } else {
1879 skb_orphan(skb);
1882 EXPORT_SYMBOL(skb_orphan_partial);
1885 * Read buffer destructor automatically called from kfree_skb.
1887 void sock_rfree(struct sk_buff *skb)
1889 struct sock *sk = skb->sk;
1890 unsigned int len = skb->truesize;
1892 atomic_sub(len, &sk->sk_rmem_alloc);
1893 sk_mem_uncharge(sk, len);
1895 EXPORT_SYMBOL(sock_rfree);
1898 * Buffer destructor for skbs that are not used directly in read or write
1899 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1901 void sock_efree(struct sk_buff *skb)
1903 sock_put(skb->sk);
1905 EXPORT_SYMBOL(sock_efree);
1907 kuid_t sock_i_uid(struct sock *sk)
1909 kuid_t uid;
1911 read_lock_bh(&sk->sk_callback_lock);
1912 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1913 read_unlock_bh(&sk->sk_callback_lock);
1914 return uid;
1916 EXPORT_SYMBOL(sock_i_uid);
1918 unsigned long sock_i_ino(struct sock *sk)
1920 unsigned long ino;
1922 read_lock_bh(&sk->sk_callback_lock);
1923 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1924 read_unlock_bh(&sk->sk_callback_lock);
1925 return ino;
1927 EXPORT_SYMBOL(sock_i_ino);
1930 * Allocate a skb from the socket's send buffer.
1932 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1933 gfp_t priority)
1935 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1936 struct sk_buff *skb = alloc_skb(size, priority);
1937 if (skb) {
1938 skb_set_owner_w(skb, sk);
1939 return skb;
1942 return NULL;
1944 EXPORT_SYMBOL(sock_wmalloc);
1946 static void sock_ofree(struct sk_buff *skb)
1948 struct sock *sk = skb->sk;
1950 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1953 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1954 gfp_t priority)
1956 struct sk_buff *skb;
1958 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1959 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1960 sysctl_optmem_max)
1961 return NULL;
1963 skb = alloc_skb(size, priority);
1964 if (!skb)
1965 return NULL;
1967 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1968 skb->sk = sk;
1969 skb->destructor = sock_ofree;
1970 return skb;
1974 * Allocate a memory block from the socket's option memory buffer.
1976 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1978 if ((unsigned int)size <= sysctl_optmem_max &&
1979 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1980 void *mem;
1981 /* First do the add, to avoid the race if kmalloc
1982 * might sleep.
1984 atomic_add(size, &sk->sk_omem_alloc);
1985 mem = kmalloc(size, priority);
1986 if (mem)
1987 return mem;
1988 atomic_sub(size, &sk->sk_omem_alloc);
1990 return NULL;
1992 EXPORT_SYMBOL(sock_kmalloc);
1994 /* Free an option memory block. Note, we actually want the inline
1995 * here as this allows gcc to detect the nullify and fold away the
1996 * condition entirely.
1998 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1999 const bool nullify)
2001 if (WARN_ON_ONCE(!mem))
2002 return;
2003 if (nullify)
2004 kzfree(mem);
2005 else
2006 kfree(mem);
2007 atomic_sub(size, &sk->sk_omem_alloc);
2010 void sock_kfree_s(struct sock *sk, void *mem, int size)
2012 __sock_kfree_s(sk, mem, size, false);
2014 EXPORT_SYMBOL(sock_kfree_s);
2016 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2018 __sock_kfree_s(sk, mem, size, true);
2020 EXPORT_SYMBOL(sock_kzfree_s);
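/*
 * Illustrative pairing (assumed caller, not from this file): memory
 * obtained with sock_kmalloc() is charged to sk_omem_alloc and must be
 * released with sock_kfree_s() (or sock_kzfree_s() for sensitive data)
 * using the same size:
 *
 *	struct foo *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * "struct foo" is a placeholder for whatever per-socket option state a
 * protocol keeps.
 */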
2022 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2023 I think these locks should be removed for datagram sockets.
2025 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2027 DEFINE_WAIT(wait);
2029 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2030 for (;;) {
2031 if (!timeo)
2032 break;
2033 if (signal_pending(current))
2034 break;
2035 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2036 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2037 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2038 break;
2039 if (sk->sk_shutdown & SEND_SHUTDOWN)
2040 break;
2041 if (sk->sk_err)
2042 break;
2043 timeo = schedule_timeout(timeo);
2045 finish_wait(sk_sleep(sk), &wait);
2046 return timeo;
2051 * Generic send/receive buffer handlers
2054 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2055 unsigned long data_len, int noblock,
2056 int *errcode, int max_page_order)
2058 struct sk_buff *skb;
2059 long timeo;
2060 int err;
2062 timeo = sock_sndtimeo(sk, noblock);
2063 for (;;) {
2064 err = sock_error(sk);
2065 if (err != 0)
2066 goto failure;
2068 err = -EPIPE;
2069 if (sk->sk_shutdown & SEND_SHUTDOWN)
2070 goto failure;
2072 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2073 break;
2075 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2076 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2077 err = -EAGAIN;
2078 if (!timeo)
2079 goto failure;
2080 if (signal_pending(current))
2081 goto interrupted;
2082 timeo = sock_wait_for_wmem(sk, timeo);
2084 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2085 errcode, sk->sk_allocation);
2086 if (skb)
2087 skb_set_owner_w(skb, sk);
2088 return skb;
2090 interrupted:
2091 err = sock_intr_errno(timeo);
2092 failure:
2093 *errcode = err;
2094 return NULL;
2096 EXPORT_SYMBOL(sock_alloc_send_pskb);
2098 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2099 int noblock, int *errcode)
2101 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2103 EXPORT_SYMBOL(sock_alloc_send_skb);
2105 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2106 struct sockcm_cookie *sockc)
2108 u32 tsflags;
2110 switch (cmsg->cmsg_type) {
2111 case SO_MARK:
2112 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2113 return -EPERM;
2114 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2115 return -EINVAL;
2116 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2117 break;
2118 case SO_TIMESTAMPING:
2119 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2120 return -EINVAL;
2122 tsflags = *(u32 *)CMSG_DATA(cmsg);
2123 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2124 return -EINVAL;
2126 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2127 sockc->tsflags |= tsflags;
2128 break;
2129 case SCM_TXTIME:
2130 if (!sock_flag(sk, SOCK_TXTIME))
2131 return -EINVAL;
2132 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2133 return -EINVAL;
2134 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2135 break;
2136 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2137 case SCM_RIGHTS:
2138 case SCM_CREDENTIALS:
2139 break;
2140 default:
2141 return -EINVAL;
2143 return 0;
2145 EXPORT_SYMBOL(__sock_cmsg_send);
2147 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2148 struct sockcm_cookie *sockc)
2150 struct cmsghdr *cmsg;
2151 int ret;
2153 for_each_cmsghdr(cmsg, msg) {
2154 if (!CMSG_OK(msg, cmsg))
2155 return -EINVAL;
2156 if (cmsg->cmsg_level != SOL_SOCKET)
2157 continue;
2158 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2159 if (ret)
2160 return ret;
2162 return 0;
2164 EXPORT_SYMBOL(sock_cmsg_send);
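/*
 * Userspace view of the SOL_SOCKET control messages handled above
 * (illustrative sketch): a sender with CAP_NET_ADMIN can set the packet
 * mark for a single sendmsg() call, alongside the usual msg_iov payload:
 *
 *	char buf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = { .msg_control = buf,
 *			      .msg_controllen = sizeof(buf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	*(uint32_t *)CMSG_DATA(cmsg) = 42;
 *
 * Without CAP_NET_ADMIN in the socket's user namespace the cmsg is
 * rejected with EPERM, exactly as in __sock_cmsg_send() above.
 */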
2166 static void sk_enter_memory_pressure(struct sock *sk)
2168 if (!sk->sk_prot->enter_memory_pressure)
2169 return;
2171 sk->sk_prot->enter_memory_pressure(sk);
2174 static void sk_leave_memory_pressure(struct sock *sk)
2176 if (sk->sk_prot->leave_memory_pressure) {
2177 sk->sk_prot->leave_memory_pressure(sk);
2178 } else {
2179 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2181 if (memory_pressure && *memory_pressure)
2182 *memory_pressure = 0;
2186 /* On 32bit arches, an skb frag is limited to 2^15 */
2187 #define SKB_FRAG_PAGE_ORDER get_order(32768)
2190 * skb_page_frag_refill - check that a page_frag contains enough room
2191 * @sz: minimum size of the fragment we want to get
2192 * @pfrag: pointer to page_frag
2193 * @gfp: priority for memory allocation
2195 * Note: While this allocator tries to use high order pages, there is
2196 * no guarantee that allocations succeed. Therefore, @sz MUST be
2197 * less than or equal to PAGE_SIZE.
2199 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2201 if (pfrag->page) {
2202 if (page_ref_count(pfrag->page) == 1) {
2203 pfrag->offset = 0;
2204 return true;
2206 if (pfrag->offset + sz <= pfrag->size)
2207 return true;
2208 put_page(pfrag->page);
2211 pfrag->offset = 0;
2212 if (SKB_FRAG_PAGE_ORDER) {
2213 /* Avoid direct reclaim but allow kswapd to wake */
2214 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2215 __GFP_COMP | __GFP_NOWARN |
2216 __GFP_NORETRY,
2217 SKB_FRAG_PAGE_ORDER);
2218 if (likely(pfrag->page)) {
2219 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2220 return true;
2223 pfrag->page = alloc_page(gfp);
2224 if (likely(pfrag->page)) {
2225 pfrag->size = PAGE_SIZE;
2226 return true;
2228 return false;
2230 EXPORT_SYMBOL(skb_page_frag_refill);
2232 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2234 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2235 return true;
2237 sk_enter_memory_pressure(sk);
2238 sk_stream_moderate_sndbuf(sk);
2239 return false;
2241 EXPORT_SYMBOL(sk_page_frag_refill);
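/*
 * Illustrative sketch, not part of the original sock.c: typical use of the
 * per-socket page_frag allocator when appending user data to an skb's paged
 * area, loosely modelled on what stream protocols do in their sendmsg paths.
 * example_append_page_frag() is invented; the helpers it calls are real.
 */
static int example_append_page_frag(struct sock *sk, struct sk_buff *skb,
				    struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;	/* caller would wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
			   pfrag->page, pfrag->offset, copy);
	get_page(pfrag->page);
	pfrag->offset += copy;
	return copy;
}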
2243 static void __lock_sock(struct sock *sk)
2244 __releases(&sk->sk_lock.slock)
2245 __acquires(&sk->sk_lock.slock)
2247 DEFINE_WAIT(wait);
2249 for (;;) {
2250 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2251 TASK_UNINTERRUPTIBLE);
2252 spin_unlock_bh(&sk->sk_lock.slock);
2253 schedule();
2254 spin_lock_bh(&sk->sk_lock.slock);
2255 if (!sock_owned_by_user(sk))
2256 break;
2258 finish_wait(&sk->sk_lock.wq, &wait);
2261 void __release_sock(struct sock *sk)
2262 __releases(&sk->sk_lock.slock)
2263 __acquires(&sk->sk_lock.slock)
2265 struct sk_buff *skb, *next;
2267 while ((skb = sk->sk_backlog.head) != NULL) {
2268 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2270 spin_unlock_bh(&sk->sk_lock.slock);
2272 do {
2273 next = skb->next;
2274 prefetch(next);
2275 WARN_ON_ONCE(skb_dst_is_noref(skb));
2276 skb_mark_not_on_list(skb);
2277 sk_backlog_rcv(sk, skb);
2279 cond_resched();
2281 skb = next;
2282 } while (skb != NULL);
2284 spin_lock_bh(&sk->sk_lock.slock);
2288  * Doing the zeroing here guarantees we cannot loop forever
2289 * while a wild producer attempts to flood us.
2291 sk->sk_backlog.len = 0;
2294 void __sk_flush_backlog(struct sock *sk)
2296 spin_lock_bh(&sk->sk_lock.slock);
2297 __release_sock(sk);
2298 spin_unlock_bh(&sk->sk_lock.slock);
2302 * sk_wait_data - wait for data to arrive at sk_receive_queue
2303 * @sk: sock to wait on
2304 * @timeo: for how long
2305 * @skb: last skb seen on sk_receive_queue
2307  * Now socket state including sk->sk_err is changed only under the lock,
2308  * hence we may omit checks after joining the wait queue.
2309  * We check the receive queue before schedule() only as an optimization;
2310  * it is very likely that release_sock() added new data.
2312 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2314 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2315 int rc;
2317 add_wait_queue(sk_sleep(sk), &wait);
2318 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2319 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2320 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2321 remove_wait_queue(sk_sleep(sk), &wait);
2322 return rc;
2324 EXPORT_SYMBOL(sk_wait_data);
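/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * recvmsg loop using sk_wait_data() to sleep until the receive queue
 * changes or the timeout expires. The socket lock is assumed to be held,
 * as the comment above requires. example_wait_for_skb() is invented.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags,
					    int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}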
2327 * __sk_mem_raise_allocated - increase memory_allocated
2328 * @sk: socket
2329 * @size: memory size to allocate
2330 * @amt: pages to allocate
2331 * @kind: allocation type
2333 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2335 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2337 struct proto *prot = sk->sk_prot;
2338 long allocated = sk_memory_allocated_add(sk, amt);
2339 bool charged = true;
2341 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2342 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2343 goto suppress_allocation;
2345 /* Under limit. */
2346 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2347 sk_leave_memory_pressure(sk);
2348 return 1;
2351 /* Under pressure. */
2352 if (allocated > sk_prot_mem_limits(sk, 1))
2353 sk_enter_memory_pressure(sk);
2355 /* Over hard limit. */
2356 if (allocated > sk_prot_mem_limits(sk, 2))
2357 goto suppress_allocation;
2359 /* guarantee minimum buffer size under pressure */
2360 if (kind == SK_MEM_RECV) {
2361 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2362 return 1;
2364 } else { /* SK_MEM_SEND */
2365 int wmem0 = sk_get_wmem0(sk, prot);
2367 if (sk->sk_type == SOCK_STREAM) {
2368 if (sk->sk_wmem_queued < wmem0)
2369 return 1;
2370 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2371 return 1;
2375 if (sk_has_memory_pressure(sk)) {
2376 int alloc;
2378 if (!sk_under_memory_pressure(sk))
2379 return 1;
2380 alloc = sk_sockets_allocated_read_positive(sk);
2381 if (sk_prot_mem_limits(sk, 2) > alloc *
2382 sk_mem_pages(sk->sk_wmem_queued +
2383 atomic_read(&sk->sk_rmem_alloc) +
2384 sk->sk_forward_alloc))
2385 return 1;
2388 suppress_allocation:
2390 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2391 sk_stream_moderate_sndbuf(sk);
2393 /* Fail only if socket is _under_ its sndbuf.
2394  * In this case we cannot block, so we have to fail.
2396 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2397 return 1;
2400 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2401 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2403 sk_memory_allocated_sub(sk, amt);
2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2406 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2408 return 0;
2410 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2413 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2414 * @sk: socket
2415 * @size: memory size to allocate
2416 * @kind: allocation type
2418 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2419 * rmem allocation. This function assumes that protocols which have
2420  * memory_pressure use sk_wmem_queued for write buffer accounting.
2422 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2424 int ret, amt = sk_mem_pages(size);
2426 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2427 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2428 if (!ret)
2429 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2430 return ret;
2432 EXPORT_SYMBOL(__sk_mem_schedule);
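/*
 * Illustrative sketch, not part of the original sock.c: protocols normally
 * reach __sk_mem_schedule() through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers, which only call it when sk_forward_alloc cannot already cover
 * the request. example_charge_send() is an invented transmit-side charge.
 */
static bool example_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;		/* over the protocol's memory limits */

	sk_mem_charge(sk, skb->truesize);	/* consume forward allocation */
	sk->sk_wmem_queued += skb->truesize;
	return true;
}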
2435 * __sk_mem_reduce_allocated - reclaim memory_allocated
2436 * @sk: socket
2437 * @amount: number of quanta
2439 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2441 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2443 sk_memory_allocated_sub(sk, amount);
2445 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2446 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2448 if (sk_under_memory_pressure(sk) &&
2449 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2450 sk_leave_memory_pressure(sk);
2452 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2455 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2456 * @sk: socket
2457 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2459 void __sk_mem_reclaim(struct sock *sk, int amount)
2461 amount >>= SK_MEM_QUANTUM_SHIFT;
2462 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2463 __sk_mem_reduce_allocated(sk, amount);
2465 EXPORT_SYMBOL(__sk_mem_reclaim);
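/*
 * Illustrative sketch, not part of the original sock.c: the release side of
 * the accounting above. sk_mem_uncharge() grows sk_forward_alloc again and
 * sk_mem_reclaim() hands whole quanta back via __sk_mem_reclaim() once
 * enough has accumulated. example_uncharge_recv() is invented; real code
 * usually does the uncharge from the skb destructor (sock_rfree).
 */
static void example_uncharge_recv(struct sock *sk, struct sk_buff *skb)
{
	sk_mem_uncharge(sk, skb->truesize);	/* give back forward alloc */
	__kfree_skb(skb);
	sk_mem_reclaim(sk);			/* return surplus quanta */
}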
2467 int sk_set_peek_off(struct sock *sk, int val)
2469 sk->sk_peek_off = val;
2470 return 0;
2472 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2475 * Set of default routines for initialising struct proto_ops when
2476 * the protocol does not support a particular function. In certain
2477 * cases where it makes no sense for a protocol to have a "do nothing"
2478 * function, some default processing is provided.
2481 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2483 return -EOPNOTSUPP;
2485 EXPORT_SYMBOL(sock_no_bind);
2487 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2488 int len, int flags)
2490 return -EOPNOTSUPP;
2492 EXPORT_SYMBOL(sock_no_connect);
2494 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2496 return -EOPNOTSUPP;
2498 EXPORT_SYMBOL(sock_no_socketpair);
2500 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2501 bool kern)
2503 return -EOPNOTSUPP;
2505 EXPORT_SYMBOL(sock_no_accept);
2507 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2508 int peer)
2510 return -EOPNOTSUPP;
2512 EXPORT_SYMBOL(sock_no_getname);
2514 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2516 return -EOPNOTSUPP;
2518 EXPORT_SYMBOL(sock_no_ioctl);
2520 int sock_no_listen(struct socket *sock, int backlog)
2522 return -EOPNOTSUPP;
2524 EXPORT_SYMBOL(sock_no_listen);
2526 int sock_no_shutdown(struct socket *sock, int how)
2528 return -EOPNOTSUPP;
2530 EXPORT_SYMBOL(sock_no_shutdown);
2532 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2533 char __user *optval, unsigned int optlen)
2535 return -EOPNOTSUPP;
2537 EXPORT_SYMBOL(sock_no_setsockopt);
2539 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2540 char __user *optval, int __user *optlen)
2542 return -EOPNOTSUPP;
2544 EXPORT_SYMBOL(sock_no_getsockopt);
2546 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2548 return -EOPNOTSUPP;
2550 EXPORT_SYMBOL(sock_no_sendmsg);
2552 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2554 return -EOPNOTSUPP;
2556 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2558 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2559 int flags)
2561 return -EOPNOTSUPP;
2563 EXPORT_SYMBOL(sock_no_recvmsg);
2565 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2567 /* Mirror missing mmap method error code */
2568 return -ENODEV;
2570 EXPORT_SYMBOL(sock_no_mmap);
2572 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2574 ssize_t res;
2575 struct msghdr msg = {.msg_flags = flags};
2576 struct kvec iov;
2577 char *kaddr = kmap(page);
2578 iov.iov_base = kaddr + offset;
2579 iov.iov_len = size;
2580 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2581 kunmap(page);
2582 return res;
2584 EXPORT_SYMBOL(sock_no_sendpage);
2586 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2587 int offset, size_t size, int flags)
2589 ssize_t res;
2590 struct msghdr msg = {.msg_flags = flags};
2591 struct kvec iov;
2592 char *kaddr = kmap(page);
2594 iov.iov_base = kaddr + offset;
2595 iov.iov_len = size;
2596 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2597 kunmap(page);
2598 return res;
2600 EXPORT_SYMBOL(sock_no_sendpage_locked);
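/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * protocol wiring the sock_no_*() stubs into its proto_ops for every
 * operation it does not implement, so userspace gets a consistent
 * -EOPNOTSUPP. example_proto_ops and the PF_UNSPEC family are placeholders;
 * .release, .poll and the data-path hooks would normally be real functions.
 */
static const struct proto_ops example_proto_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};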
2603 * Default Socket Callbacks
2606 static void sock_def_wakeup(struct sock *sk)
2608 struct socket_wq *wq;
2610 rcu_read_lock();
2611 wq = rcu_dereference(sk->sk_wq);
2612 if (skwq_has_sleeper(wq))
2613 wake_up_interruptible_all(&wq->wait);
2614 rcu_read_unlock();
2617 static void sock_def_error_report(struct sock *sk)
2619 struct socket_wq *wq;
2621 rcu_read_lock();
2622 wq = rcu_dereference(sk->sk_wq);
2623 if (skwq_has_sleeper(wq))
2624 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2625 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2626 rcu_read_unlock();
2629 static void sock_def_readable(struct sock *sk)
2631 struct socket_wq *wq;
2633 rcu_read_lock();
2634 wq = rcu_dereference(sk->sk_wq);
2635 if (skwq_has_sleeper(wq))
2636 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2637 EPOLLRDNORM | EPOLLRDBAND);
2638 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2639 rcu_read_unlock();
2642 static void sock_def_write_space(struct sock *sk)
2644 struct socket_wq *wq;
2646 rcu_read_lock();
2648 /* Do not wake up a writer until he can make "significant"
2649 * progress. --DaveM
2651 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2652 wq = rcu_dereference(sk->sk_wq);
2653 if (skwq_has_sleeper(wq))
2654 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2655 EPOLLWRNORM | EPOLLWRBAND);
2657 /* Should agree with poll, otherwise some programs break */
2658 if (sock_writeable(sk))
2659 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2662 rcu_read_unlock();
2665 static void sock_def_destruct(struct sock *sk)
2669 void sk_send_sigurg(struct sock *sk)
2671 if (sk->sk_socket && sk->sk_socket->file)
2672 if (send_sigurg(&sk->sk_socket->file->f_owner))
2673 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2675 EXPORT_SYMBOL(sk_send_sigurg);
2677 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2678 unsigned long expires)
2680 if (!mod_timer(timer, expires))
2681 sock_hold(sk);
2683 EXPORT_SYMBOL(sk_reset_timer);
2685 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2687 if (del_timer(timer))
2688 __sock_put(sk);
2690 EXPORT_SYMBOL(sk_stop_timer);
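/*
 * Illustrative sketch, not part of the original sock.c: sk_reset_timer()
 * takes a socket reference when it arms an inactive timer and
 * sk_stop_timer() drops it when a pending timer is deleted, so the sock
 * cannot be freed while a timer is outstanding. The example_* helpers are
 * invented and assume sk->sk_timer was set up with timer_setup().
 */
static void example_arm_timer(struct sock *sk, unsigned long delay)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
}

static void example_cancel_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}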
2692 void sock_init_data(struct socket *sock, struct sock *sk)
2694 sk_init_common(sk);
2695 sk->sk_send_head = NULL;
2697 timer_setup(&sk->sk_timer, NULL, 0);
2699 sk->sk_allocation = GFP_KERNEL;
2700 sk->sk_rcvbuf = sysctl_rmem_default;
2701 sk->sk_sndbuf = sysctl_wmem_default;
2702 sk->sk_state = TCP_CLOSE;
2703 sk_set_socket(sk, sock);
2705 sock_set_flag(sk, SOCK_ZAPPED);
2707 if (sock) {
2708 sk->sk_type = sock->type;
2709 sk->sk_wq = sock->wq;
2710 sock->sk = sk;
2711 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2712 } else {
2713 sk->sk_wq = NULL;
2714 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2717 rwlock_init(&sk->sk_callback_lock);
2718 if (sk->sk_kern_sock)
2719 lockdep_set_class_and_name(
2720 &sk->sk_callback_lock,
2721 af_kern_callback_keys + sk->sk_family,
2722 af_family_kern_clock_key_strings[sk->sk_family]);
2723 else
2724 lockdep_set_class_and_name(
2725 &sk->sk_callback_lock,
2726 af_callback_keys + sk->sk_family,
2727 af_family_clock_key_strings[sk->sk_family]);
2729 sk->sk_state_change = sock_def_wakeup;
2730 sk->sk_data_ready = sock_def_readable;
2731 sk->sk_write_space = sock_def_write_space;
2732 sk->sk_error_report = sock_def_error_report;
2733 sk->sk_destruct = sock_def_destruct;
2735 sk->sk_frag.page = NULL;
2736 sk->sk_frag.offset = 0;
2737 sk->sk_peek_off = -1;
2739 sk->sk_peer_pid = NULL;
2740 sk->sk_peer_cred = NULL;
2741 sk->sk_write_pending = 0;
2742 sk->sk_rcvlowat = 1;
2743 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2744 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2746 sk->sk_stamp = SK_DEFAULT_STAMP;
2747 #if BITS_PER_LONG==32
2748 seqlock_init(&sk->sk_stamp_seq);
2749 #endif
2750 atomic_set(&sk->sk_zckey, 0);
2752 #ifdef CONFIG_NET_RX_BUSY_POLL
2753 sk->sk_napi_id = 0;
2754 sk->sk_ll_usec = sysctl_net_busy_read;
2755 #endif
2757 sk->sk_max_pacing_rate = ~0UL;
2758 sk->sk_pacing_rate = ~0UL;
2759 sk->sk_pacing_shift = 10;
2760 sk->sk_incoming_cpu = -1;
2762 sk_rx_queue_clear(sk);
2764 * Before updating sk_refcnt, we must commit prior changes to memory
2765 * (Documentation/RCU/rculist_nulls.txt for details)
2767 smp_wmb();
2768 refcount_set(&sk->sk_refcnt, 1);
2769 atomic_set(&sk->sk_drops, 0);
2771 EXPORT_SYMBOL(sock_init_data);
2773 void lock_sock_nested(struct sock *sk, int subclass)
2775 might_sleep();
2776 spin_lock_bh(&sk->sk_lock.slock);
2777 if (sk->sk_lock.owned)
2778 __lock_sock(sk);
2779 sk->sk_lock.owned = 1;
2780 spin_unlock(&sk->sk_lock.slock);
2782 * The sk_lock has mutex_lock() semantics here:
2784 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2785 local_bh_enable();
2787 EXPORT_SYMBOL(lock_sock_nested);
2789 void release_sock(struct sock *sk)
2791 spin_lock_bh(&sk->sk_lock.slock);
2792 if (sk->sk_backlog.tail)
2793 __release_sock(sk);
2795 	/* Warning: release_cb() might need to release sk ownership,
2796 	 * i.e. call sock_release_ownership(sk) before us.
2798 if (sk->sk_prot->release_cb)
2799 sk->sk_prot->release_cb(sk);
2801 sock_release_ownership(sk);
2802 if (waitqueue_active(&sk->sk_lock.wq))
2803 wake_up(&sk->sk_lock.wq);
2804 spin_unlock_bh(&sk->sk_lock.slock);
2806 EXPORT_SYMBOL(release_sock);
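/*
 * Illustrative sketch, not part of the original sock.c: the canonical
 * process-context pattern around the socket lock. Packets arriving in
 * softirq context while the lock is owned are queued on the backlog and
 * replayed by __release_sock() when release_sock() runs.
 * example_locked_update() and the field it touches are arbitrary.
 */
static void example_locked_update(struct sock *sk, int val)
{
	lock_sock(sk);			/* may sleep, takes ownership */
	sk->sk_rcvlowat = val ? : 1;	/* any state change done under the lock */
	release_sock(sk);		/* processes any queued backlog */
}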
2809 * lock_sock_fast - fast version of lock_sock
2810 * @sk: socket
2812  * This version should be used for very small sections, where the process won't block.
2813  * Returns false if the fast path is taken:
2815  *   sk_lock.slock locked, owned = 0, BH disabled
2817  * Returns true if the slow path is taken:
2819  *   sk_lock.slock unlocked, owned = 1, BH enabled
2821 bool lock_sock_fast(struct sock *sk)
2823 might_sleep();
2824 spin_lock_bh(&sk->sk_lock.slock);
2826 if (!sk->sk_lock.owned)
2828 * Note : We must disable BH
2830 return false;
2832 __lock_sock(sk);
2833 sk->sk_lock.owned = 1;
2834 spin_unlock(&sk->sk_lock.slock);
2836 * The sk_lock has mutex_lock() semantics here:
2838 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2839 local_bh_enable();
2840 return true;
2842 EXPORT_SYMBOL(lock_sock_fast);
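/*
 * Illustrative sketch, not part of the original sock.c: lock_sock_fast() is
 * meant for short, non-sleeping critical sections. The boolean it returns
 * must be passed to unlock_sock_fast() so the matching unlock path (spin
 * unlock vs. full release_sock()) is taken. example_read_drops() is invented.
 */
static int example_read_drops(struct sock *sk)
{
	bool slow;
	int drops;

	slow = lock_sock_fast(sk);
	drops = atomic_read(&sk->sk_drops);	/* small, non-sleeping work */
	unlock_sock_fast(sk, slow);
	return drops;
}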
2844 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2846 struct timeval tv;
2848 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2849 tv = ktime_to_timeval(sock_read_timestamp(sk));
2850 if (tv.tv_sec == -1)
2851 return -ENOENT;
2852 if (tv.tv_sec == 0) {
2853 ktime_t kt = ktime_get_real();
2854 sock_write_timestamp(sk, kt);
2855 tv = ktime_to_timeval(kt);
2857 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2859 EXPORT_SYMBOL(sock_get_timestamp);
2861 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2863 struct timespec ts;
2865 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2866 ts = ktime_to_timespec(sock_read_timestamp(sk));
2867 if (ts.tv_sec == -1)
2868 return -ENOENT;
2869 if (ts.tv_sec == 0) {
2870 ktime_t kt = ktime_get_real();
2871 sock_write_timestamp(sk, kt);
2872 ts = ktime_to_timespec(sk->sk_stamp);
2874 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2876 EXPORT_SYMBOL(sock_get_timestampns);
2878 void sock_enable_timestamp(struct sock *sk, int flag)
2880 if (!sock_flag(sk, flag)) {
2881 unsigned long previous_flags = sk->sk_flags;
2883 sock_set_flag(sk, flag);
2885 * we just set one of the two flags which require net
2886 * time stamping, but time stamping might have been on
2887 * already because of the other one
2889 if (sock_needs_netstamp(sk) &&
2890 !(previous_flags & SK_FLAGS_TIMESTAMP))
2891 net_enable_timestamp();
2895 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2896 int level, int type)
2898 struct sock_exterr_skb *serr;
2899 struct sk_buff *skb;
2900 int copied, err;
2902 err = -EAGAIN;
2903 skb = sock_dequeue_err_skb(sk);
2904 if (skb == NULL)
2905 goto out;
2907 copied = skb->len;
2908 if (copied > len) {
2909 msg->msg_flags |= MSG_TRUNC;
2910 copied = len;
2912 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2913 if (err)
2914 goto out_free_skb;
2916 sock_recv_timestamp(msg, sk, skb);
2918 serr = SKB_EXT_ERR(skb);
2919 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2921 msg->msg_flags |= MSG_ERRQUEUE;
2922 err = copied;
2924 out_free_skb:
2925 kfree_skb(skb);
2926 out:
2927 return err;
2929 EXPORT_SYMBOL(sock_recv_errqueue);
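/*
 * Illustrative sketch, not part of the original sock.c: a hypothetical
 * recvmsg handler servicing MSG_ERRQUEUE by delegating to
 * sock_recv_errqueue(). The SOL_SOCKET/SO_TIMESTAMPING cmsg level/type here
 * is only an example; real protocols pass their own values (e.g. packet
 * sockets use SOL_PACKET). example_recvmsg_err() is invented.
 */
static int example_recvmsg_err(struct sock *sk, struct msghdr *msg,
			       size_t len, int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_SOCKET, SO_TIMESTAMPING);
	/* the normal receive path would continue here */
	return -EAGAIN;
}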
2932  * Get a socket option on a socket.
2934  * FIX: POSIX 1003.1g is very ambiguous here. It states that
2935  * asynchronous errors should be reported by getsockopt. We assume
2936  * this means if you specify SO_ERROR (otherwise what's the point of it).
2938 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2939 char __user *optval, int __user *optlen)
2941 struct sock *sk = sock->sk;
2943 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2945 EXPORT_SYMBOL(sock_common_getsockopt);
2947 #ifdef CONFIG_COMPAT
2948 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2949 char __user *optval, int __user *optlen)
2951 struct sock *sk = sock->sk;
2953 if (sk->sk_prot->compat_getsockopt != NULL)
2954 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2955 optval, optlen);
2956 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2958 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2959 #endif
2961 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2962 int flags)
2964 struct sock *sk = sock->sk;
2965 int addr_len = 0;
2966 int err;
2968 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2969 flags & ~MSG_DONTWAIT, &addr_len);
2970 if (err >= 0)
2971 msg->msg_namelen = addr_len;
2972 return err;
2974 EXPORT_SYMBOL(sock_common_recvmsg);
2977 * Set socket options on an inet socket.
2979 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2980 char __user *optval, unsigned int optlen)
2982 struct sock *sk = sock->sk;
2984 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2986 EXPORT_SYMBOL(sock_common_setsockopt);
2988 #ifdef CONFIG_COMPAT
2989 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2990 char __user *optval, unsigned int optlen)
2992 struct sock *sk = sock->sk;
2994 if (sk->sk_prot->compat_setsockopt != NULL)
2995 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2996 optval, optlen);
2997 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2999 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3000 #endif
3002 void sk_common_release(struct sock *sk)
3004 if (sk->sk_prot->destroy)
3005 sk->sk_prot->destroy(sk);
3008 	 * Observation: when sk_common_release() is called, processes have
3009 	 * no access to the socket, but the network stack still does.
3010 * Step one, detach it from networking:
3012 * A. Remove from hash tables.
3015 sk->sk_prot->unhash(sk);
3018 	 * At this point the socket cannot receive new packets, but it is possible
3019 	 * that some packets are in flight because some CPU runs the receiver and
3020 	 * did a hash table lookup before we unhashed the socket. They will reach
3021 	 * the receive queue and will be purged by the socket destructor.
3023 	 * Also we still have packets pending on the receive queue and, probably,
3024 	 * our own packets waiting in device queues. sock_destroy will drain the
3025 	 * receive queue, but transmitted packets will delay socket destruction
3026 	 * until the last reference is released.
3029 sock_orphan(sk);
3031 xfrm_sk_free_policy(sk);
3033 sk_refcnt_debug_release(sk);
3035 sock_put(sk);
3037 EXPORT_SYMBOL(sk_common_release);
3039 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3041 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3043 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3044 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3045 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3046 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3047 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3048 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3049 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3050 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3051 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3054 #ifdef CONFIG_PROC_FS
3055 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
3056 struct prot_inuse {
3057 int val[PROTO_INUSE_NR];
3060 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3062 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3064 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3066 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3068 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3070 int cpu, idx = prot->inuse_idx;
3071 int res = 0;
3073 for_each_possible_cpu(cpu)
3074 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3076 return res >= 0 ? res : 0;
3078 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
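/*
 * Illustrative sketch, not part of the original sock.c: protocols typically
 * bump the per-cpu "in use" counter from their hash()/unhash() callbacks so
 * that /proc/net/protocols reports live socket counts. The example_* hooks
 * are invented; only sock_prot_inuse_add() is the real API.
 */
static void example_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup tables ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}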
3080 static void sock_inuse_add(struct net *net, int val)
3082 this_cpu_add(*net->core.sock_inuse, val);
3085 int sock_inuse_get(struct net *net)
3087 int cpu, res = 0;
3089 for_each_possible_cpu(cpu)
3090 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3092 return res;
3095 EXPORT_SYMBOL_GPL(sock_inuse_get);
3097 static int __net_init sock_inuse_init_net(struct net *net)
3099 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3100 if (net->core.prot_inuse == NULL)
3101 return -ENOMEM;
3103 net->core.sock_inuse = alloc_percpu(int);
3104 if (net->core.sock_inuse == NULL)
3105 goto out;
3107 return 0;
3109 out:
3110 free_percpu(net->core.prot_inuse);
3111 return -ENOMEM;
3114 static void __net_exit sock_inuse_exit_net(struct net *net)
3116 free_percpu(net->core.prot_inuse);
3117 free_percpu(net->core.sock_inuse);
3120 static struct pernet_operations net_inuse_ops = {
3121 .init = sock_inuse_init_net,
3122 .exit = sock_inuse_exit_net,
3125 static __init int net_inuse_init(void)
3127 if (register_pernet_subsys(&net_inuse_ops))
3128 panic("Cannot initialize net inuse counters");
3130 return 0;
3133 core_initcall(net_inuse_init);
3135 static void assign_proto_idx(struct proto *prot)
3137 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3139 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3140 pr_err("PROTO_INUSE_NR exhausted\n");
3141 return;
3144 set_bit(prot->inuse_idx, proto_inuse_idx);
3147 static void release_proto_idx(struct proto *prot)
3149 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3150 clear_bit(prot->inuse_idx, proto_inuse_idx);
3152 #else
3153 static inline void assign_proto_idx(struct proto *prot)
3157 static inline void release_proto_idx(struct proto *prot)
3161 static void sock_inuse_add(struct net *net, int val)
3164 #endif
3166 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3168 if (!rsk_prot)
3169 return;
3170 kfree(rsk_prot->slab_name);
3171 rsk_prot->slab_name = NULL;
3172 kmem_cache_destroy(rsk_prot->slab);
3173 rsk_prot->slab = NULL;
3176 static int req_prot_init(const struct proto *prot)
3178 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3180 if (!rsk_prot)
3181 return 0;
3183 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3184 prot->name);
3185 if (!rsk_prot->slab_name)
3186 return -ENOMEM;
3188 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3189 rsk_prot->obj_size, 0,
3190 SLAB_ACCOUNT | prot->slab_flags,
3191 NULL);
3193 if (!rsk_prot->slab) {
3194 pr_crit("%s: Can't create request sock SLAB cache!\n",
3195 prot->name);
3196 return -ENOMEM;
3198 return 0;
3201 int proto_register(struct proto *prot, int alloc_slab)
3203 if (alloc_slab) {
3204 prot->slab = kmem_cache_create_usercopy(prot->name,
3205 prot->obj_size, 0,
3206 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3207 prot->slab_flags,
3208 prot->useroffset, prot->usersize,
3209 NULL);
3211 if (prot->slab == NULL) {
3212 pr_crit("%s: Can't create sock SLAB cache!\n",
3213 prot->name);
3214 goto out;
3217 if (req_prot_init(prot))
3218 goto out_free_request_sock_slab;
3220 if (prot->twsk_prot != NULL) {
3221 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3223 if (prot->twsk_prot->twsk_slab_name == NULL)
3224 goto out_free_request_sock_slab;
3226 prot->twsk_prot->twsk_slab =
3227 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3228 prot->twsk_prot->twsk_obj_size,
3230 SLAB_ACCOUNT |
3231 prot->slab_flags,
3232 NULL);
3233 if (prot->twsk_prot->twsk_slab == NULL)
3234 goto out_free_timewait_sock_slab_name;
3238 mutex_lock(&proto_list_mutex);
3239 list_add(&prot->node, &proto_list);
3240 assign_proto_idx(prot);
3241 mutex_unlock(&proto_list_mutex);
3242 return 0;
3244 out_free_timewait_sock_slab_name:
3245 kfree(prot->twsk_prot->twsk_slab_name);
3246 out_free_request_sock_slab:
3247 req_prot_cleanup(prot->rsk_prot);
3249 kmem_cache_destroy(prot->slab);
3250 prot->slab = NULL;
3251 out:
3252 return -ENOBUFS;
3254 EXPORT_SYMBOL(proto_register);
3256 void proto_unregister(struct proto *prot)
3258 mutex_lock(&proto_list_mutex);
3259 release_proto_idx(prot);
3260 list_del(&prot->node);
3261 mutex_unlock(&proto_list_mutex);
3263 kmem_cache_destroy(prot->slab);
3264 prot->slab = NULL;
3266 req_prot_cleanup(prot->rsk_prot);
3268 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3269 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3270 kfree(prot->twsk_prot->twsk_slab_name);
3271 prot->twsk_prot->twsk_slab = NULL;
3274 EXPORT_SYMBOL(proto_unregister);
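/*
 * Illustrative sketch, not part of the original sock.c: how a hypothetical
 * protocol module would register and unregister its struct proto so sock
 * allocation gets a dedicated, correctly sized slab cache. "EXAMPLE" and
 * the example_* symbols are placeholders; obj_size would normally be the
 * size of the protocol's own sock-derived structure.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => allocate a slab */
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}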
3276 int sock_load_diag_module(int family, int protocol)
3278 if (!protocol) {
3279 if (!sock_is_registered(family))
3280 return -ENOENT;
3282 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3283 NETLINK_SOCK_DIAG, family);
3286 #ifdef CONFIG_INET
3287 if (family == AF_INET &&
3288 protocol != IPPROTO_RAW &&
3289 !rcu_access_pointer(inet_protos[protocol]))
3290 return -ENOENT;
3291 #endif
3293 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3294 NETLINK_SOCK_DIAG, family, protocol);
3296 EXPORT_SYMBOL(sock_load_diag_module);
3298 #ifdef CONFIG_PROC_FS
3299 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3300 __acquires(proto_list_mutex)
3302 mutex_lock(&proto_list_mutex);
3303 return seq_list_start_head(&proto_list, *pos);
3306 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3308 return seq_list_next(v, &proto_list, pos);
3311 static void proto_seq_stop(struct seq_file *seq, void *v)
3312 __releases(proto_list_mutex)
3314 mutex_unlock(&proto_list_mutex);
3317 static char proto_method_implemented(const void *method)
3319 return method == NULL ? 'n' : 'y';
3321 static long sock_prot_memory_allocated(struct proto *proto)
3323 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3326 static char *sock_prot_memory_pressure(struct proto *proto)
3328 return proto->memory_pressure != NULL ?
3329 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3332 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3335 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3336 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3337 proto->name,
3338 proto->obj_size,
3339 sock_prot_inuse_get(seq_file_net(seq), proto),
3340 sock_prot_memory_allocated(proto),
3341 sock_prot_memory_pressure(proto),
3342 proto->max_header,
3343 proto->slab == NULL ? "no" : "yes",
3344 module_name(proto->owner),
3345 proto_method_implemented(proto->close),
3346 proto_method_implemented(proto->connect),
3347 proto_method_implemented(proto->disconnect),
3348 proto_method_implemented(proto->accept),
3349 proto_method_implemented(proto->ioctl),
3350 proto_method_implemented(proto->init),
3351 proto_method_implemented(proto->destroy),
3352 proto_method_implemented(proto->shutdown),
3353 proto_method_implemented(proto->setsockopt),
3354 proto_method_implemented(proto->getsockopt),
3355 proto_method_implemented(proto->sendmsg),
3356 proto_method_implemented(proto->recvmsg),
3357 proto_method_implemented(proto->sendpage),
3358 proto_method_implemented(proto->bind),
3359 proto_method_implemented(proto->backlog_rcv),
3360 proto_method_implemented(proto->hash),
3361 proto_method_implemented(proto->unhash),
3362 proto_method_implemented(proto->get_port),
3363 proto_method_implemented(proto->enter_memory_pressure));
3366 static int proto_seq_show(struct seq_file *seq, void *v)
3368 if (v == &proto_list)
3369 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3370 "protocol",
3371 "size",
3372 "sockets",
3373 "memory",
3374 "press",
3375 "maxhdr",
3376 "slab",
3377 "module",
3378 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3379 else
3380 proto_seq_printf(seq, list_entry(v, struct proto, node));
3381 return 0;
3384 static const struct seq_operations proto_seq_ops = {
3385 .start = proto_seq_start,
3386 .next = proto_seq_next,
3387 .stop = proto_seq_stop,
3388 .show = proto_seq_show,
3391 static __net_init int proto_init_net(struct net *net)
3393 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3394 sizeof(struct seq_net_private)))
3395 return -ENOMEM;
3397 return 0;
3400 static __net_exit void proto_exit_net(struct net *net)
3402 remove_proc_entry("protocols", net->proc_net);
3406 static __net_initdata struct pernet_operations proto_net_ops = {
3407 .init = proto_init_net,
3408 .exit = proto_exit_net,
3411 static int __init proto_init(void)
3413 return register_pernet_subsys(&proto_net_ops);
3416 subsys_initcall(proto_init);
3418 #endif /* PROC_FS */
3420 #ifdef CONFIG_NET_RX_BUSY_POLL
3421 bool sk_busy_loop_end(void *p, unsigned long start_time)
3423 struct sock *sk = p;
3425 return !skb_queue_empty(&sk->sk_receive_queue) ||
3426 sk_busy_loop_timeout(sk, start_time);
3428 EXPORT_SYMBOL(sk_busy_loop_end);
3429 #endif /* CONFIG_NET_RX_BUSY_POLL */