/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;
static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
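/*
 * Bound sockets live in the lower half of the table: abstract ones hashed
 * by name (see unix_hash_fold()), filesystem ones by inode number. Unbound
 * sockets sit in the upper half, hashed by the socket pointer above. That
 * split is what UNIX_ABSTRACT() relies on: an abstract address hash is
 * always < UNIX_HASH_SIZE, while a filesystem binding stores exactly
 * UNIX_HASH_SIZE in addr->hash (see unix_bind()).
 */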
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
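/*
 * Lockless variant for callers that hold neither the receive queue lock nor
 * the peer's state lock; the annotated reads keep the check a deliberate
 * (benign) data race rather than relying on a stable queue length.
 */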
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- it must not be zero length.
 *		- if it does not start with a zero byte, it must be NUL
 *		  terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	*hashp = 0;

	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist. However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
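/*
 * For illustration: a filesystem binding passes, e.g., sun_path = "/tmp/log"
 * with a terminating NUL, while an abstract binding passes sun_path[0] == 0
 * followed by arbitrary name bytes, their count implied by the sockaddr
 * length. Only the abstract form is hashed here; filesystem names are
 * resolved through the VFS instead (see unix_find_socket_byinode()).
 */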
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
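/*
 * A return of 1 from unix_dgram_peer_wake_me() means the caller must back
 * off: the peer's queue is still full and a relayed wakeup has been armed
 * on its peer_wait queue. A return of 0 means it is safe to proceed.
 */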
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
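/*
 * unix_writable() admits a writer only while at most a quarter of
 * sk_sndbuf is consumed by in-flight write memory (the << 2 above);
 * unix_write_space() below wakes poll()ers once that holds again.
 */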
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
	}

	/* If one link of a bidirectional dgram pipe is disconnected,
	 * we signal error. Messages are lost. Do not do this when the
	 * peer was not connected to us.
	 */
	if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
		other->sk_err = ECONNRESET;
		other->sk_error_report(other);
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
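/*
 * For illustration: autobind assigns an abstract name of a zero byte
 * followed by five hex digits of the rolling counter, e.g. ordernum 0xa2f1
 * (hypothetical value) yields the name "\0" "0a2f1", so at most 0x100000
 * candidates are tried before giving up with -ENOSPC.
 */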
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
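/*
 * The pair below takes two socket state locks in a consistent order (by
 * ascending pointer value, with the _nested annotation to keep lockdep
 * happy) so that concurrent dgram connect/send paths cannot deadlock.
 */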
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
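/*
 * unix_wait_for_peer() is entered with other's state lock held: it queues
 * an exclusive waiter on other's peer_wait, rechecks the flow-control
 * condition, drops the state lock, and sleeps for at most the remaining
 * timeout unless the peer died or shut down reception in the meantime.
 */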
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. That is dangerous because a deadlock is
	   possible. The connect-to-self case and a simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk were TCP_LISTEN we would
	   have checked this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path, visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to the listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
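/*
 * The connect/accept rendezvous above works by embryo: unix_stream_connect()
 * creates the peer sock up front and parks it on the listener's receive
 * queue inside the 1-byte skb; unix_accept() below dequeues that skb and
 * grafts skb->sk onto the accepting socket.
 */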
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = sizeof(short);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}
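/*
 * unix_inflight()/unix_notinflight() maintain the in-flight counts that
 * unix_gc() walks to find and break reference cycles of file descriptors
 * passed over AF_UNIX sockets; attach and detach must stay strictly paired.
 */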
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error return here?
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
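/*
 * In unix_stream_sendpage() below, the "if (false)" block is a backward-jump
 * target: alloc_skb: is only reached via goto once it is known that a fresh
 * skb is needed, and it drops both the peer's state lock and iolock before
 * the potentially sleeping allocation.
 */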
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is the fast path: the tail skb can take the page, so
		 * drop the speculatively allocated skb (consume_skb() does
		 * no harm even when newskb == NULL)
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
					      &err, &last);
		if (skb)
			break;

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable what to do on PEEK. We could:
		   - not return fds - simple, but loses information
		   - return fds, but not return them again on the real
		     read (the old strategy, apparently wrong)
		   - clone fds (chosen here as the most universal
		     solution; see the user-space sketch after this
		     function)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
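/*
 * Illustrative user-space view of the MSG_PEEK choice above (a minimal
 * sketch, assuming a connected AF_UNIX socket sock whose peer already
 * sent one descriptor via SCM_RIGHTS): the peek installs freshly
 * duplicated descriptors, and the real read installs another set, so the
 * caller receives the fd twice.
 *
 *	struct msghdr msg = { 0 };
 *	struct iovec iov = { buf, sizeof(buf) };
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	recvmsg(sock, &msg, MSG_PEEK);		// SCM_RIGHTS: one dup of the fd
 *	msg.msg_controllen = sizeof(cbuf);	// reset after the peek
 *	recvmsg(sock, &msg, 0);			// SCM_RIGHTS again: a second copy
 */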
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
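/*
 * unix_stream_read_generic() walks the receive queue once and hands each
 * chunk to ->recv_actor, so recvmsg() and splice() share all of the
 * queueing, credential and fd-passing logic and differ only in where the
 * payload goes: unix_stream_read_actor() copies into state->msg, while
 * unix_stream_splice_actor() feeds state->pipe (both defined below).
 */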
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while we sleep copying to the msg.
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
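/*
 * User-space sketch of the splice path above (illustrative only,
 * assuming a connected AF_UNIX stream socket sock): payload moves from
 * the socket's receive queue into a pipe without a round trip through
 * user memory.
 *
 *	int pfd[2];
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	// SPLICE_F_NONBLOCK (or O_NONBLOCK on the socket) becomes
 *	// MSG_DONTWAIT in state.flags above.
 *	n = splice(sock, NULL, pfd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 *	// n > 0: bytes moved; n < 0 with EAGAIN: nothing queued yet.
 */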
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
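/*
 * Illustrative user-space view of the mapping above (a minimal sketch):
 * SHUT_WR on one end sets SEND_SHUTDOWN locally and RCV_SHUTDOWN on the
 * peer, so the peer sees EOF while its own writes still work.
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	// half-close the write direction
 *	read(sv[1], &c, 1);		// returns 0: EOF from the peer
 *	write(sv[1], "x", 1);		// still succeeds the other way
 */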
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
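/*
 * Illustrative user-space use of the ioctls handled above (a minimal
 * sketch, assuming an AF_UNIX socket sock): SIOCINQ reports
 * queued-but-unread bytes via unix_inq_len(), and SIOCOUTQ reports
 * bytes still charged to the send buffer via unix_outq_len().
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq;
 *
 *	ioctl(sock, SIOCINQ, &inq);	// bytes waiting in the receive queue
 *	ioctl(sock, SIOCOUTQ, &outq);	// bytes not yet consumed by the peer
 */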
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
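/*
 * The extra peer check above means a datagram sender polls as
 * not-writable while the receiver's queue is full, instead of busy
 * looping on EAGAIN. Minimal user-space sketch (illustrative only,
 * assuming a connected AF_UNIX datagram socket sock):
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLOUT };
 *
 *	// Blocks until the peer drains its receive queue (or an error).
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLOUT)
 *		send(sock, buf, len, MSG_DONTWAIT);
 */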
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
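/*
 * The seq_file position packs (bucket, in-bucket offset) into one
 * loff_t: the bucket lives in the high bits, the 1-based offset in the
 * low BUCKET_SPACE bits. Worked example, assuming BITS_PER_LONG == 64
 * and UNIX_HASH_BITS == 8: BUCKET_SPACE is 64 - 9 - 1 = 54, so
 * set_bucket_offset(3, 2) yields (3 << 54) | 2, and get_bucket() /
 * get_offset() recover 3 and 2 respectively.
 */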
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under unix_table_lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
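/*
 * The resulting /proc/net/unix lines look roughly like this
 * (illustrative sample; addresses, inodes and flags will differ, and
 * %pK prints zeros for unprivileged readers under kptr_restrict):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 *	0000000000000000: 00000002 00000000 00000000 0002 01 12346 @abstract-name
 *
 * Abstract names are printed with a leading '@', and NUL bytes inside
 * an abstract name are rendered as '@' as well.
 */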
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);