net/unix/af_unix.c
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko Eißfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen
 *					socket has been reached. This won't
 *					break old apps and it will avoid huge
 *					amounts of socks hashed (this for
 *					unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are
 *					introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now
 *					(2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
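
/*
 * For illustration only (a userspace sketch, not part of this file):
 * binding to an abstract address differs from a filesystem bind only
 * in the leading NUL byte and in passing the exact address length.
 * The name "example" below is made up.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */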
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}
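
/* The socket table has 2 * UNIX_HASH_SIZE buckets: the first half holds
 * bound sockets (hashed by name or inode number), the second half holds
 * unbound sockets, hashed above by their kernel address.
 */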
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
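
/* A dgram receiver counts as "full" once its receive queue exceeds the
 * backlog set at listen() time (or the sysctl_max_dgram_qlen default from
 * unix_create1()); senders that are not the receiver's own peer must then
 * back off, see unix_dgram_sendmsg() and unix_dgram_poll().
 */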
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NUL
 *		  terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist. However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
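
/* Note: unix_mkname() returns the significant address length. For
 * filesystem names the path is forcibly NUL terminated and *hashp is
 * left untouched; only abstract names (leading NUL byte) are hashed,
 * via unix_hash_fold() above.
 */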
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */
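
/* Roughly, for a client C connected to an unreplying server S (e.g. a
 * logger bound to /dev/log), the relay works like this:
 *
 *	C hits flow control	-> C->peer_wake queued on S->peer_wait
 *	S dequeues a datagram	-> wake_up on S->peer_wait
 *				-> unix_dgram_peer_wake_relay()
 *				-> wake_up on sk_sleep(C), C may write again
 */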
static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key);

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   POLLOUT |
				   POLLWRNORM |
				   POLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	if (unix_recvq_full(other))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
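
/* That is, the socket counts as writable while no more than a quarter of
 * sk_sndbuf is committed to not-yet-consumed skbs (the shift by 2
 * multiplies the outstanding write memory by four).
 */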
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is
		 * disconnected, we signal error. Messages are lost.
		 * Do not signal this when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}
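
/* Hypothetical userspace use of the hook above (illustrative only):
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	// peek from off, advance it
 */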
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * dont trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
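
/* Autobound names live in the abstract namespace and look like "\0XXXXX",
 * a NUL followed by five hex digits, so at most 0xFFFFF names can be
 * autobound per socket type before the -ENOSPC above triggers.
 */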
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is a tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
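
/* Sketch of the corresponding userspace call (illustrative only):
 *
 *	int sv[2];
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are each other's peer, with
 *		// SS_CONNECTED/TCP_ESTABLISHED set as above
 *	}
 */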
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->fp[i]);
	return max_level;
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;
	int sk_locked;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
						   sent, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}
static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
				  struct msghdr *msg, size_t size,
				  int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (unlikely(err)) {
		/* recvmsg() in non blocking mode is supposed to return -EAGAIN,
		 * but sk_rcvtimeo is not honored by mutex_lock_interruptible()
		 */
		err = noblock ? -EAGAIN : -ERESTARTSYS;
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
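
/* A stream skb may be consumed across several reads; UNIXCB(skb).consumed
 * records how much of it has already been copied out, so unix_skb_len()
 * is the amount still readable.
 */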
2083 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
2084 struct msghdr *msg, size_t size,
2085 int flags)
2087 struct sock_iocb *siocb = kiocb_to_siocb(iocb);
2088 struct scm_cookie tmp_scm;
2089 struct sock *sk = sock->sk;
2090 struct unix_sock *u = unix_sk(sk);
2091 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
2092 int copied = 0;
2093 int noblock = flags & MSG_DONTWAIT;
2094 int check_creds = 0;
2095 int target;
2096 int err = 0;
2097 long timeo;
2098 int skip;
2100 err = -EINVAL;
2101 if (sk->sk_state != TCP_ESTABLISHED)
2102 goto out;
2104 err = -EOPNOTSUPP;
2105 if (flags&MSG_OOB)
2106 goto out;
2108 target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
2109 timeo = sock_rcvtimeo(sk, noblock);
2111 /* Lock the socket to prevent queue disordering
2112 * while sleeps in memcpy_tomsg
2115 if (!siocb->scm) {
2116 siocb->scm = &tmp_scm;
2117 memset(&tmp_scm, 0, sizeof(tmp_scm));
2120 mutex_lock(&u->readlock);
2122 if (flags & MSG_PEEK)
2123 skip = sk_peek_offset(sk, flags);
2124 else
2125 skip = 0;
2127 do {
2128 int chunk;
2129 struct sk_buff *skb, *last;
2131 unix_state_lock(sk);
2132 if (sock_flag(sk, SOCK_DEAD)) {
2133 err = -ECONNRESET;
2134 goto unlock;
2136 last = skb = skb_peek(&sk->sk_receive_queue);
2137 again:
2138 if (skb == NULL) {
2139 unix_sk(sk)->recursion_level = 0;
2140 if (copied >= target)
2141 goto unlock;
2144 * POSIX 1003.1g mandates this order.
2147 err = sock_error(sk);
2148 if (err)
2149 goto unlock;
2150 if (sk->sk_shutdown & RCV_SHUTDOWN)
2151 goto unlock;
2153 unix_state_unlock(sk);
2154 err = -EAGAIN;
2155 if (!timeo)
2156 break;
2157 mutex_unlock(&u->readlock);
2159 timeo = unix_stream_data_wait(sk, timeo, last);
2161 if (signal_pending(current)) {
2162 err = sock_intr_errno(timeo);
2163 goto out;
2166 mutex_lock(&u->readlock);
2167 continue;
2168 unlock:
2169 unix_state_unlock(sk);
2170 break;
2173 while (skip >= unix_skb_len(skb)) {
2174 skip -= unix_skb_len(skb);
2175 last = skb;
2176 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2177 if (!skb)
2178 goto again;
		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if ((UNIXCB(skb).pid != siocb->scm->pid) ||
			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr) {
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
					    msg->msg_iov, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (siocb->scm->fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}

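/* shutdown(2) handler. Records the shutdown mode on this socket and, for
 * connection-oriented types, mirrors it onto the peer (a local SHUT_WR
 * becomes the peer's RCV_SHUTDOWN) so that blocked readers and writers
 * on both ends are woken.
 */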
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

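/* Bytes available for reading, as reported by the SIOCINQ ioctl: for
 * stream and seqpacket sockets the sum of unread bytes across the whole
 * receive queue, for datagram sockets the size of the next packet only.
 */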
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

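/* Bytes queued for sending but not yet consumed by the receiver, as
 * reported by the SIOCOUTQ ioctl; this is simply the skb memory still
 * charged to the sender (sk_wmem_alloc).
 */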
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

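/* poll(2)/select(2) handler for stream and seqpacket sockets. Socket
 * state is sampled without taking the state lock; sock_poll_wait()
 * registers the waiter so a concurrent state change re-triggers the
 * poll.
 */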
static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}

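/* Datagram variant of poll: in addition to the usual checks, a connected
 * sender is only reported writable while the peer's receive queue has
 * room, and unix_dgram_peer_wake_me() queues us on the peer so we are
 * woken when space frees up.
 */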
static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int mask, writable;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

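/* The seq_file position is a single loff_t, so the iterator packs both
 * coordinates into it: the hash bucket index in the high bits and the
 * 1-based offset within that bucket in the low BUCKET_SPACE bits. For
 * example, assuming UNIX_HASH_BITS == 8 (defined in the AF_UNIX
 * headers), BUCKET_SPACE is 54 on a 64-bit build and the third socket
 * of bucket 5 sits at pos == (5UL << 54) | 3.
 */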
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

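/* Emit one /proc/net/unix line per socket: kernel address, refcount,
 * protocol (always 0 for AF_UNIX), flags, type, state and inode,
 * followed by the bound path. Abstract addresses are printed with a
 * leading '@' in place of the NUL byte that starts the name.
 */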
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			atomic_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i]);
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

static int unix_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &unix_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};


static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

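/* Module init: register the unix_sock slab-backed proto, the PF_UNIX
 * socket family and the per-network-namespace setup (sysctls and the
 * /proc/net/unix file) declared above.
 */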
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);