net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by the above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge number
  38  *                                      of socks being hashed (this is for
  39  *                                      unix_gc() performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  84
  85 #include <linux/module.h>
  86 #include <linux/kernel.h>
  87 #include <linux/signal.h>
  88 #include <linux/sched.h>
  89 #include <linux/errno.h>
  90 #include <linux/string.h>
  91 #include <linux/stat.h>
  92 #include <linux/dcache.h>
  93 #include <linux/namei.h>
  94 #include <linux/socket.h>
  95 #include <linux/un.h>
  96 #include <linux/fcntl.h>
  97 #include <linux/termios.h>
  98 #include <linux/sockios.h>
  99 #include <linux/net.h>
 100 #include <linux/in.h>
 101 #include <linux/fs.h>
 102 #include <linux/slab.h>
 103 #include <asm/uaccess.h>
 104 #include <linux/skbuff.h>
 105 #include <linux/netdevice.h>
 106 #include <net/net_namespace.h>
 107 #include <net/sock.h>
 108 #include <net/tcp_states.h>
 109 #include <net/af_unix.h>
 110 #include <linux/proc_fs.h>
 111 #include <linux/seq_file.h>
 112 #include <net/scm.h>
 113 #include <linux/init.h>
 114 #include <linux/poll.h>
 115 #include <linux/rtnetlink.h>
 116 #include <linux/mount.h>
 117 #include <net/checksum.h>
 118 #include <linux/security.h>
 119 #include <linux/freezer.h>
 120
 121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 122 EXPORT_SYMBOL_GPL(unix_socket_table);
 123 DEFINE_SPINLOCK(unix_table_lock);
 124 EXPORT_SYMBOL_GPL(unix_table_lock);
 125 static atomic_long_t unix_nr_socks;
 126
 127
 128 static struct hlist_head *unix_sockets_unbound(void *addr)
 129 {
 130         unsigned long hash = (unsigned long)addr;
 131
 132         hash ^= hash >> 16;
 133         hash ^= hash >> 8;
 134         hash %= UNIX_HASH_SIZE;
 135         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 136 }
 137
 138 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 139
 140 #ifdef CONFIG_SECURITY_NETWORK
 141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 142 {
 143         UNIXCB(skb).secid = scm->secid;
 144 }
 145
 146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 147 {
 148         scm->secid = UNIXCB(skb).secid;
 149 }
 150
 151 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 152 {
 153         return (scm->secid == UNIXCB(skb).secid);
 154 }
 155 #else
 156 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 157 { }
 158
 159 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 160 { }
 161
 162 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 163 {
 164         return true;
 165 }
 166 #endif /* CONFIG_SECURITY_NETWORK */
 167
 168 /*
 169  *  SMP locking strategy:
 170  *    hash table is protected with spinlock unix_table_lock
 171  *    each socket state is protected by separate spin lock.
 172  */
 173
 174 static inline unsigned int unix_hash_fold(__wsum n)
 175 {
 176         unsigned int hash = (__force unsigned int)csum_fold(n);
 177
 178         hash ^= hash>>8;
 179         return hash&(UNIX_HASH_SIZE-1);
 180 }
 181
 182 #define unix_peer(sk) (unix_sk(sk)->peer)
 183
 184 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 185 {
 186         return unix_peer(osk) == sk;
 187 }
 188
 189 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 190 {
 191         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 192 }
 193
 194 static inline int unix_recvq_full(struct sock const *sk)
 195 {
 196         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 197 }
 198
 199 struct sock *unix_peer_get(struct sock *s)
 200 {
 201         struct sock *peer;
 202
 203         unix_state_lock(s);
 204         peer = unix_peer(s);
 205         if (peer)
 206                 sock_hold(peer);
 207         unix_state_unlock(s);
 208         return peer;
 209 }
 210 EXPORT_SYMBOL_GPL(unix_peer_get);
 211
 212 static inline void unix_release_addr(struct unix_address *addr)
 213 {
 214         if (atomic_dec_and_test(&addr->refcnt))
 215                 kfree(addr);
 216 }
 217
 218 /*
 219  *      Check unix socket name:
 220  *              - should be not zero length.
 221  *              - if started by not zero, should be NULL terminated (FS object)
 222  *              - if started by zero, it is abstract name.
 223  */
 224
 225 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 226 {
 227         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 228                 return -EINVAL;
 229         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 230                 return -EINVAL;
 231         if (sunaddr->sun_path[0]) {
 232                 /*
 233                  * This may look like an off by one error but it is a bit more
 234                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 235                  * sun_path[108] doesn't as such exist.  However in kernel space
 236                  * we are guaranteed that it is a valid memory location in our
 237                  * kernel address buffer.
 238                  */
 239                 ((char *)sunaddr)[len] = 0;
 240                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 241                 return len;
 242         }
 243
 244         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 245         return len;
 246 }
 247
 248 static void __unix_remove_socket(struct sock *sk)
 249 {
 250         sk_del_node_init(sk);
 251 }
 252
 253 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 254 {
 255         WARN_ON(!sk_unhashed(sk));
 256         sk_add_node(sk, list);
 257 }
 258
 259 static inline void unix_remove_socket(struct sock *sk)
 260 {
 261         spin_lock(&unix_table_lock);
 262         __unix_remove_socket(sk);
 263         spin_unlock(&unix_table_lock);
 264 }
 265
 266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 267 {
 268         spin_lock(&unix_table_lock);
 269         __unix_insert_socket(list, sk);
 270         spin_unlock(&unix_table_lock);
 271 }
 272
 273 static struct sock *__unix_find_socket_byname(struct net *net,
 274                                               struct sockaddr_un *sunname,
 275                                               int len, int type, unsigned int hash)
 276 {
 277         struct sock *s;
 278
 279         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 280                 struct unix_sock *u = unix_sk(s);
 281
 282                 if (!net_eq(sock_net(s), net))
 283                         continue;
 284
 285                 if (u->addr->len == len &&
 286                     !memcmp(u->addr->name, sunname, len))
 287                         goto found;
 288         }
 289         s = NULL;
 290 found:
 291         return s;
 292 }
 293
 294 static inline struct sock *unix_find_socket_byname(struct net *net,
 295                                                    struct sockaddr_un *sunname,
 296                                                    int len, int type,
 297                                                    unsigned int hash)
 298 {
 299         struct sock *s;
 300
 301         spin_lock(&unix_table_lock);
 302         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 303         if (s)
 304                 sock_hold(s);
 305         spin_unlock(&unix_table_lock);
 306         return s;
 307 }
 308
 309 static struct sock *unix_find_socket_byinode(struct inode *i)
 310 {
 311         struct sock *s;
 312
 313         spin_lock(&unix_table_lock);
 314         sk_for_each(s,
 315                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 316                 struct dentry *dentry = unix_sk(s)->path.dentry;
 317
 318                 if (dentry && d_real_inode(dentry) == i) {
 319                         sock_hold(s);
 320                         goto found;
 321                 }
 322         }
 323         s = NULL;
 324 found:
 325         spin_unlock(&unix_table_lock);
 326         return s;
 327 }
 328
 329 /* Support code for asymmetrically connected dgram sockets
 330  *
 331  * If a datagram socket is connected to a socket not itself connected
 332  * to the first socket (eg, /dev/log), clients may only enqueue more
 333  * messages if the present receive queue of the server socket is not
 334  * "too large". This means there's a second writeability condition
 335  * poll and sendmsg need to test. The dgram recv code will do a wake
 336  * up on the peer_wait wait queue of a socket upon reception of a
 337  * datagram which needs to be propagated to sleeping would-be writers
 338  * since these might not have sent anything so far. This can't be
 339  * accomplished via poll_wait because the lifetime of the server
 340  * socket might be less than that of its clients if these break their
 341  * association with it or if the server socket is closed while clients
 342  * are still connected to it and there's no way to inform "a polling
 343  * implementation" that it should let go of a certain wait queue.
 344  *
 345  * In order to propagate a wake up, a wait_queue_t of the client
 346  * socket is enqueued on the peer_wait queue of the server socket
 347  * whose wake function does a wake_up on the ordinary client socket
 348  * wait queue. This connection is established whenever a write (or
 349  * poll for write) hit the flow control condition and broken when the
 350  * association to the server socket is dissolved or after a wake up
 351  * was relayed.
 352  */
 353
 354 static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
 355                                       void *key)
 356 {
 357         struct unix_sock *u;
 358         wait_queue_head_t *u_sleep;
 359
 360         u = container_of(q, struct unix_sock, peer_wake);
 361
 362         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 363                             q);
 364         u->peer_wake.private = NULL;
 365
 366         /* relaying can only happen while the wq still exists */
 367         u_sleep = sk_sleep(&u->sk);
 368         if (u_sleep)
 369                 wake_up_interruptible_poll(u_sleep, key);
 370
 371         return 0;
 372 }
 373
 374 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 375 {
 376         struct unix_sock *u, *u_other;
 377         int rc;
 378
 379         u = unix_sk(sk);
 380         u_other = unix_sk(other);
 381         rc = 0;
 382         spin_lock(&u_other->peer_wait.lock);
 383
 384         if (!u->peer_wake.private) {
 385                 u->peer_wake.private = other;
 386                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 387
 388                 rc = 1;
 389         }
 390
 391         spin_unlock(&u_other->peer_wait.lock);
 392         return rc;
 393 }
 394
 395 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 396                                             struct sock *other)
 397 {
 398         struct unix_sock *u, *u_other;
 399
 400         u = unix_sk(sk);
 401         u_other = unix_sk(other);
 402         spin_lock(&u_other->peer_wait.lock);
 403
 404         if (u->peer_wake.private == other) {
 405                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 406                 u->peer_wake.private = NULL;
 407         }
 408
 409         spin_unlock(&u_other->peer_wait.lock);
 410 }
 411
 412 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 413                                                    struct sock *other)
 414 {
 415         unix_dgram_peer_wake_disconnect(sk, other);
 416         wake_up_interruptible_poll(sk_sleep(sk),
 417                                    POLLOUT |
 418                                    POLLWRNORM |
 419                                    POLLWRBAND);
 420 }
 421
 422 /* preconditions:
 423  *      - unix_peer(sk) == other
 424  *      - association is stable
 425  */
 426 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 427 {
 428         int connected;
 429
 430         connected = unix_dgram_peer_wake_connect(sk, other);
 431
 432         if (unix_recvq_full(other))
 433                 return 1;
 434
 435         if (connected)
 436                 unix_dgram_peer_wake_disconnect(sk, other);
 437
 438         return 0;
 439 }
 440
 441 static int unix_writable(const struct sock *sk)
 442 {
 443         return sk->sk_state != TCP_LISTEN &&
 444                (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 445 }
 446
 447 static void unix_write_space(struct sock *sk)
 448 {
 449         struct socket_wq *wq;
 450
 451         rcu_read_lock();
 452         if (unix_writable(sk)) {
 453                 wq = rcu_dereference(sk->sk_wq);
 454                 if (skwq_has_sleeper(wq))
 455                         wake_up_interruptible_sync_poll(&wq->wait,
 456                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 457                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 458         }
 459         rcu_read_unlock();
 460 }
 461
 462 /* When dgram socket disconnects (or changes its peer), we clear its receive
 463  * queue of packets arrived from previous peer. First, it allows to do
 464  * flow control based only on wmem_alloc; second, sk connected to peer
 465  * may receive messages only from that peer. */
 466 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 467 {
 468         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 469                 skb_queue_purge(&sk->sk_receive_queue);
 470                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 471
 472                 /* If one link of bidirectional dgram pipe is disconnected,
 473                  * we signal error. Messages are lost. Do not make this,
 474                  * when peer was not connected to us.
 475                  */
 476                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 477                         other->sk_err = ECONNRESET;
 478                         other->sk_error_report(other);
 479                 }
 480         }
 481 }
 482
 483 static void unix_sock_destructor(struct sock *sk)
 484 {
 485         struct unix_sock *u = unix_sk(sk);
 486
 487         skb_queue_purge(&sk->sk_receive_queue);
 488
 489         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 490         WARN_ON(!sk_unhashed(sk));
 491         WARN_ON(sk->sk_socket);
 492         if (!sock_flag(sk, SOCK_DEAD)) {
 493                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 494                 return;
 495         }
 496
 497         if (u->addr)
 498                 unix_release_addr(u->addr);
 499
 500         atomic_long_dec(&unix_nr_socks);
 501         local_bh_disable();
 502         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 503         local_bh_enable();
 504 #ifdef UNIX_REFCNT_DEBUG
 505         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 506                 atomic_long_read(&unix_nr_socks));
 507 #endif
 508 }
 509
 510 static void unix_release_sock(struct sock *sk, int embrion)
 511 {
 512         struct unix_sock *u = unix_sk(sk);
 513         struct path path;
 514         struct sock *skpair;
 515         struct sk_buff *skb;
 516         int state;
 517
 518         unix_remove_socket(sk);
 519
 520         /* Clear state */
 521         unix_state_lock(sk);
 522         sock_orphan(sk);
 523         sk->sk_shutdown = SHUTDOWN_MASK;
 524         path         = u->path;
 525         u->path.dentry = NULL;
 526         u->path.mnt = NULL;
 527         state = sk->sk_state;
 528         sk->sk_state = TCP_CLOSE;
 529         unix_state_unlock(sk);
 530
 531         wake_up_interruptible_all(&u->peer_wait);
 532
 533         skpair = unix_peer(sk);
 534
 535         if (skpair != NULL) {
 536                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 537                         unix_state_lock(skpair);
 538                         /* No more writes */
 539                         skpair->sk_shutdown = SHUTDOWN_MASK;
 540                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 541                                 skpair->sk_err = ECONNRESET;
 542                         unix_state_unlock(skpair);
 543                         skpair->sk_state_change(skpair);
 544                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 545                 }
 546
 547                 unix_dgram_peer_wake_disconnect(sk, skpair);
 548                 sock_put(skpair); /* It may now die */
 549                 unix_peer(sk) = NULL;
 550         }
 551
 552         /* Try to flush out this socket. Throw out buffers at least */
 553
 554         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 555                 if (state == TCP_LISTEN)
 556                         unix_release_sock(skb->sk, 1);
 557                 /* passed fds are erased in the kfree_skb hook        */
 558                 UNIXCB(skb).consumed = skb->len;
 559                 kfree_skb(skb);
 560         }
 561
 562         if (path.dentry)
 563                 path_put(&path);
 564
 565         sock_put(sk);
 566
 567         /* ---- Socket is dead now and most probably destroyed ---- */
 568
 569         /*
 570          * Fixme: BSD difference: In BSD all sockets connected to us get
 571          *        ECONNRESET and we die on the spot. In Linux we behave
 572          *        like files and pipes do and wait for the last
 573          *        dereference.
 574          *
 575          * Can't we simply set sock->err?
 576          *
 577          *        What the above comment does talk about? --ANK(980817)
 578          */
 579
 580         if (unix_tot_inflight)
 581                 unix_gc();              /* Garbage collect fds */
 582 }
 583
 584 static void init_peercred(struct sock *sk)
 585 {
 586         put_pid(sk->sk_peer_pid);
 587         if (sk->sk_peer_cred)
 588                 put_cred(sk->sk_peer_cred);
 589         sk->sk_peer_pid  = get_pid(task_tgid(current));
 590         sk->sk_peer_cred = get_current_cred();
 591 }
 592
 593 static void copy_peercred(struct sock *sk, struct sock *peersk)
 594 {
 595         put_pid(sk->sk_peer_pid);
 596         if (sk->sk_peer_cred)
 597                 put_cred(sk->sk_peer_cred);
 598         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 599         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 600 }
 601
 602 static int unix_listen(struct socket *sock, int backlog)
 603 {
 604         int err;
 605         struct sock *sk = sock->sk;
 606         struct unix_sock *u = unix_sk(sk);
 607         struct pid *old_pid = NULL;
 608
 609         err = -EOPNOTSUPP;
 610         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 611                 goto out;       /* Only stream/seqpacket sockets accept */
 612         err = -EINVAL;
 613         if (!u->addr)
 614                 goto out;       /* No listens on an unbound socket */
 615         unix_state_lock(sk);
 616         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 617                 goto out_unlock;
 618         if (backlog > sk->sk_max_ack_backlog)
 619                 wake_up_interruptible_all(&u->peer_wait);
 620         sk->sk_max_ack_backlog  = backlog;
 621         sk->sk_state            = TCP_LISTEN;
 622         /* set credentials so connect can copy them */
 623         init_peercred(sk);
 624         err = 0;
 625
 626 out_unlock:
 627         unix_state_unlock(sk);
 628         put_pid(old_pid);
 629 out:
 630         return err;
 631 }
 632
 633 static int unix_release(struct socket *);
 634 static int unix_bind(struct socket *, struct sockaddr *, int);
 635 static int unix_stream_connect(struct socket *, struct sockaddr *,
 636                                int addr_len, int flags);
 637 static int unix_socketpair(struct socket *, struct socket *);
 638 static int unix_accept(struct socket *, struct socket *, int);
 639 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 640 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 641 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 642                                     poll_table *);
 643 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 644 static int unix_shutdown(struct socket *, int);
 645 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 646 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 647 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
 648                                     size_t size, int flags);
 649 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 650                                        struct pipe_inode_info *, size_t size,
 651                                        unsigned int flags);
 652 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 653 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 654 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 655                               int, int);
 656 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 657 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 658                                   int);
 659
 660 static int unix_set_peek_off(struct sock *sk, int val)
 661 {
 662         struct unix_sock *u = unix_sk(sk);
 663
 664         if (mutex_lock_interruptible(&u->readlock))
 665                 return -EINTR;
 666
 667         sk->sk_peek_off = val;
 668         mutex_unlock(&u->readlock);
 669
 670         return 0;
 671 }
 672
 673
 674 static const struct proto_ops unix_stream_ops = {
 675         .family =       PF_UNIX,
 676         .owner =        THIS_MODULE,
 677         .release =      unix_release,
 678         .bind =         unix_bind,
 679         .connect =      unix_stream_connect,
 680         .socketpair =   unix_socketpair,
 681         .accept =       unix_accept,
 682         .getname =      unix_getname,
 683         .poll =         unix_poll,
 684         .ioctl =        unix_ioctl,
 685         .listen =       unix_listen,
 686         .shutdown =     unix_shutdown,
 687         .setsockopt =   sock_no_setsockopt,
 688         .getsockopt =   sock_no_getsockopt,
 689         .sendmsg =      unix_stream_sendmsg,
 690         .recvmsg =      unix_stream_recvmsg,
 691         .mmap =         sock_no_mmap,
 692         .sendpage =     unix_stream_sendpage,
 693         .splice_read =  unix_stream_splice_read,
 694         .set_peek_off = unix_set_peek_off,
 695 };
 696
 697 static const struct proto_ops unix_dgram_ops = {
 698         .family =       PF_UNIX,
 699         .owner =        THIS_MODULE,
 700         .release =      unix_release,
 701         .bind =         unix_bind,
 702         .connect =      unix_dgram_connect,
 703         .socketpair =   unix_socketpair,
 704         .accept =       sock_no_accept,
 705         .getname =      unix_getname,
 706         .poll =         unix_dgram_poll,
 707         .ioctl =        unix_ioctl,
 708         .listen =       sock_no_listen,
 709         .shutdown =     unix_shutdown,
 710         .setsockopt =   sock_no_setsockopt,
 711         .getsockopt =   sock_no_getsockopt,
 712         .sendmsg =      unix_dgram_sendmsg,
 713         .recvmsg =      unix_dgram_recvmsg,
 714         .mmap =         sock_no_mmap,
 715         .sendpage =     sock_no_sendpage,
 716         .set_peek_off = unix_set_peek_off,
 717 };
 718
 719 static const struct proto_ops unix_seqpacket_ops = {
 720         .family =       PF_UNIX,
 721         .owner =        THIS_MODULE,
 722         .release =      unix_release,
 723         .bind =         unix_bind,
 724         .connect =      unix_stream_connect,
 725         .socketpair =   unix_socketpair,
 726         .accept =       unix_accept,
 727         .getname =      unix_getname,
 728         .poll =         unix_dgram_poll,
 729         .ioctl =        unix_ioctl,
 730         .listen =       unix_listen,
 731         .shutdown =     unix_shutdown,
 732         .setsockopt =   sock_no_setsockopt,
 733         .getsockopt =   sock_no_getsockopt,
 734         .sendmsg =      unix_seqpacket_sendmsg,
 735         .recvmsg =      unix_seqpacket_recvmsg,
 736         .mmap =         sock_no_mmap,
 737         .sendpage =     sock_no_sendpage,
 738         .set_peek_off = unix_set_peek_off,
 739 };
 740
 741 static struct proto unix_proto = {
 742         .name                   = "UNIX",
 743         .owner                  = THIS_MODULE,
 744         .obj_size               = sizeof(struct unix_sock),
 745 };
 746
 747 /*
 748  * AF_UNIX sockets do not interact with hardware, hence they
 749  * dont trigger interrupts - so it's safe for them to have
 750  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 751  * this special lock-class by reinitializing the spinlock key:
 752  */
 753 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 754
 /*
  * unix_create1 - allocate and initialise one AF_UNIX sock.
  *
  * @net:  namespace passed to sk_alloc(); its unx.sysctl_max_dgram_qlen
  *        becomes the new sock's sk_max_ack_backlog.
  * @sock: socket handed to sock_init_data(); unix_stream_connect() passes
  *        NULL here for the not-yet-accepted end of a connection.
  * @kern: forwarded unchanged to sk_alloc().
  *
  * Returns the new sock, or NULL when the global unix_nr_socks count
  * exceeds 2 * get_max_files() or sk_alloc() fails.
  */
 755 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 756 {
 757         struct sock *sk = NULL;
 758         struct unix_sock *u;
 759
             /*
              * Count the socket first and check the limit afterwards; the
              * "out" path below undoes the increment when sk is still NULL.
              */
 760         atomic_long_inc(&unix_nr_socks);
 761         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 762                 goto out;
 763
 764         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 765         if (!sk)
 766                 goto out;
 767
 768         sock_init_data(sock, sk);
             /* Put the receive-queue lock into its own lockdep class. */
 769         lockdep_set_class(&sk->sk_receive_queue.lock,
 770                                 &af_unix_sk_receive_queue_lock_key);
 771
 772         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 773         sk->sk_write_space      = unix_write_space;
 774         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 775         sk->sk_destruct         = unix_sock_destructor;
 776         u         = unix_sk(sk);
 777         u->path.dentry = NULL;
 778         u->path.mnt = NULL;
 779         spin_lock_init(&u->lock);
 780         atomic_long_set(&u->inflight, 0);
 781         INIT_LIST_HEAD(&u->link);
 782         mutex_init(&u->readlock); /* single task reading lock */
 783         init_waitqueue_head(&u->peer_wait);
 784         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
             /*
              * Start on the list returned by unix_sockets_unbound(); bind and
              * autobind later re-insert the sock on a list chosen by hash.
              */
 785         unix_insert_socket(unix_sockets_unbound(sk), sk);
 786 out:
 787         if (sk == NULL)
 788                 atomic_long_dec(&unix_nr_socks);
 789         else {
                     /* Success: account the sock as in use, with BHs disabled. */
 790                 local_bh_disable();
 791                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 792                 local_bh_enable();
 793         }
 794         return sk;
 795 }
 796
 /*
  * unix_create - set up @sock as an AF_UNIX socket.
  *
  * Selects the proto_ops matching sock->type and then creates the
  * underlying sock with unix_create1().
  *
  * Returns 0 on success, -EPROTONOSUPPORT if @protocol is neither 0 nor
  * PF_UNIX, -ESOCKTNOSUPPORT for an unsupported socket type, or -ENOMEM
  * if unix_create1() returned NULL.
  */
 797 static int unix_create(struct net *net, struct socket *sock, int protocol,
 798                        int kern)
 799 {
 800         if (protocol && protocol != PF_UNIX)
 801                 return -EPROTONOSUPPORT;
 802
 803         sock->state = SS_UNCONNECTED;
 804
 805         switch (sock->type) {
 806         case SOCK_STREAM:
 807                 sock->ops = &unix_stream_ops;
 808                 break;
 809                 /*
 810                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 811                  *      nothing uses it.
 812                  */
 813         case SOCK_RAW:
                     /* SOCK_RAW is silently treated as SOCK_DGRAM. */
 814                 sock->type = SOCK_DGRAM;
                     /* fall through */
 815         case SOCK_DGRAM:
 816                 sock->ops = &unix_dgram_ops;
 817                 break;
 818         case SOCK_SEQPACKET:
 819                 sock->ops = &unix_seqpacket_ops;
 820                 break;
 821         default:
 822                 return -ESOCKTNOSUPPORT;
 823         }
 824
 825         return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 826 }
 827
 /*
  * unix_release - release the sock attached to @sock, if any.
  *
  * Hands the sock to unix_release_sock() and clears sock->sk so that a
  * second call finds nothing to do.  Always returns 0.
  */
 828 static int unix_release(struct socket *sock)
 829 {
 830         struct sock *sk = sock->sk;
 831
 832         if (!sk)
 833                 return 0;
 834
 835         unix_release_sock(sk, 0);
 836         sock->sk = NULL;
 837
 838         return 0;
 839 }
 840
 /*
  * unix_autobind - bind @sock to an automatically chosen name.
  *
  * The name is five hex digits taken from the static counter 'ordernum',
  * written at sun_path+1; sun_path[0] is left as zeroed by kzalloc().
  * Candidate names are tried until one is not found by
  * __unix_find_socket_byname().
  *
  * Returns 0 on success or if the socket already has an address, the error
  * from mutex_lock_interruptible(), -ENOMEM if the address cannot be
  * allocated, or -ENOSPC once the retry counter reaches 0xFFFFF.
  */
 841 static int unix_autobind(struct socket *sock)
 842 {
 843         struct sock *sk = sock->sk;
 844         struct net *net = sock_net(sk);
 845         struct unix_sock *u = unix_sk(sk);
 846         static u32 ordernum = 1;
 847         struct unix_address *addr;
 848         int err;
 849         unsigned int retries = 0;
 850
 851         err = mutex_lock_interruptible(&u->readlock);
 852         if (err)
 853                 return err;
 854
             /* Already bound: nothing to do, report success. */
 855         err = 0;
 856         if (u->addr)
 857                 goto out;
 858
 859         err = -ENOMEM;
             /* Room for the header, sun_family (a short) and 16 bytes of name. */
 860         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 861         if (!addr)
 862                 goto out;
 863
 864         addr->name->sun_family = AF_UNIX;
 865         atomic_set(&addr->refcnt, 1);
 866
 867 retry:
             /* Length = hex digits + the leading sun_path[0] byte + sun_family. */
 868         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 869         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 870
 871         spin_lock(&unix_table_lock);
             /* Advance the counter under the table lock, wrapping at 20 bits. */
 872         ordernum = (ordernum+1)&0xFFFFF;
 873
 874         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 875                                       addr->hash)) {
 876                 spin_unlock(&unix_table_lock);
 877                 /*
 878                  * __unix_find_socket_byname() may take a long time if many
 879                  * names are already in use.
 880                  */
 881                 cond_resched();
 882                 /* Give up if all names seem to be in use. */
 883                 if (retries++ == 0xFFFFF) {
 884                         err = -ENOSPC;
 885                         kfree(addr);
 886                         goto out;
 887                 }
 888                 goto retry;
 889         }
 890         addr->hash ^= sk->sk_type;
 891
             /* Still under unix_table_lock: move the sock to its hash list. */
 892         __unix_remove_socket(sk);
 893         u->addr = addr;
 894         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 895         spin_unlock(&unix_table_lock);
 896         err = 0;
 897
 898 out:    mutex_unlock(&u->readlock);
 899         return err;
 900 }
 901
 /*
  * unix_find_other - look up the socket addressed by @sunname.
  *
  * If sun_path[0] is non-zero the name is resolved as a filesystem path:
  * the inode must grant MAY_WRITE and be a socket inode, and the sock is
  * found with unix_find_socket_byinode().  Otherwise the sock is looked up
  * by name and @hash with unix_find_socket_byname().
  *
  * Returns the sock on success; callers in this file release it with
  * sock_put().  On failure returns NULL and stores a negative errno in
  * *@error: the kern_path() or inode_permission() error, -ECONNREFUSED if
  * no matching socket exists, or -EPROTOTYPE if the socket found by path
  * has a different type than @type.
  */
 902 static struct sock *unix_find_other(struct net *net,
 903                                     struct sockaddr_un *sunname, int len,
 904                                     int type, unsigned int hash, int *error)
 905 {
 906         struct sock *u;
 907         struct path path;
 908         int err = 0;
 909
 910         if (sunname->sun_path[0]) {
 911                 struct inode *inode;
 912                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 913                 if (err)
 914                         goto fail;
 915                 inode = d_real_inode(path.dentry);
 916                 err = inode_permission(inode, MAY_WRITE);
 917                 if (err)
 918                         goto put_fail;
 919
 920                 err = -ECONNREFUSED;
 921                 if (!S_ISSOCK(inode->i_mode))
 922                         goto put_fail;
 923                 u = unix_find_socket_byinode(inode);
 924                 if (!u)
 925                         goto put_fail;
 926
                     /* Only update the access time when the type matches. */
 927                 if (u->sk_type == type)
 928                         touch_atime(&path);
 929
 930                 path_put(&path);
 931
 932                 err = -EPROTOTYPE;
 933                 if (u->sk_type != type) {
                             /* path was already put above, so skip put_fail. */
 934                         sock_put(u);
 935                         goto fail;
 936                 }
 937         } else {
 938                 err = -ECONNREFUSED;
 939                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 940                 if (u) {
 941                         struct dentry *dentry;
 942                         dentry = unix_sk(u)->path.dentry;
 943                         if (dentry)
 944                                 touch_atime(&unix_sk(u)->path);
 945                 } else
 946                         goto fail;
 947         }
 948         return u;
 949
 950 put_fail:
 951         path_put(&path);
 952 fail:
 953         *error = err;
 954         return NULL;
 955 }
 956
 /*
  * unix_mknod - create the filesystem node for a socket being bound.
  *
  * @dentry: dentry of the node to create.
  * @path:   path of the parent directory; also passed to the security hook.
  * @mode:   mode for the new node.
  * @res:    on success, filled with a new reference to path->mnt and to
  *          @dentry (via mntget()/dget()); left untouched on failure.
  *
  * Returns 0 on success, otherwise the error from security_path_mknod()
  * or vfs_mknod().
  */
 957 static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode,
 958                       struct path *res)
 959 {
 960         int err;
 961
 962         err = security_path_mknod(path, dentry, mode, 0);
 963         if (!err) {
 964                 err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
 965                 if (!err) {
 966                         res->mnt = mntget(path->mnt);
 967                         res->dentry = dget(dentry);
 968                 }
 969         }
 970
 971         return err;
 972 }
 973
 /*
  * unix_bind - bind @sock to the address in @uaddr.
  *
  * @addr_len == sizeof(short) (family only) requests an autobind.  An
  * address whose sun_path[0] is non-zero is a filesystem name: the node is
  * created with unix_mknod() and the sock is hashed by inode number.  Any
  * other address is checked for uniqueness with
  * __unix_find_socket_byname() and hashed by name.
  *
  * Returns 0 on success, or a negative errno:
  *   -EINVAL      @addr_len does not cover sun_family, the family is not
  *                AF_UNIX, or the socket is already bound;
  *   -EADDRINUSE  the name or filesystem node already exists;
  *   -ENOMEM      the address could not be allocated;
  *   otherwise the error from unix_autobind(), unix_mkname(),
  *   kern_path_create(), mutex_lock_interruptible() or unix_mknod().
  */
 974 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 975 {
 976         struct sock *sk = sock->sk;
 977         struct net *net = sock_net(sk);
 978         struct unix_sock *u = unix_sk(sk);
 979         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 980         char *sun_path = sunaddr->sun_path;
 981         int err, name_err;
 982         unsigned int hash;
 983         struct unix_address *addr;
 984         struct hlist_head *list;
 985         struct path path;
 986         struct dentry *dentry;
 987
 988         err = -EINVAL;
             /*
              * Do not look at sun_family unless the caller supplied at least
              * that many bytes; with a shorter address the field would be
              * read from bytes that were never filled in.
              */
 989         if (addr_len < (int)sizeof(short) ||
                 sunaddr->sun_family != AF_UNIX)
 990                 goto out;
 991
 992         if (addr_len == sizeof(short)) {
 993                 err = unix_autobind(sock);
 994                 goto out;
 995         }
 996
             /* On success unix_mkname() returns the length to use from here on. */
 997         err = unix_mkname(sunaddr, addr_len, &hash);
 998         if (err < 0)
 999                 goto out;
1000         addr_len = err;
1001
1002         name_err = 0;
1003         dentry = NULL;
1004         if (sun_path[0]) {
1005                 /* Get the parent directory, calculate the hash for last
1006                  * component.
1007                  */
1008                 dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
1009
1010                 if (IS_ERR(dentry)) {
1011                         /* delay report until after 'already bound' check */
1012                         name_err = PTR_ERR(dentry);
1013                         dentry = NULL;
1014                 }
1015         }
1016
1017         err = mutex_lock_interruptible(&u->readlock);
1018         if (err)
1019                 goto out_path;
1020
             /* A socket that already has an address cannot be bound again. */
1021         err = -EINVAL;
1022         if (u->addr)
1023                 goto out_up;
1024
1025         if (name_err) {
1026                 err = name_err == -EEXIST ? -EADDRINUSE : name_err;
1027                 goto out_up;
1028         }
1029
1030         err = -ENOMEM;
1031         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1032         if (!addr)
1033                 goto out_up;
1034
1035         memcpy(addr->name, sunaddr, addr_len);
1036         addr->len = addr_len;
1037         addr->hash = hash ^ sk->sk_type;
1038         atomic_set(&addr->refcnt, 1);
1039
1040         if (dentry) {
                     /* Filesystem socket: create the node, then hash by inode. */
1041                 struct path u_path;
1042                 umode_t mode = S_IFSOCK |
1043                        (SOCK_INODE(sock)->i_mode & ~current_umask());
1044                 err = unix_mknod(dentry, &path, mode, &u_path);
1045                 if (err) {
1046                         if (err == -EEXIST)
1047                                 err = -EADDRINUSE;
1048                         unix_release_addr(addr);
1049                         goto out_up;
1050                 }
1051                 addr->hash = UNIX_HASH_SIZE;
1052                 hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1053                 spin_lock(&unix_table_lock);
1054                 u->path = u_path;
1055                 list = &unix_socket_table[hash];
1056         } else {
                     /* No filesystem node: the name must not be in use already. */
1057                 spin_lock(&unix_table_lock);
1058                 err = -EADDRINUSE;
1059                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1060                                               sk->sk_type, hash)) {
1061                         unix_release_addr(addr);
1062                         goto out_unlock;
1063                 }
1064
1065                 list = &unix_socket_table[addr->hash];
1066         }
1067
             /* unix_table_lock is held on both paths: publish the address. */
1068         err = 0;
1069         __unix_remove_socket(sk);
1070         u->addr = addr;
1071         __unix_insert_socket(list, sk);
1072
1073 out_unlock:
1074         spin_unlock(&unix_table_lock);
1075 out_up:
1076         mutex_unlock(&u->readlock);
1077 out_path:
1078         if (dentry)
1079                 done_path_create(&path, dentry);
1080
1081 out:
1082         return err;
1083 }
1084
/*
 * unix_state_double_lock - take the state locks of @sk1 and @sk2.
 *
 * If @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock with the lower address is locked first and the other
 * is taken with unix_state_lock_nested(), so the order is the same no
 * matter which way round the arguments are passed.
 */
1085 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1086 {
1087         if (unlikely(sk1 == sk2) || !sk2) {
1088                 unix_state_lock(sk1);
1089                 return;
1090         }
1091         if (sk1 < sk2) {
1092                 unix_state_lock(sk1);
1093                 unix_state_lock_nested(sk2);
1094         } else {
1095                 unix_state_lock(sk2);
1096                 unix_state_lock_nested(sk1);
1097         }
1098 }
1099
/*
 * unix_state_double_unlock - release what unix_state_double_lock() took.
 *
 * Mirrors the locking helper: only @sk1 is unlocked when @sk2 is NULL or
 * the same sock as @sk1; otherwise both are unlocked.
 */
1100 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1101 {
1102         if (unlikely(sk1 == sk2) || !sk2) {
1103                 unix_state_unlock(sk1);
1104                 return;
1105         }
1106         unix_state_unlock(sk1);
1107         unix_state_unlock(sk2);
1108 }
1109
/*
 * unix_dgram_connect - set or clear the default peer of a datagram socket.
 *
 * With a family other than AF_UNSPEC the target is looked up with
 * unix_find_other() and becomes the new peer after the unix_may_send()
 * and security_unix_may_send() checks pass.  With AF_UNSPEC the peer is
 * cleared.  A previous peer, if any, is dropped; when it differs from the
 * new one unix_dgram_disconnected() is called for it.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL  @alen does not cover sa_family;
 *   -EPERM   unix_may_send() refused the target;
 *   otherwise the error from unix_mkname(), unix_autobind(),
 *   unix_find_other() or security_unix_may_send().
 */
1110 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1111                               int alen, int flags)
1112 {
1113         struct sock *sk = sock->sk;
1114         struct net *net = sock_net(sk);
1115         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1116         struct sock *other;
1117         unsigned int hash;
1118         int err;
1119
             /*
              * sa_family must lie inside the bytes the caller supplied.
              * Without this check a too-short address could be read as
              * AF_UNSPEC and silently disconnect the socket.
              */
             err = -EINVAL;
             if (alen < (int)sizeof(addr->sa_family))
                     goto out;

1120         if (addr->sa_family != AF_UNSPEC) {
1121                 err = unix_mkname(sunaddr, alen, &hash);
1122                 if (err < 0)
1123                         goto out;
1124                 alen = err;
1125
                     /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1126                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1127                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1128                         goto out;
1129
1130 restart:
1131                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1132                 if (!other)
1133                         goto out;
1134
1135                 unix_state_double_lock(sk, other);
1136
1137                 /* Apparently VFS overslept socket death. Retry. */
1138                 if (sock_flag(other, SOCK_DEAD)) {
1139                         unix_state_double_unlock(sk, other);
1140                         sock_put(other);
1141                         goto restart;
1142                 }
1143
1144                 err = -EPERM;
1145                 if (!unix_may_send(sk, other))
1146                         goto out_unlock;
1147
1148                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1149                 if (err)
1150                         goto out_unlock;
1151
1152         } else {
1153                 /*
1154                  *      1003.1g breaking connected state with AF_UNSPEC
1155                  */
1156                 other = NULL;
                     /* With other == NULL this locks sk only. */
1157                 unix_state_double_lock(sk, other);
1158         }
1159
1160         /*
1161          * If it was connected, reconnect.
1162          */
1163         if (unix_peer(sk)) {
1164                 struct sock *old_peer = unix_peer(sk);
1165                 unix_peer(sk) = other;
1166                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1167
1168                 unix_state_double_unlock(sk, other);
1169
                     /* The remaining old-peer work is done after the locks are dropped. */
1170                 if (other != old_peer)
1171                         unix_dgram_disconnected(sk, old_peer);
1172                 sock_put(old_peer);
1173         } else {
1174                 unix_peer(sk) = other;
1175                 unix_state_double_unlock(sk, other);
1176         }
1177         return 0;
1178
1179 out_unlock:
1180         unix_state_double_unlock(sk, other);
1181         sock_put(other);
1182 out:
1183         return err;
1184 }
1185
/*
 * unix_wait_for_peer - wait for room in @other's receive queue.
 *
 * Calls unix_state_unlock(@other) itself, so the caller must hold that
 * state lock on entry, as unix_stream_connect() does.  The task sleeps for
 * up to @timeo only if @other is not dead, has not shut down receiving
 * and its receive queue is full.
 *
 * Returns the value from schedule_timeout(), or @timeo unchanged when no
 * sleep was needed.
 */
1186 static long unix_wait_for_peer(struct sock *other, long timeo)
1187 {
1188         struct unix_sock *u = unix_sk(other);
1189         int sched;
1190         DEFINE_WAIT(wait);
1191
             /* Join the wait queue before testing the condition. */
1192         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1193
1194         sched = !sock_flag(other, SOCK_DEAD) &&
1195                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1196                 unix_recvq_full(other);
1197
1198         unix_state_unlock(other);
1199
1200         if (sched)
1201                 timeo = schedule_timeout(timeo);
1202
1203         finish_wait(&u->peer_wait, &wait);
1204         return timeo;
1205 }
1206
/*
 * unix_stream_connect - connect @sock to the listening socket at @uaddr.
 *
 * A new sock (newsk) and a small skb are allocated before any state lock
 * is taken.  On success newsk and sk become each other's peer, newsk
 * inherits the listener's address and path, and the skb is queued on the
 * listener's receive queue, where unix_accept() takes the sock to accept
 * from skb->sk.
 *
 * Returns 0 on success, or a negative errno:
 *   -ENOMEM        newsk or the skb could not be allocated;
 *   -ECONNREFUSED  the target is not listening or has shut down receiving;
 *   -EAGAIN        the listener's queue is full and the send timeout is 0;
 *   -EISCONN       this socket is already established;
 *   -EINVAL        this socket is in any other state than TCP_CLOSE;
 *   sock_intr_errno() value if a signal is pending after waiting;
 *   otherwise the error from unix_mkname(), unix_autobind(),
 *   unix_find_other() or security_unix_stream_connect().
 */
1207 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1208                                int addr_len, int flags)
1209 {
1210         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1211         struct sock *sk = sock->sk;
1212         struct net *net = sock_net(sk);
1213         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1214         struct sock *newsk = NULL;
1215         struct sock *other = NULL;
1216         struct sk_buff *skb = NULL;
1217         unsigned int hash;
1218         int st;
1219         int err;
1220         long timeo;
1221
1222         err = unix_mkname(sunaddr, addr_len, &hash);
1223         if (err < 0)
1224                 goto out;
1225         addr_len = err;
1226
             /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
1227         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1228             (err = unix_autobind(sock)) != 0)
1229                 goto out;
1230
1231         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1232
1233         /* First of all allocate resources.
1234            If we will make it after state is locked,
1235            we will have to recheck all again in any case.
1236          */
1237
1238         err = -ENOMEM;
1239
1240         /* create new sock for complete connection */
1241         newsk = unix_create1(sock_net(sk), NULL, 0);
1242         if (newsk == NULL)
1243                 goto out;
1244
1245         /* Allocate skb for sending to listening sock */
1246         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1247         if (skb == NULL)
1248                 goto out;
1249
1250 restart:
1251         /*  Find listening sock. */
1252         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1253         if (!other)
1254                 goto out;
1255
1256         /* Latch state of peer */
1257         unix_state_lock(other);
1258
1259         /* Apparently VFS overslept socket death. Retry. */
1260         if (sock_flag(other, SOCK_DEAD)) {
1261                 unix_state_unlock(other);
1262                 sock_put(other);
1263                 goto restart;
1264         }
1265
1266         err = -ECONNREFUSED;
1267         if (other->sk_state != TCP_LISTEN)
1268                 goto out_unlock;
1269         if (other->sk_shutdown & RCV_SHUTDOWN)
1270                 goto out_unlock;
1271
1272         if (unix_recvq_full(other)) {
1273                 err = -EAGAIN;
1274                 if (!timeo)
1275                         goto out_unlock;
1276
                     /* unix_wait_for_peer() drops other's state lock for us. */
1277                 timeo = unix_wait_for_peer(other, timeo);
1278
1279                 err = sock_intr_errno(timeo);
1280                 if (signal_pending(current))
1281                         goto out;
1282                 sock_put(other);
1283                 goto restart;
1284         }
1285
1286         /* Latch our state.
1287
1288            It is tricky place. We need to grab our state lock and cannot
1289            drop lock on peer. It is dangerous because deadlock is
1290            possible. Connect to self case and simultaneous
1291            attempt to connect are eliminated by checking socket
1292            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1293            check this before attempt to grab lock.
1294
1295            Well, and we have to recheck the state after socket locked.
1296          */
1297         st = sk->sk_state;
1298
1299         switch (st) {
1300         case TCP_CLOSE:
1301                 /* This is ok... continue with connect */
1302                 break;
1303         case TCP_ESTABLISHED:
1304                 /* Socket is already connected */
1305                 err = -EISCONN;
1306                 goto out_unlock;
1307         default:
1308                 err = -EINVAL;
1309                 goto out_unlock;
1310         }
1311
1312         unix_state_lock_nested(sk);
1313
             /* State changed between the unlocked read and taking the lock. */
1314         if (sk->sk_state != st) {
1315                 unix_state_unlock(sk);
1316                 unix_state_unlock(other);
1317                 sock_put(other);
1318                 goto restart;
1319         }
1320
1321         err = security_unix_stream_connect(sk, other, newsk);
1322         if (err) {
1323                 unix_state_unlock(sk);
1324                 goto out_unlock;
1325         }
1326
1327         /* The way is open! Quickly set all the necessary fields... */
1328
1329         sock_hold(sk);
1330         unix_peer(newsk)        = sk;
1331         newsk->sk_state         = TCP_ESTABLISHED;
1332         newsk->sk_type          = sk->sk_type;
1333         init_peercred(newsk);
1334         newu = unix_sk(newsk);
1335         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1336         otheru = unix_sk(other);
1337
1338         /* copy address information from listening to new sock*/
1339         if (otheru->addr) {
1340                 atomic_inc(&otheru->addr->refcnt);
1341                 newu->addr = otheru->addr;
1342         }
1343         if (otheru->path.dentry) {
1344                 path_get(&otheru->path);
1345                 newu->path = otheru->path;
1346         }
1347
1348         /* Set credentials */
1349         copy_peercred(sk, other);
1350
1351         sock->state     = SS_CONNECTED;
1352         sk->sk_state    = TCP_ESTABLISHED;
1353         sock_hold(newsk);
1354
1355         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1356         unix_peer(sk)   = newsk;
1357
1358         unix_state_unlock(sk);
1359
1360         /* take ten and send info to listening sock */
1361         spin_lock(&other->sk_receive_queue.lock);
1362         __skb_queue_tail(&other->sk_receive_queue, skb);
1363         spin_unlock(&other->sk_receive_queue.lock);
1364         unix_state_unlock(other);
1365         other->sk_data_ready(other);
1366         sock_put(other);
1367         return 0;
1368
1369 out_unlock:
1370         if (other)
1371                 unix_state_unlock(other);
1372
1373 out:
             /* Failure: free the preallocated skb and newsk, drop our ref on other. */
1374         kfree_skb(skb);
1375         if (newsk)
1376                 unix_release_sock(newsk, 0);
1377         if (other)
1378                 sock_put(other);
1379         return err;
1380 }
1381
/*
 * unix_socketpair - connect two freshly created sockets to each other.
 *
 * Each sock takes a reference on the other and records it as its peer,
 * and both get peer credentials from init_peercred().  For every type
 * except SOCK_DGRAM both ends are also marked established/connected.
 * Always returns 0.
 */
1382 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1383 {
             /* Note: 'skb' here is the second sock, not an sk_buff. */
1384         struct sock *ska = socka->sk, *skb = sockb->sk;
1385
1386         /* Join our sockets back to back */
1387         sock_hold(ska);
1388         sock_hold(skb);
1389         unix_peer(ska) = skb;
1390         unix_peer(skb) = ska;
1391         init_peercred(ska);
1392         init_peercred(skb);
1393
1394         if (ska->sk_type != SOCK_DGRAM) {
1395                 ska->sk_state = TCP_ESTABLISHED;
1396                 skb->sk_state = TCP_ESTABLISHED;
1397                 socka->state  = SS_CONNECTED;
1398                 sockb->state  = SS_CONNECTED;
1399         }
1400         return 0;
1401 }
1402
/*
 * unix_sock_inherit_flags - copy SOCK_PASSCRED and SOCK_PASSSEC from @old
 * to @new.  Flags are only ever set on @new, never cleared.
 */
1403 static void unix_sock_inherit_flags(const struct socket *old,
1404                                     struct socket *new)
1405 {
1406         if (test_bit(SOCK_PASSCRED, &old->flags))
1407                 set_bit(SOCK_PASSCRED, &new->flags);
1408         if (test_bit(SOCK_PASSSEC, &old->flags))
1409                 set_bit(SOCK_PASSSEC, &new->flags);
1410 }
1411
/*
 * unix_accept - accept one pending connection on listening socket @sock.
 *
 * Takes the next skb from the listener's receive queue, uses skb->sk as
 * the sock for the new connection and grafts it onto @newsock.
 *
 * Returns 0 on success, -EOPNOTSUPP if @sock is neither SOCK_STREAM nor
 * SOCK_SEQPACKET, -EINVAL if it is not listening or skb_recv_datagram()
 * returned no skb without setting an error, otherwise the error set by
 * skb_recv_datagram().
 */
1412 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1413 {
1414         struct sock *sk = sock->sk;
1415         struct sock *tsk;
1416         struct sk_buff *skb;
1417         int err;
1418
1419         err = -EOPNOTSUPP;
1420         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1421                 goto out;
1422
1423         err = -EINVAL;
1424         if (sk->sk_state != TCP_LISTEN)
1425                 goto out;
1426
1427         /* If socket state is TCP_LISTEN it cannot change (for now...),
1428          * so that no locks are necessary.
1429          */
1430
1431         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1432         if (!skb) {
1433                 /* This means receive shutdown. */
1434                 if (err == 0)
1435                         err = -EINVAL;
1436                 goto out;
1437         }
1438
1439         tsk = skb->sk;
1440         skb_free_datagram(sk, skb);
             /* A queue slot was freed: wake connectors sleeping on peer_wait. */
1441         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1442
1443         /* attach accepted sock to socket */
1444         unix_state_lock(tsk);
1445         newsock->state = SS_CONNECTED;
1446         unix_sock_inherit_flags(sock, newsock);
1447         sock_graft(tsk, newsock);
1448         unix_state_unlock(tsk);
1449         return 0;
1450
1451 out:
1452         return err;
1453 }
1454
1455
/*
 * unix_getname - report the address of @sock, or of its peer if @peer.
 *
 * @uaddr:     buffer that receives the address.
 * @uaddr_len: receives the number of bytes written.
 * @peer:      non-zero to report the peer's address.
 *
 * A sock without an address is reported as AF_UNIX with an empty path and
 * a length of sizeof(short).  Returns 0 on success, or -ENOTCONN when
 * @peer is set and the socket has no peer.
 */
1456 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1457 {
1458         struct sock *sk = sock->sk;
1459         struct unix_sock *u;
1460         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1461         int err = 0;
1462
             /* Either branch leaves us holding a reference on sk. */
1463         if (peer) {
1464                 sk = unix_peer_get(sk);
1465
1466                 err = -ENOTCONN;
1467                 if (!sk)
1468                         goto out;
1469                 err = 0;
1470         } else {
1471                 sock_hold(sk);
1472         }
1473
1474         u = unix_sk(sk);
1475         unix_state_lock(sk);
1476         if (!u->addr) {
1477                 sunaddr->sun_family = AF_UNIX;
1478                 sunaddr->sun_path[0] = 0;
1479                 *uaddr_len = sizeof(short);
1480         } else {
1481                 struct unix_address *addr = u->addr;
1482
1483                 *uaddr_len = addr->len;
1484                 memcpy(sunaddr, addr->name, *uaddr_len);
1485         }
1486         unix_state_unlock(sk);
1487         sock_put(sk);
1488 out:
1489         return err;
1490 }
1491
/*
 * unix_detach_fds - move the passed-file list from @skb to @scm.
 *
 * The list pointer is transferred to scm->fp and cleared in the skb's
 * control block, then unix_notinflight() is called for each file.  The
 * caller must ensure UNIXCB(skb).fp is non-NULL, as unix_destruct_scm()
 * does, because the list is dereferenced unconditionally.
 */
1492 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1493 {
1494         int i;
1495
1496         scm->fp = UNIXCB(skb).fp;
1497         UNIXCB(skb).fp = NULL;
1498
1499         for (i = scm->fp->count-1; i >= 0; i--)
1500                 unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1501 }
1502
/*
 * unix_destruct_scm - skb destructor installed by unix_scm_to_skb().
 *
 * Builds a temporary scm_cookie from the pid and passed files stored in
 * the skb's control block, destroys it with scm_destroy(), then calls
 * sock_wfree() on the skb.
 */
1503 static void unix_destruct_scm(struct sk_buff *skb)
1504 {
1505         struct scm_cookie scm;
1506         memset(&scm, 0, sizeof(scm));
1507         scm.pid  = UNIXCB(skb).pid;
1508         if (UNIXCB(skb).fp)
1509                 unix_detach_fds(&scm, skb);
1510
1511         /* Alas, it calls VFS */
1512         /* So fscking what? fput() had been SMP-safe since the last Summer */
1513         scm_destroy(&scm);
1514         sock_wfree(skb);
1515 }
1516
1517 /*
1518  * The "user->unix_inflight" variable is protected by the garbage
1519  * collection lock, and we just read it locklessly here. If you go
1520  * over the limit, there might be a tiny race in actually noticing
1521  * it across threads. Tough.
1522  *
      * Returns true when the current user's unix_inflight count exceeds the
      * RLIMIT_NOFILE limit of task @p and the caller has neither
      * CAP_SYS_RESOURCE nor CAP_SYS_ADMIN.  Note that the count comes from
      * current_user(); @p is used only for the limit.
      */
1523 static inline bool too_many_unix_fds(struct task_struct *p)
1524 {
1525         struct user_struct *user = current_user();
1526
1527         if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
1528                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1529         return false;
1530 }
1531
/* Highest recursion_level a passed AF_UNIX socket may have in unix_attach_fds(). */
1532 #define MAX_RECURSION_LEVEL 4
1533
/*
 * unix_attach_fds - attach the files in scm->fp to @skb.
 *
 * The file list is duplicated with scm_fp_dup() into the skb's control
 * block and unix_inflight() is called for each file.  The caller must
 * ensure scm->fp is non-NULL, as unix_scm_to_skb() does.
 *
 * Returns the highest recursion_level found among the passed files that
 * unix_get_socket() resolves to a sock (0 if there are none),
 * -ETOOMANYREFS if too_many_unix_fds() trips or that level exceeds
 * MAX_RECURSION_LEVEL, or -ENOMEM if the duplication fails.
 */
1534 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1535 {
1536         int i;
1537         unsigned char max_level = 0;
1538
1539         if (too_many_unix_fds(current))
1540                 return -ETOOMANYREFS;
1541
1542         for (i = scm->fp->count - 1; i >= 0; i--) {
1543                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1544
1545                 if (sk)
1546                         max_level = max(max_level,
1547                                         unix_sk(sk)->recursion_level);
1548         }
1549         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1550                 return -ETOOMANYREFS;
1551
1552         /*
1553          * Need to duplicate file references for the sake of garbage
1554          * collection.  Otherwise a socket in the fps might become a
1555          * candidate for GC while the skb is not yet queued.
1556          */
1557         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1558         if (!UNIXCB(skb).fp)
1559                 return -ENOMEM;
1560
1561         for (i = scm->fp->count - 1; i >= 0; i--)
1562                 unix_inflight(scm->fp->user, scm->fp->fp[i]);
1563         return max_level;
1564 }
1565
/*
 * unix_scm_to_skb - copy control-message state from @scm into @skb.
 *
 * Stores the pid (taking a reference with get_pid()), uid and gid in the
 * skb's control block, calls unix_get_secdata(), and attaches the passed
 * files when @scm carries any and @send_fds is true.  The destructor is
 * set to unix_destruct_scm() even when attaching the files failed.
 *
 * Returns 0 if no files were attached, otherwise the result of
 * unix_attach_fds(): a non-negative recursion level or a negative errno.
 */
1566 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1567 {
1568         int err = 0;
1569
1570         UNIXCB(skb).pid  = get_pid(scm->pid);
1571         UNIXCB(skb).uid = scm->creds.uid;
1572         UNIXCB(skb).gid = scm->creds.gid;
1573         UNIXCB(skb).fp = NULL;
1574         unix_get_secdata(scm, skb);
1575         if (scm->fp && send_fds)
1576                 err = unix_attach_fds(scm, skb);
1577
1578         skb->destructor = unix_destruct_scm;
1579         return err;
1580 }
1581
/*
 * unix_passcred_enabled - should credentials accompany data sent to @other?
 *
 * True when @sock has SOCK_PASSCRED set, when @other has no sk_socket
 * attached, or when @other's socket has SOCK_PASSCRED set.
 */
1582 static bool unix_passcred_enabled(const struct socket *sock,
1583                                   const struct sock *other)
1584 {
1585         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1586                !other->sk_socket ||
1587                test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1588 }
1589
1590 /*
1591  * Some apps rely on write() giving SCM_CREDENTIALS
1592  * We include credentials if source or destination socket
1593  * asserted SOCK_PASSCRED.
1594  *
      * Does nothing if the skb already carries a pid.  Otherwise, when
      * unix_passcred_enabled() says so, the current task's tgid (with a
      * reference taken) and uid/gid are stored in the skb's control block.
      */
1595 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1596                             const struct sock *other)
1597 {
1598         if (UNIXCB(skb).pid)
1599                 return;
1600         if (unix_passcred_enabled(sock, other)) {
1601                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1602                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1603         }
1604 }
1605
/*
 * maybe_init_creds - initialise @scm for a send with no control data.
 *
 * Calls scm_send() with an msghdr whose msg_controllen is 0, then, when
 * unix_passcred_enabled() says so, fills @scm with the current task's
 * tgid (with a reference taken) and uid/gid.
 *
 * Returns 0 on success or the error from scm_send().
 */
1606 static int maybe_init_creds(struct scm_cookie *scm,
1607                             struct socket *socket,
1608                             const struct sock *other)
1609 {
1610         int err;
1611         struct msghdr msg = { .msg_controllen = 0 };
1612
1613         err = scm_send(socket, &msg, scm, false);
1614         if (err)
1615                 return err;
1616
1617         if (unix_passcred_enabled(socket, other)) {
1618                 scm->pid = get_pid(task_tgid(current));
1619                 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1620         }
1621         return err;
1622 }
1623
/*
 * unix_skb_scm_eq - do @skb and @scm carry the same credentials?
 *
 * True only if the pid pointer, uid and gid stored in the skb's control
 * block match those in @scm and unix_secdata_eq() also reports a match.
 */
1624 static bool unix_skb_scm_eq(struct sk_buff *skb,
1625                             struct scm_cookie *scm)
1626 {
1627         const struct unix_skb_parms *u = &UNIXCB(skb);
1628
1629         return u->pid == scm->pid &&
1630                uid_eq(u->uid, scm->creds.uid) &&
1631                gid_eq(u->gid, scm->creds.gid) &&
1632                unix_secdata_eq(scm, skb);
1633 }
1634
1635 /*
1636  *      Send AF_UNIX data.
1637  */
1638
1639 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1640                               size_t len)
1641 {
1642         struct sock *sk = sock->sk;
1643         struct net *net = sock_net(sk);
1644         struct unix_sock *u = unix_sk(sk);
1645         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1646         struct sock *other = NULL;
1647         int namelen = 0; /* fake GCC */
1648         int err;
1649         unsigned int hash;
1650         struct sk_buff *skb;
1651         long timeo;
1652         struct scm_cookie scm;
1653         int max_level;
1654         int data_len = 0;
1655         int sk_locked;
1656
1657         wait_for_unix_gc();
1658         err = scm_send(sock, msg, &scm, false);
1659         if (err < 0)
1660                 return err;
1661
1662         err = -EOPNOTSUPP;
1663         if (msg->msg_flags&MSG_OOB)
1664                 goto out;
1665
1666         if (msg->msg_namelen) {
1667                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1668                 if (err < 0)
1669                         goto out;
1670                 namelen = err;
1671         } else {
1672                 sunaddr = NULL;
1673                 err = -ENOTCONN;
1674                 other = unix_peer_get(sk);
1675                 if (!other)
1676                         goto out;
1677         }
1678
1679         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1680             && (err = unix_autobind(sock)) != 0)
1681                 goto out;
1682
1683         err = -EMSGSIZE;
1684         if (len > sk->sk_sndbuf - 32)
1685                 goto out;
1686
1687         if (len > SKB_MAX_ALLOC) {
1688                 data_len = min_t(size_t,
1689                                  len - SKB_MAX_ALLOC,
1690                                  MAX_SKB_FRAGS * PAGE_SIZE);
1691                 data_len = PAGE_ALIGN(data_len);
1692
1693                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1694         }
1695
1696         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1697                                    msg->msg_flags & MSG_DONTWAIT, &err,
1698                                    PAGE_ALLOC_COSTLY_ORDER);
1699         if (skb == NULL)
1700                 goto out;
1701
1702         err = unix_scm_to_skb(&scm, skb, true);
1703         if (err < 0)
1704                 goto out_free;
1705         max_level = err + 1;
1706
1707         skb_put(skb, len - data_len);
1708         skb->data_len = data_len;
1709         skb->len = len;
1710         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1711         if (err)
1712                 goto out_free;
1713
1714         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1715
1716 restart:
1717         if (!other) {
1718                 err = -ECONNRESET;
1719                 if (sunaddr == NULL)
1720                         goto out_free;
1721
1722                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1723                                         hash, &err);
1724                 if (other == NULL)
1725                         goto out_free;
1726         }
1727
1728         if (sk_filter(other, skb) < 0) {
1729                 /* Toss the packet but do not return any error to the sender */
1730                 err = len;
1731                 goto out_free;
1732         }
1733
1734         sk_locked = 0;
1735         unix_state_lock(other);
1736 restart_locked:
1737         err = -EPERM;
1738         if (!unix_may_send(sk, other))
1739                 goto out_unlock;
1740
1741         if (unlikely(sock_flag(other, SOCK_DEAD))) {
1742                 /*
1743                  *      Check with 1003.1g - what should
1744                  *      datagram error
1745                  */
1746                 unix_state_unlock(other);
1747                 sock_put(other);
1748
1749                 if (!sk_locked)
1750                         unix_state_lock(sk);
1751
1752                 err = 0;
1753                 if (unix_peer(sk) == other) {
1754                         unix_peer(sk) = NULL;
1755                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1756
1757                         unix_state_unlock(sk);
1758
1759                         unix_dgram_disconnected(sk, other);
1760                         sock_put(other);
1761                         err = -ECONNREFUSED;
1762                 } else {
1763                         unix_state_unlock(sk);
1764                 }
1765
1766                 other = NULL;
1767                 if (err)
1768                         goto out_free;
1769                 goto restart;
1770         }
1771
1772         err = -EPIPE;
1773         if (other->sk_shutdown & RCV_SHUTDOWN)
1774                 goto out_unlock;
1775
1776         if (sk->sk_type != SOCK_SEQPACKET) {
1777                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1778                 if (err)
1779                         goto out_unlock;
1780         }
1781
1782         /* other == sk && unix_peer(other) != sk if
1783          * - unix_peer(sk) == NULL, destination address bound to sk
1784          * - unix_peer(sk) == sk by time of get but disconnected before lock
1785          */
1786         if (other != sk &&
1787             unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1788                 if (timeo) {
1789                         timeo = unix_wait_for_peer(other, timeo);
1790
1791                         err = sock_intr_errno(timeo);
1792                         if (signal_pending(current))
1793                                 goto out_free;
1794
1795                         goto restart;
1796                 }
1797
1798                 if (!sk_locked) {
1799                         unix_state_unlock(other);
1800                         unix_state_double_lock(sk, other);
1801                 }
1802
1803                 if (unix_peer(sk) != other ||
1804                     unix_dgram_peer_wake_me(sk, other)) {
1805                         err = -EAGAIN;
1806                         sk_locked = 1;
1807                         goto out_unlock;
1808                 }
1809
1810                 if (!sk_locked) {
1811                         sk_locked = 1;
1812                         goto restart_locked;
1813                 }
1814         }
1815
1816         if (unlikely(sk_locked))
1817                 unix_state_unlock(sk);
1818
1819         if (sock_flag(other, SOCK_RCVTSTAMP))
1820                 __net_timestamp(skb);
1821         maybe_add_creds(skb, sock, other);
1822         skb_queue_tail(&other->sk_receive_queue, skb);
1823         if (max_level > unix_sk(other)->recursion_level)
1824                 unix_sk(other)->recursion_level = max_level;
1825         unix_state_unlock(other);
1826         other->sk_data_ready(other);
1827         sock_put(other);
1828         scm_destroy(&scm);
1829         return len;
1830
1831 out_unlock:
1832         if (sk_locked)
1833                 unix_state_unlock(sk);
1834         unix_state_unlock(other);
1835 out_free:
1836         kfree_skb(skb);
1837 out:
1838         if (other)
1839                 sock_put(other);
1840         scm_destroy(&scm);
1841         return err;
1842 }
1843
1844 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1845  * bytes, and a minimum of a full page.
1846  */
1847 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1848
     /*
      * Send @len bytes from @msg over a connected stream socket.
      *
      * The payload is cut into chunks. Each chunk is copied into a newly
      * allocated skb that is appended to the receive queue of the peer while
      * the peer state lock is held. unix_scm_to_skb() is run on every skb,
      * but file descriptors are only requested for the first one.
      *
      * MSG_OOB fails with -EOPNOTSUPP. Supplying a destination address fails
      * with -EISCONN on an established socket and -EOPNOTSUPP otherwise. A
      * socket without a peer fails with -ENOTCONN.
      *
      * Returns the number of bytes queued when that is non-zero, otherwise
      * the error code. When the local send side or the peer receive side is
      * shut down, or the peer is dead, the error is -EPIPE and SIGPIPE is
      * raised if nothing was sent yet and MSG_NOSIGNAL is not set.
      */
1849 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1850                                size_t len)
1851 {
1852         struct sock *sk = sock->sk;
1853         struct sock *other = NULL;
1854         int err, size;
1855         struct sk_buff *skb;
1856         int sent = 0;
1857         struct scm_cookie scm;
1858         bool fds_sent = false;
1859         int max_level;
1860         int data_len;
1861
1862         wait_for_unix_gc();
1863         err = scm_send(sock, msg, &scm, false);
1864         if (err < 0)
1865                 return err;
1866
             /* scm is set up now: every later exit goes through scm_destroy() */
1867         err = -EOPNOTSUPP;
1868         if (msg->msg_flags&MSG_OOB)
1869                 goto out_err;
1870
1871         if (msg->msg_namelen) {
1872                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1873                 goto out_err;
1874         } else {
1875                 err = -ENOTCONN;
                     /*
                      * NOTE(review): the peer pointer is used below without a
                      * reference being taken in this function (there is no
                      * sock_put() on any exit). Presumably the connection keeps
                      * the peer alive - confirm.
                      */
1876                 other = unix_peer(sk);
1877                 if (!other)
1878                         goto out_err;
1879         }
1880
             /* Local send side already shut down: EPIPE (and maybe SIGPIPE) */
1881         if (sk->sk_shutdown & SEND_SHUTDOWN)
1882                 goto pipe_err;
1883
1884         while (sent < len) {
1885                 size = len - sent;
1886
1887                 /* Keep two messages in the pipe so it schedules better */
1888                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1889
1890                 /* allow fallback to order-0 allocations */
1891                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1892
                     /*
                      * Whatever exceeds SKB_MAX_HEAD(0) is requested as
                      * data_len, rounded up to whole pages but capped at the
                      * chunk size itself.
                      */
1893                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1894
1895                 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1896
1897                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1898                                            msg->msg_flags & MSG_DONTWAIT, &err,
1899                                            get_order(UNIX_SKB_FRAGS_SZ));
1900                 if (!skb)
1901                         goto out_err;
1902
1903                 /* Only send the fds in the first buffer */
1904                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1905                 if (err < 0) {
1906                         kfree_skb(skb);
1907                         goto out_err;
1908                 }
                     /* A non-negative result of unix_scm_to_skb() feeds max_level */
1909                 max_level = err + 1;
1910                 fds_sent = true;
1911
                     /*
                      * Set the lengths by hand: size - data_len bytes are put
                      * into the skb with skb_put(), data_len bytes are recorded
                      * in skb->data_len, then all size bytes are copied in from
                      * the message iterator.
                      */
1912                 skb_put(skb, size - data_len);
1913                 skb->data_len = data_len;
1914                 skb->len = size;
1915                 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1916                 if (err) {
1917                         kfree_skb(skb);
1918                         goto out_err;
1919                 }
1920
1921                 unix_state_lock(other);
1922
                     /* Peer dead or no longer receiving: drop the skb, EPIPE */
1923                 if (sock_flag(other, SOCK_DEAD) ||
1924                     (other->sk_shutdown & RCV_SHUTDOWN))
1925                         goto pipe_err_free;
1926
1927                 maybe_add_creds(skb, sock, other);
1928                 skb_queue_tail(&other->sk_receive_queue, skb);
                     /* Only ever raise the recursion_level stored on the peer */
1929                 if (max_level > unix_sk(other)->recursion_level)
1930                         unix_sk(other)->recursion_level = max_level;
1931                 unix_state_unlock(other);
                     /* Notify the peer after its state lock has been released */
1932                 other->sk_data_ready(other);
1933                 sent += size;
1934         }
1935
1936         scm_destroy(&scm);
1937
1938         return sent;
1939
     /* Entered with the peer state lock held and skb not queued */
1940 pipe_err_free:
1941         unix_state_unlock(other);
1942         kfree_skb(skb);
1943 pipe_err:
1944         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1945                 send_sig(SIGPIPE, current, 0);
1946         err = -EPIPE;
1947 out_err:
1948         scm_destroy(&scm);
             /* A partial send reports the byte count instead of the error */
1949         return sent ? : err;
1950 }
1951
     /*
      * sendpage for stream sockets: queue @size bytes of @page, starting at
      * @offset, on the receive queue of the peer by adding the page to an skb
      * with skb_append_pagefrags().
      *
      * The page goes into the skb at the tail of the peer queue when that skb
      * passes unix_skb_scm_eq() against the scm cookie set up by
      * maybe_init_creds(). Otherwise, or when adding the page fails, both
      * locks are dropped, an empty skb is allocated and the checks are redone
      * under the locks.
      *
      * Returns @size on success or a negative errno: -EOPNOTSUPP for MSG_OOB,
      * -ENOTCONN without an established peer, -EPIPE (with SIGPIPE unless
      * MSG_NOSIGNAL is set in @flags) when either side is shut down or the
      * peer is dead.
      */
1952 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1953                                     int offset, size_t size, int flags)
1954 {
1955         int err;
1956         bool send_sigpipe = false;
1957         bool init_scm = true;
1958         struct scm_cookie scm;
1959         struct sock *other, *sk = socket->sk;
1960         struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1961
1962         if (flags & MSG_OOB)
1963                 return -EOPNOTSUPP;
1964
1965         other = unix_peer(sk);
1966         if (!other || sk->sk_state != TCP_ESTABLISHED)
1967                 return -ENOTCONN;
1968
             /*
              * Never entered from above. The block is only reached through
              * the alloc_skb label, with the peer state lock and the peer
              * readlock held. It drops both, allocates an skb with zero
              * header and data length and then falls through to retake the
              * locks below.
              */
1969         if (false) {
1970 alloc_skb:
1971                 unix_state_unlock(other);
1972                 mutex_unlock(&unix_sk(other)->readlock);
1973                 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1974                                               &err, 0);
1975                 if (!newskb)
1976                         goto err;
1977         }
1978
1979         /* we must acquire readlock as we modify already present
1980          * skbs in the sk_receive_queue and mess with skb->len
1981          */
1982         err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1983         if (err) {
                     /* Interrupted while waiting for the mutex */
1984                 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1985                 goto err;
1986         }
1987
1988         if (sk->sk_shutdown & SEND_SHUTDOWN) {
1989                 err = -EPIPE;
1990                 send_sigpipe = true;
1991                 goto err_unlock;
1992         }
1993
1994         unix_state_lock(other);
1995
1996         if (sock_flag(other, SOCK_DEAD) ||
1997             other->sk_shutdown & RCV_SHUTDOWN) {
1998                 err = -EPIPE;
1999                 send_sigpipe = true;
2000                 goto err_state_unlock;
2001         }
2002
             /* scm is set up on the first pass only; retries reuse it */
2003         if (init_scm) {
2004                 err = maybe_init_creds(&scm, socket, other);
2005                 if (err)
2006                         goto err_state_unlock;
2007                 init_scm = false;
2008         }
2009
             /*
              * Pick the skb that receives the page:
              *  - the queue tail is still the skb remembered in tail before
              *    the allocation: use the new skb;
              *  - the queue is empty or its tail does not match scm: use the
              *    new skb, allocating it first if there is none yet;
              *  - otherwise the queue tail is used and a new skb that was
              *    allocated in the meantime is released again.
              */
2010         skb = skb_peek_tail(&other->sk_receive_queue);
2011         if (tail && tail == skb) {
2012                 skb = newskb;
2013         } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2014                 if (newskb) {
2015                         skb = newskb;
2016                 } else {
2017                         tail = skb;
2018                         goto alloc_skb;
2019                 }
2020         } else if (newskb) {
2021                 /* this is fast path, we don't necessarily need to
2022                  * call to kfree_skb even though with newskb == NULL
2023                  * this - does no harm
2024                  */
2025                 consume_skb(newskb);
2026                 newskb = NULL;
2027         }
2028
             /*
              * Non-zero means the page could not be added to skb: remember
              * skb as the tail that was tried and retry with a new skb.
              * NOTE(review): if this ever failed while skb == newskb, the
              * alloc_skb path would overwrite newskb without freeing it -
              * confirm that adding to an empty skb cannot fail.
              */
2029         if (skb_append_pagefrags(skb, page, offset, size)) {
2030                 tail = skb;
2031                 goto alloc_skb;
2032         }
2033
             /* Account the new bytes on the skb and on the sending socket */
2034         skb->len += size;
2035         skb->data_len += size;
2036         skb->truesize += size;
2037         atomic_add(size, &sk->sk_wmem_alloc);
2038
             /*
              * newskb is only non-NULL here when skb == newskb, i.e. the skb
              * is not on the queue yet: run unix_scm_to_skb() on it and
              * append it under the queue lock.
              */
2039         if (newskb) {
2040                 err = unix_scm_to_skb(&scm, skb, false);
2041                 if (err)
2042                         goto err_state_unlock;
2043                 spin_lock(&other->sk_receive_queue.lock);
2044                 __skb_queue_tail(&other->sk_receive_queue, newskb);
2045                 spin_unlock(&other->sk_receive_queue.lock);
2046         }
2047
2048         unix_state_unlock(other);
2049         mutex_unlock(&unix_sk(other)->readlock);
2050
2051         other->sk_data_ready(other);
2052         scm_destroy(&scm);
2053         return size;
2054
2055 err_state_unlock:
2056         unix_state_unlock(other);
2057 err_unlock:
2058         mutex_unlock(&unix_sk(other)->readlock);
2059 err:
2060         kfree_skb(newskb);
2061         if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2062                 send_sig(SIGPIPE, current, 0);
             /* scm only needs releasing once maybe_init_creds() succeeded */
2063         if (!init_scm)
2064                 scm_destroy(&scm);
2065         return err;
2066 }
2067
     /*
      * Seqpacket variant of sendmsg.
      *
      * Returns a pending socket error if sock_error() reports one and
      * -ENOTCONN unless the socket is in TCP_ESTABLISHED state. Otherwise any
      * destination address length in @msg is reset to zero and the message
      * is handed to unix_dgram_sendmsg(), whose result is returned.
      */
2068 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2069                                   size_t len)
2070 {
2071         int err;
2072         struct sock *sk = sock->sk;
2073
2074         err = sock_error(sk);
2075         if (err)
2076                 return err;
2077
2078         if (sk->sk_state != TCP_ESTABLISHED)
2079                 return -ENOTCONN;
2080
             /* With msg_namelen zero the datagram path sends to the peer */
2081         if (msg->msg_namelen)
2082                 msg->msg_namelen = 0;
2083
2084         return unix_dgram_sendmsg(sock, msg, len);
2085 }
2086
     /*
      * Seqpacket variant of recvmsg: returns -ENOTCONN unless the socket is
      * in TCP_ESTABLISHED state, otherwise the result of unix_dgram_recvmsg()
      * called with the same arguments.
      */
2087 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2088                                   size_t size, int flags)
2089 {
2090         struct sock *sk = sock->sk;
2091
2092         if (sk->sk_state != TCP_ESTABLISHED)
2093                 return -ENOTCONN;
2094
2095         return unix_dgram_recvmsg(sock, msg, size, flags);
2096 }
2097
     /*
      * Copy the address attached to @sk, if there is one, into msg->msg_name
      * and store its length in msg->msg_namelen. @msg is left untouched when
      * the socket has no address.
      *
      * msg->msg_name is written without a check here; the callers in this
      * file test it for NULL before calling.
      */
2098 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2099 {
2100         struct unix_sock *u = unix_sk(sk);
2101
2102         if (u->addr) {
2103                 msg->msg_namelen = u->addr->len;
2104                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
2105         }
2106 }
2107
     /*
      * Receive a single datagram from the receive queue of @sock.
      *
      * At most @size bytes of the datagram, starting at the current peek
      * offset, are copied into @msg. MSG_TRUNC is set in msg->msg_flags when
      * @size is smaller than what remains. The address of skb->sk is stored
      * when msg->msg_name is set. Credentials, security data and passed file
      * descriptors go to scm_recv(); with MSG_PEEK the descriptors are
      * duplicated instead of detached.
      *
      * Returns the number of bytes copied, or the full remaining datagram
      * length when MSG_TRUNC is set in @flags, or a negative errno
      * (-EOPNOTSUPP for MSG_OOB).
      */
2108 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2109                               size_t size, int flags)
2110 {
2111         struct scm_cookie scm;
2112         struct sock *sk = sock->sk;
2113         struct unix_sock *u = unix_sk(sk);
2114         struct sk_buff *skb, *last;
2115         long timeo;
2116         int err;
2117         int peeked, skip;
2118
2119         err = -EOPNOTSUPP;
2120         if (flags&MSG_OOB)
2121                 goto out;
2122
2123         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2124
             /*
              * readlock is taken for every attempt. It stays held only when
              * an skb was obtained, because that break comes before the
              * unlock. Any error other than -EAGAIN ends the loop, and so
              * does an exhausted timeout or a non-zero result of
              * __skb_wait_for_more_packets().
              */
2125         do {
2126                 mutex_lock(&u->readlock);
2127
2128                 skip = sk_peek_offset(sk, flags);
2129                 skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
2130                                               &last);
2131                 if (skb)
2132                         break;
2133
2134                 mutex_unlock(&u->readlock);
2135
2136                 if (err != -EAGAIN)
2137                         break;
2138         } while (timeo &&
2139                  !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2140
2141         if (!skb) { /* implies readlock unlocked */
2142                 unix_state_lock(sk);
2143                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2144                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2145                     (sk->sk_shutdown & RCV_SHUTDOWN))
2146                         err = 0;
2147                 unix_state_unlock(sk);
2148                 goto out;
2149         }
2150
             /* Wake tasks sleeping on peer_wait, signalling writability */
2151         if (wq_has_sleeper(&u->peer_wait))
2152                 wake_up_interruptible_sync_poll(&u->peer_wait,
2153                                                 POLLOUT | POLLWRNORM |
2154                                                 POLLWRBAND);
2155
2156         if (msg->msg_name)
2157                 unix_copy_addr(msg, skb->sk);
2158
             /* Clamp to what is left of the datagram, or flag truncation */
2159         if (size > skb->len - skip)
2160                 size = skb->len - skip;
2161         else if (size < skb->len - skip)
2162                 msg->msg_flags |= MSG_TRUNC;
2163
2164         err = skb_copy_datagram_msg(skb, skip, msg, size);
2165         if (err)
2166                 goto out_free;
2167
2168         if (sock_flag(sk, SOCK_RCVTSTAMP))
2169                 __sock_recv_timestamp(msg, sk, skb);
2170
2171         memset(&scm, 0, sizeof(scm));
2172
2173         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2174         unix_set_secdata(&scm, skb);
2175
2176         if (!(flags & MSG_PEEK)) {
2177                 if (UNIXCB(skb).fp)
2178                         unix_detach_fds(&scm, skb);
2179
2180                 sk_peek_offset_bwd(sk, skb->len);
2181         } else {
2182                 /* It is questionable: on PEEK we could:
2183                    - do not return fds - good, but too simple 8)
2184                    - return fds, and do not return them on read (old strategy,
2185                      apparently wrong)
2186                    - clone fds (I chose it for now, it is the most universal
2187                      solution)
2188
2189                    POSIX 1003.1g does not actually define this clearly
2190                    at all. POSIX 1003.1g doesn't define a lot of things
2191                    clearly however!
2192
2193                 */
2194
2195                 sk_peek_offset_fwd(sk, size);
2196
2197                 if (UNIXCB(skb).fp)
2198                         scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2199         }
             /* The caller asked for the real length if MSG_TRUNC is in flags */
2200         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2201
2202         scm_recv(sock, msg, &scm, flags);
2203
     /* Reached with readlock held and skb valid */
2204 out_free:
2205         skb_free_datagram(sk, skb);
2206         mutex_unlock(&u->readlock);
2207 out:
2208         return err;
2209 }
2210
2211 /*
2212  *      Sleep until more data has arrived. But check for races..
      *
      *      @last and @last_len describe the tail of the receive queue as
      *      the caller last saw it. The wait ends as soon as the tail skb or
      *      its length differs from that, or sk_err is set, or RCV_SHUTDOWN
      *      is set, or a signal is pending, or @timeo has run out. The length
      *      check is needed because unix_stream_sendpage() grows the tail
      *      skb in place.
      *
      *      Returns the timeout that is left.
2213  */
2214 static long unix_stream_data_wait(struct sock *sk, long timeo,
2215                                   struct sk_buff *last, unsigned int last_len)
2216 {
2217         struct sk_buff *tail;
2218         DEFINE_WAIT(wait);
2219
2220         unix_state_lock(sk);
2221
2222         for (;;) {
2223                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2224
                     /* Conditions are evaluated under the state lock */
2225                 tail = skb_peek_tail(&sk->sk_receive_queue);
2226                 if (tail != last ||
2227                     (tail && tail->len != last_len) ||
2228                     sk->sk_err ||
2229                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2230                     signal_pending(current) ||
2231                     !timeo)
2232                         break;
2233
                     /* The state lock is dropped only while sleeping */
2234                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2235                 unix_state_unlock(sk);
2236                 timeo = freezable_schedule_timeout(timeo);
2237                 unix_state_lock(sk);
2238
                     /* On a dead socket the WAITDATA bit is left as it is */
2239                 if (sock_flag(sk, SOCK_DEAD))
2240                         break;
2241
2242                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2243         }
2244
2245         finish_wait(sk_sleep(sk), &wait);
2246         unix_state_unlock(sk);
2247         return timeo;
2248 }
2249
     /*
      * Number of bytes of @skb that have not been consumed yet: the skb
      * length minus the consumed counter kept in its UNIXCB.
      */
2250 static unsigned int unix_skb_len(const struct sk_buff *skb)
2251 {
2252         return skb->len - UNIXCB(skb).consumed;
2253 }
2254
     /*
      * Arguments of unix_stream_read_generic(), filled in by the recvmsg and
      * splice entry points.
      */
2255 struct unix_stream_read_state {
             /*
              * Called as recv_actor(skb, skip, chunk, state) to move chunk
              * bytes out of skb. A negative return value is treated as
              * failure, any other value as the number of bytes handled.
              */
2256         int (*recv_actor)(struct sk_buff *, int, int,
2257                           struct unix_stream_read_state *);
             /* Socket that is read from */
2258         struct socket *socket;
             /* Destination message; left unset by the splice entry point */
2259         struct msghdr *msg;
             /* Destination pipe, used by the splice actor */
2260         struct pipe_inode_info *pipe;
             /* Number of bytes requested */
2261         size_t size;
             /* MSG_* flags of the request */
2262         int flags;
             /* Flags passed on to skb_splice_bits() by the splice actor */
2263         unsigned int splice_flags;
2264 };
2265
     /*
      * Common receive path for stream sockets, used by both recvmsg and
      * splice.
      *
      * Walks the receive queue under u->readlock and passes one skb at a time
      * to state->recv_actor() until state->size bytes are handled or one of
      * the stop conditions inside the loop is hit. When the queue is empty
      * and fewer bytes than the low-water target were handled, the function
      * sleeps in unix_stream_data_wait() unless the timeout is zero.
      *
      * Returns the byte count when it is non-zero, otherwise err. The count
      * is replaced by -EFAULT when the actor fails before anything was
      * handled. -EINVAL is returned for a socket that is not established and
      * -EOPNOTSUPP for MSG_OOB.
      */
2266 static int unix_stream_read_generic(struct unix_stream_read_state *state)
2267 {
2268         struct scm_cookie scm;
2269         struct socket *sock = state->socket;
2270         struct sock *sk = sock->sk;
2271         struct unix_sock *u = unix_sk(sk);
2272         int copied = 0;
2273         int flags = state->flags;
2274         int noblock = flags & MSG_DONTWAIT;
2275         bool check_creds = false;
2276         int target;
2277         int err = 0;
2278         long timeo;
2279         int skip;
2280         size_t size = state->size;
2281         unsigned int last_len;
2282
2283         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2284                 err = -EINVAL;
2285                 goto out;
2286         }
2287
2288         if (unlikely(flags & MSG_OOB)) {
2289                 err = -EOPNOTSUPP;
2290                 goto out;
2291         }
2292
2293         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2294         timeo = sock_rcvtimeo(sk, noblock);
2295
2296         memset(&scm, 0, sizeof(scm));
2297
2298         /* Lock the socket to prevent queue disordering
2299          * while sleeps in memcpy_tomsg
2300          */
2301         mutex_lock(&u->readlock);
2302
             /* The peek offset is only honoured for MSG_PEEK reads */
2303         if (flags & MSG_PEEK)
2304                 skip = sk_peek_offset(sk, flags);
2305         else
2306                 skip = 0;
2307
2308         do {
2309                 int chunk;
2310                 bool drop_skb;
2311                 struct sk_buff *skb, *last;
2312
     /* (Re)start at the head of the queue, under the state lock */
2313 redo:
2314                 unix_state_lock(sk);
2315                 if (sock_flag(sk, SOCK_DEAD)) {
2316                         err = -ECONNRESET;
2317                         goto unlock;
2318                 }
2319                 last = skb = skb_peek(&sk->sk_receive_queue);
2320                 last_len = last ? last->len : 0;
     /* Entered with the state lock held and skb possibly NULL */
2321 again:
2322                 if (skb == NULL) {
2323                         unix_sk(sk)->recursion_level = 0;
2324                         if (copied >= target)
2325                                 goto unlock;
2326
2327                         /*
2328                          *      POSIX 1003.1g mandates this order.
2329                          */
2330
2331                         err = sock_error(sk);
2332                         if (err)
2333                                 goto unlock;
2334                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2335                                 goto unlock;
2336
2337                         unix_state_unlock(sk);
                             /*
                              * This break leaves the loop with readlock still
                              * held; it is released after the loop.
                              */
2338                         if (!timeo) {
2339                                 err = -EAGAIN;
2340                                 break;
2341                         }
2342
2343                         mutex_unlock(&u->readlock);
2344
2345                         timeo = unix_stream_data_wait(sk, timeo, last,
2346                                                       last_len);
2347
                             /*
                              * readlock is not held here, so the common exit
                              * after the loop is bypassed and scm is released
                              * directly.
                              */
2348                         if (signal_pending(current)) {
2349                                 err = sock_intr_errno(timeo);
2350                                 scm_destroy(&scm);
2351                                 goto out;
2352                         }
2353
2354                         mutex_lock(&u->readlock);
2355                         goto redo;
     /* Shared target of the exits above that still hold the state lock */
2356 unlock:
2357                         unix_state_unlock(sk);
2358                         break;
2359                 }
2360
                     /* Step over skbs that the peek offset covers entirely */
2361                 while (skip >= unix_skb_len(skb)) {
2362                         skip -= unix_skb_len(skb);
2363                         last = skb;
2364                         last_len = skb->len;
2365                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2366                         if (!skb)
2367                                 goto again;
2368                 }
2369
2370                 unix_state_unlock(sk);
2371
2372                 if (check_creds) {
2373                         /* Never glue messages from different writers */
2374                         if (!unix_skb_scm_eq(skb, &scm))
2375                                 break;
2376                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2377                         /* Copy credentials */
2378                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2379                         unix_set_secdata(&scm, skb);
2380                         check_creds = true;
2381                 }
2382
2383                 /* Copy address just once */
                     /*
                      * NOTE(review): sunaddr appears to be a variable local to
                      * this block, so setting it to NULL would not stop the
                      * copy from being repeated for later skbs - confirm
                      * against the definition of DECLARE_SOCKADDR.
                      */
2384                 if (state->msg && state->msg->msg_name) {
2385                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2386                                          state->msg->msg_name);
2387                         unix_copy_addr(state->msg, skb->sk);
2388                         sunaddr = NULL;
2389                 }
2390
                     /*
                      * An extra reference is held across the actor call. The
                      * splice actor releases readlock while it runs (see
                      * skb_unix_socket_splice), so the skb may be consumed by
                      * somebody else in the meantime. Whether that happened is
                      * sampled into drop_skb before the reference is dropped.
                      */
2391                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2392                 skb_get(skb);
2393                 chunk = state->recv_actor(skb, skip, chunk, state);
2394                 drop_skb = !unix_skb_len(skb);
2395                 /* skb is only safe to use if !drop_skb */
2396                 consume_skb(skb);
2397                 if (chunk < 0) {
2398                         if (copied == 0)
2399                                 copied = -EFAULT;
2400                         break;
2401                 }
2402                 copied += chunk;
2403                 size -= chunk;
2404
2405                 if (drop_skb) {
2406                         /* the skb was touched by a concurrent reader;
2407                          * we should not expect anything from this skb
2408                          * anymore and assume it invalid - we can be
2409                          * sure it was dropped from the socket queue
2410                          *
2411                          * let's report a short read
2412                          */
2413                         err = 0;
2414                         break;
2415                 }
2416
2417                 /* Mark read part of skb as used */
2418                 if (!(flags & MSG_PEEK)) {
2419                         UNIXCB(skb).consumed += chunk;
2420
2421                         sk_peek_offset_bwd(sk, chunk);
2422
2423                         if (UNIXCB(skb).fp)
2424                                 unix_detach_fds(&scm, skb);
2425
                             /* Stop while part of the skb is still unread */
2426                         if (unix_skb_len(skb))
2427                                 break;
2428
                             /* Fully read: take it off the queue and free it */
2429                         skb_unlink(skb, &sk->sk_receive_queue);
2430                         consume_skb(skb);
2431
                             /* Stop once file descriptors were picked up */
2432                         if (scm.fp)
2433                                 break;
2434                 } else {
2435                         /* It is questionable, see note in unix_dgram_recvmsg.
2436                          */
2437                         if (UNIXCB(skb).fp)
2438                                 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2439
2440                         sk_peek_offset_fwd(sk, chunk);
2441
2442                         if (UNIXCB(skb).fp)
2443                                 break;
2444
                             /*
                              * Peeking continues with the next skb; the peek
                              * offset was already applied, so skip restarts
                              * at zero.
                              */
2445                         skip = 0;
2446                         last = skb;
2447                         last_len = skb->len;
2448                         unix_state_lock(sk);
2449                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2450                         if (skb)
2451                                 goto again;
2452                         unix_state_unlock(sk);
2453                         break;
2454                 }
2455         } while (size);
2456
2457         mutex_unlock(&u->readlock);
             /* Without a msghdr there is nowhere to deliver scm: release it */
2458         if (state->msg)
2459                 scm_recv(sock, state->msg, &scm, flags);
2460         else
2461                 scm_destroy(&scm);
2462 out:
2463         return copied ? : err;
2464 }
2465
     /*
      * recv_actor used for recvmsg: copies @chunk bytes of @skb into
      * state->msg, starting @skip bytes past the part of the skb that was
      * already consumed. Returns @chunk when skb_copy_datagram_msg() returns
      * zero, otherwise the value it returned.
      */
2466 static int unix_stream_read_actor(struct sk_buff *skb,
2467                                   int skip, int chunk,
2468                                   struct unix_stream_read_state *state)
2469 {
2470         int ret;
2471
2472         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2473                                     state->msg, chunk);
2474         return ret ?: chunk;
2475 }
2476
     /*
      * recvmsg for stream sockets: wraps the arguments in a read state that
      * uses unix_stream_read_actor() and returns the result of
      * unix_stream_read_generic(). The pipe and splice_flags members of the
      * state are left zero-initialized.
      */
2477 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2478                                size_t size, int flags)
2479 {
2480         struct unix_stream_read_state state = {
2481                 .recv_actor = unix_stream_read_actor,
2482                 .socket = sock,
2483                 .msg = msg,
2484                 .size = size,
2485                 .flags = flags
2486         };
2487
2488         return unix_stream_read_generic(&state);
2489 }
2490
     /*
      * Callback handed to skb_splice_bits() by unix_stream_splice_actor().
      *
      * Releases the readlock of @sk around splice_to_pipe() and takes it
      * again afterwards, so it has to be called with that mutex held.
      * Returns the result of splice_to_pipe().
      *
      * NOTE(review): presumably the lock is dropped because splice_to_pipe()
      * may block - confirm.
      */
2491 static ssize_t skb_unix_socket_splice(struct sock *sk,
2492                                       struct pipe_inode_info *pipe,
2493                                       struct splice_pipe_desc *spd)
2494 {
2495         int ret;
2496         struct unix_sock *u = unix_sk(sk);
2497
2498         mutex_unlock(&u->readlock);
2499         ret = splice_to_pipe(pipe, spd);
2500         mutex_lock(&u->readlock);
2501
2502         return ret;
2503 }
2504
     /*
      * recv_actor used for splice: passes @chunk bytes of @skb, starting
      * @skip bytes past the already consumed part, to skb_splice_bits()
      * together with state->pipe and state->splice_flags, using
      * skb_unix_socket_splice() as the callback. Returns the result of
      * skb_splice_bits().
      */
2505 static int unix_stream_splice_actor(struct sk_buff *skb,
2506                                     int skip, int chunk,
2507                                     struct unix_stream_read_state *state)
2508 {
2509         return skb_splice_bits(skb, state->socket->sk,
2510                                UNIXCB(skb).consumed + skip,
2511                                state->pipe, chunk, state->splice_flags,
2512                                skb_unix_socket_splice);
2513 }
2514
     /*
      * splice_read for stream sockets: moves up to @size bytes from @sock
      * into @pipe through unix_stream_read_generic() with the splice actor.
      *
      * A non-zero *@ppos is rejected with -ESPIPE. The read is made
      * non-blocking (MSG_DONTWAIT) when the socket file has O_NONBLOCK set or
      * SPLICE_F_NONBLOCK is set in @flags. The msg member of the read state
      * is left zero-initialized.
      */
2515 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2516                                        struct pipe_inode_info *pipe,
2517                                        size_t size, unsigned int flags)
2518 {
2519         struct unix_stream_read_state state = {
2520                 .recv_actor = unix_stream_splice_actor,
2521                 .socket = sock,
2522                 .pipe = pipe,
2523                 .size = size,
2524                 .splice_flags = flags,
2525         };
2526
2527         if (unlikely(*ppos))
2528                 return -ESPIPE;
2529
2530         if (sock->file->f_flags & O_NONBLOCK ||
2531             flags & SPLICE_F_NONBLOCK)
2532                 state.flags = MSG_DONTWAIT;
2533
2534         return unix_stream_read_generic(&state);
2535 }
2536
     /*
      * shutdown() for unix sockets.
      *
      * @mode has to be SHUT_RD, SHUT_WR or SHUT_RDWR, anything else gives
      * -EINVAL. The corresponding bits are set in sk->sk_shutdown. For stream
      * and seqpacket sockets that have a peer, the mirrored bits are set on
      * the peer as well: a local receive shutdown becomes a send shutdown of
      * the peer and the other way round. Returns 0 on success.
      */
2537 static int unix_shutdown(struct socket *sock, int mode)
2538 {
2539         struct sock *sk = sock->sk;
2540         struct sock *other;
2541
2542         if (mode < SHUT_RD || mode > SHUT_RDWR)
2543                 return -EINVAL;
2544         /* This maps:
2545          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2546          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2547          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2548          */
2549         ++mode;
2550
             /*
              * The peer is looked up and pinned with sock_hold() under the
              * state lock; the reference is dropped at the end.
              */
2551         unix_state_lock(sk);
2552         sk->sk_shutdown |= mode;
2553         other = unix_peer(sk);
2554         if (other)
2555                 sock_hold(other);
2556         unix_state_unlock(sk);
2557         sk->sk_state_change(sk);
2558
2559         if (other &&
2560                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2561
2562                 int peer_mode = 0;
2563
2564                 if (mode&RCV_SHUTDOWN)
2565                         peer_mode |= SEND_SHUTDOWN;
2566                 if (mode&SEND_SHUTDOWN)
2567                         peer_mode |= RCV_SHUTDOWN;
2568                 unix_state_lock(other);
2569                 other->sk_shutdown |= peer_mode;
2570                 unix_state_unlock(other);
2571                 other->sk_state_change(other);
                     /* Async notification: POLL_HUP for a full shutdown, */
                     /* POLL_IN when only the peer receive side was closed */
2572                 if (peer_mode == SHUTDOWN_MASK)
2573                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2574                 else if (peer_mode & RCV_SHUTDOWN)
2575                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2576         }
2577         if (other)
2578                 sock_put(other);
2579
2580         return 0;
2581 }
2582
     /*
      * Amount of data waiting in the receive queue of @sk.
      *
      * For stream and seqpacket sockets this is the sum of the unread bytes
      * of all queued skbs. For other socket types it is the length of the
      * first queued skb, or 0 when the queue is empty. The queue is examined
      * under its spinlock. Returns -EINVAL for a socket in TCP_LISTEN state.
      */
2583 long unix_inq_len(struct sock *sk)
2584 {
2585         struct sk_buff *skb;
2586         long amount = 0;
2587
2588         if (sk->sk_state == TCP_LISTEN)
2589                 return -EINVAL;
2590
2591         spin_lock(&sk->sk_receive_queue.lock);
2592         if (sk->sk_type == SOCK_STREAM ||
2593             sk->sk_type == SOCK_SEQPACKET) {
2594                 skb_queue_walk(&sk->sk_receive_queue, skb)
2595                         amount += unix_skb_len(skb);
2596         } else {
2597                 skb = skb_peek(&sk->sk_receive_queue);
2598                 if (skb)
2599                         amount = skb->len;
2600         }
2601         spin_unlock(&sk->sk_receive_queue.lock);
2602
2603         return amount;
2604 }
2605 EXPORT_SYMBOL_GPL(unix_inq_len);
2606
/*
 * unix_outq_len - report the outgoing queue size of @sk.
 *
 * Returns whatever sk_wmem_alloc_get() reports for the socket, widened
 * to long.
 */
long unix_outq_len(struct sock *sk)
{
	long pending = sk_wmem_alloc_get(sk);

	return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2612
2613 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2614 {
2615         struct sock *sk = sock->sk;
2616         long amount = 0;
2617         int err;
2618
2619         switch (cmd) {
2620         case SIOCOUTQ:
2621                 amount = unix_outq_len(sk);
2622                 err = put_user(amount, (int __user *)arg);
2623                 break;
2624         case SIOCINQ:
2625                 amount = unix_inq_len(sk);
2626                 if (amount < 0)
2627                         err = amount;
2628                 else
2629                         err = put_user(amount, (int __user *)arg);
2630                 break;
2631         default:
2632                 err = -ENOIOCTLCMD;
2633                 break;
2634         }
2635         return err;
2636 }
2637
2638 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2639 {
2640         struct sock *sk = sock->sk;
2641         unsigned int mask;
2642
2643         sock_poll_wait(file, sk_sleep(sk), wait);
2644         mask = 0;
2645
2646         /* exceptional events? */
2647         if (sk->sk_err)
2648                 mask |= POLLERR;
2649         if (sk->sk_shutdown == SHUTDOWN_MASK)
2650                 mask |= POLLHUP;
2651         if (sk->sk_shutdown & RCV_SHUTDOWN)
2652                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2653
2654         /* readable? */
2655         if (!skb_queue_empty(&sk->sk_receive_queue))
2656                 mask |= POLLIN | POLLRDNORM;
2657
2658         /* Connection-based need to check for termination and startup */
2659         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2660             sk->sk_state == TCP_CLOSE)
2661                 mask |= POLLHUP;
2662
2663         /*
2664          * we set writable also when the other side has shut down the
2665          * connection. This prevents stuck sockets.
2666          */
2667         if (unix_writable(sk))
2668                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2669
2670         return mask;
2671 }
2672
2673 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2674                                     poll_table *wait)
2675 {
2676         struct sock *sk = sock->sk, *other;
2677         unsigned int mask, writable;
2678
2679         sock_poll_wait(file, sk_sleep(sk), wait);
2680         mask = 0;
2681
2682         /* exceptional events? */
2683         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2684                 mask |= POLLERR |
2685                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2686
2687         if (sk->sk_shutdown & RCV_SHUTDOWN)
2688                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2689         if (sk->sk_shutdown == SHUTDOWN_MASK)
2690                 mask |= POLLHUP;
2691
2692         /* readable? */
2693         if (!skb_queue_empty(&sk->sk_receive_queue))
2694                 mask |= POLLIN | POLLRDNORM;
2695
2696         /* Connection-based need to check for termination and startup */
2697         if (sk->sk_type == SOCK_SEQPACKET) {
2698                 if (sk->sk_state == TCP_CLOSE)
2699                         mask |= POLLHUP;
2700                 /* connection hasn't started yet? */
2701                 if (sk->sk_state == TCP_SYN_SENT)
2702                         return mask;
2703         }
2704
2705         /* No write status requested, avoid expensive OUT tests. */
2706         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2707                 return mask;
2708
2709         writable = unix_writable(sk);
2710         if (writable) {
2711                 unix_state_lock(sk);
2712
2713                 other = unix_peer(sk);
2714                 if (other && unix_peer(other) != sk &&
2715                     unix_recvq_full(other) &&
2716                     unix_dgram_peer_wake_me(sk, other))
2717                         writable = 0;
2718
2719                 unix_state_unlock(sk);
2720         }
2721
2722         if (writable)
2723                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2724         else
2725                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2726
2727         return mask;
2728 }
2729
2730 #ifdef CONFIG_PROC_FS
2731
/*
 * A /proc/net/unix seq_file position packs two values into one number:
 * the hash-table bucket index in the high bits and an offset within that
 * bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
#define BUCKET_OFFSET_MASK ((1L << BUCKET_SPACE) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & BUCKET_OFFSET_MASK)
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2737
2738 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2739 {
2740         unsigned long offset = get_offset(*pos);
2741         unsigned long bucket = get_bucket(*pos);
2742         struct sock *sk;
2743         unsigned long count = 0;
2744
2745         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2746                 if (sock_net(sk) != seq_file_net(seq))
2747                         continue;
2748                 if (++count == offset)
2749                         break;
2750         }
2751
2752         return sk;
2753 }
2754
2755 static struct sock *unix_next_socket(struct seq_file *seq,
2756                                      struct sock *sk,
2757                                      loff_t *pos)
2758 {
2759         unsigned long bucket;
2760
2761         while (sk > (struct sock *)SEQ_START_TOKEN) {
2762                 sk = sk_next(sk);
2763                 if (!sk)
2764                         goto next_bucket;
2765                 if (sock_net(sk) == seq_file_net(seq))
2766                         return sk;
2767         }
2768
2769         do {
2770                 sk = unix_from_bucket(seq, pos);
2771                 if (sk)
2772                         return sk;
2773
2774 next_bucket:
2775                 bucket = get_bucket(*pos) + 1;
2776                 *pos = set_bucket_offset(bucket, 1);
2777         } while (bucket < ARRAY_SIZE(unix_socket_table));
2778
2779         return NULL;
2780 }
2781
2782 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2783         __acquires(unix_table_lock)
2784 {
2785         spin_lock(&unix_table_lock);
2786
2787         if (!*pos)
2788                 return SEQ_START_TOKEN;
2789
2790         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2791                 return NULL;
2792
2793         return unix_next_socket(seq, NULL, pos);
2794 }
2795
2796 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2797 {
2798         ++*pos;
2799         return unix_next_socket(seq, v, pos);
2800 }
2801
2802 static void unix_seq_stop(struct seq_file *seq, void *v)
2803         __releases(unix_table_lock)
2804 {
2805         spin_unlock(&unix_table_lock);
2806 }
2807
2808 static int unix_seq_show(struct seq_file *seq, void *v)
2809 {
2810
2811         if (v == SEQ_START_TOKEN)
2812                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2813                          "Inode Path\n");
2814         else {
2815                 struct sock *s = v;
2816                 struct unix_sock *u = unix_sk(s);
2817                 unix_state_lock(s);
2818
2819                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2820                         s,
2821                         atomic_read(&s->sk_refcnt),
2822                         0,
2823                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2824                         s->sk_type,
2825                         s->sk_socket ?
2826                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2827                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2828                         sock_i_ino(s));
2829
2830                 if (u->addr) {
2831                         int i, len;
2832                         seq_putc(seq, ' ');
2833
2834                         i = 0;
2835                         len = u->addr->len - sizeof(short);
2836                         if (!UNIX_ABSTRACT(s))
2837                                 len--;
2838                         else {
2839                                 seq_putc(seq, '@');
2840                                 i++;
2841                         }
2842                         for ( ; i < len; i++)
2843                                 seq_putc(seq, u->addr->name->sun_path[i]);
2844                 }
2845                 unix_state_unlock(s);
2846                 seq_putc(seq, '\n');
2847         }
2848
2849         return 0;
2850 }
2851
2852 static const struct seq_operations unix_seq_ops = {
2853         .start  = unix_seq_start,
2854         .next   = unix_seq_next,
2855         .stop   = unix_seq_stop,
2856         .show   = unix_seq_show,
2857 };
2858
2859 static int unix_seq_open(struct inode *inode, struct file *file)
2860 {
2861         return seq_open_net(inode, file, &unix_seq_ops,
2862                             sizeof(struct seq_net_private));
2863 }
2864
2865 static const struct file_operations unix_seq_fops = {
2866         .owner          = THIS_MODULE,
2867         .open           = unix_seq_open,
2868         .read           = seq_read,
2869         .llseek         = seq_lseek,
2870         .release        = seq_release_net,
2871 };
2872
2873 #endif
2874
2875 static const struct net_proto_family unix_family_ops = {
2876         .family = PF_UNIX,
2877         .create = unix_create,
2878         .owner  = THIS_MODULE,
2879 };
2880
2881
2882 static int __net_init unix_net_init(struct net *net)
2883 {
2884         int error = -ENOMEM;
2885
2886         net->unx.sysctl_max_dgram_qlen = 10;
2887         if (unix_sysctl_register(net))
2888                 goto out;
2889
2890 #ifdef CONFIG_PROC_FS
2891         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2892                 unix_sysctl_unregister(net);
2893                 goto out;
2894         }
2895 #endif
2896         error = 0;
2897 out:
2898         return error;
2899 }
2900
2901 static void __net_exit unix_net_exit(struct net *net)
2902 {
2903         unix_sysctl_unregister(net);
2904         remove_proc_entry("unix", net->proc_net);
2905 }
2906
2907 static struct pernet_operations unix_net_ops = {
2908         .init = unix_net_init,
2909         .exit = unix_net_exit,
2910 };
2911
2912 static int __init af_unix_init(void)
2913 {
2914         int rc = -1;
2915
2916         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2917
2918         rc = proto_register(&unix_proto, 1);
2919         if (rc != 0) {
2920                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2921                 goto out;
2922         }
2923
2924         sock_register(&unix_family_ops);
2925         register_pernet_subsys(&unix_net_ops);
2926 out:
2927         return rc;
2928 }
2929
2930 static void __exit af_unix_exit(void)
2931 {
2932         sock_unregister(PF_UNIX);
2933         proto_unregister(&unix_proto);
2934         unregister_pernet_subsys(&unix_net_ops);
2935 }
2936
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);