/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *	Linus Torvalds	:	Assorted bug cures.
 *	Niibe Yutaka	:	async I/O support.
 *	Carsten Paeth	:	PF_UNIX check, address fixes.
 *	Alan Cox	:	Limit size of allocated blocks.
 *	Alan Cox	:	Fixed the stupid socketpair bug.
 *	Alan Cox	:	BSD compatibility fine tuning.
 *	Alan Cox	:	Fixed a bug in connect when interrupted.
 *	Alan Cox	:	Sorted out a proper draft version of
 *				file descriptor passing hacked up from
 *				Mike Shaver's work.
 *	Marty Leisner	:	Fixes to fd passing.
 *	Nick Nevin	:	recvmsg bugfix.
 *	Alan Cox	:	Started proper garbage collector.
 *	Heiko EiBfeldt	:	Missing verify_area check.
 *	Alan Cox	:	Started POSIXisms.
 *	Andreas Schwab	:	Replaced inode by dentry for proper
 *				reference counting.
 *	Kirk Petersen	:	Made this a module.
 *	Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *				Lots of bug fixes.
 *	Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *				by the above two patches.
 *	Andrea Arcangeli:	If possible we block in connect(2)
 *				if the max backlog of the listen socket
 *				has been reached. This won't break
 *				old apps and it will avoid huge amounts
 *				of socks hashed (this for unix_gc()
 *				performance reasons).
 *				Security fix that limits the max
 *				number of socks to 2*max_files and
 *				the number of skbs queueable in the
 *				dgram receiver.
 *	Artur Skawina	:	Hash function optimizations.
 *	Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *	Malcolm Beattie	:	Set peercred for socketpair.
 *	Michal Ostrowski:	Module initialization cleanup.
 *	Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *				the core infrastructure is doing that
 *				for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS-based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
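/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * how a userspace program would bind a socket in the abstract namespace
 * described above.  The leading NUL byte in sun_path selects the abstract
 * namespace, and the address length counts only the bytes actually used.
 * The helper name bind_abstract() is ours, purely for illustration.
 */
#if 0
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int bind_abstract(int fd, const char *name)
{
	struct sockaddr_un addr;
	socklen_t len;

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	/* sun_path[0] stays '\0': abstract name, invisible in the FS */
	strncpy(addr.sun_path + 1, name, sizeof(addr.sun_path) - 2);
	len = offsetof(struct sockaddr_un, sun_path) + 1 + strlen(name);
	return bind(fd, (struct sockaddr *)&addr, len);
}
#endif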
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    the hash table is protected by the spinlock unix_table_lock;
 *    each socket's state is protected by its own separate spinlock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}
#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it doesn't start with a zero byte, it must be
 *		  NUL-terminated (an FS object).
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
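/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * the userspace view of the second writeability condition described
 * above.  A dgram client that got EAGAIN because the server's receive
 * queue was full can poll for POLLOUT; the peer_wait relay implemented
 * below is what makes this poll wake up.  The helper name
 * wait_writable() is ours, purely for illustration.
 */
#if 0
#include <poll.h>

static int wait_writable(int dgram_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = dgram_fd, .events = POLLOUT };

	/* returns > 0 once a subsequent sendto() may succeed */
	return poll(&pfd, 1, timeout_ms);
}
#endif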
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from the previous peer. First, this lets us do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
	}

	/* If one link of a bidirectional dgram pipe is disconnected,
	 * we signal an error. Messages are lost. Do not do this
	 * when the peer was not connected to us.
	 */
	if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
		other->sk_err = ECONNRESET;
		other->sk_error_report(other);
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
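/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * unix_set_peek_off() above is what runs when userspace sets SO_PEEK_OFF,
 * after which MSG_PEEK reads advance through queued data instead of
 * rereading it.  The helper name enable_peek_cursor() is ours.
 */
#if 0
#include <sys/socket.h>

static int enable_peek_cursor(int fd)
{
	int off = 0;	/* start peeking from the head of the queue */

	return setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
}
#endif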
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 * nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum + 1) & 0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;

	/*
	 * Get the parent directory, calculate the hash for the last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
/* Lock both sockets in pointer order to avoid an ABBA deadlock between
 * two tasks double-locking the same pair in opposite order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 * 1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
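/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * the AF_UNSPEC branch above is reached when userspace dissolves a
 * datagram association by connecting to an AF_UNSPEC address, as
 * 1003.1g allows.  The helper name disconnect_dgram() is ours.
 */
#if 0
#include <string.h>
#include <sys/socket.h>

static int disconnect_dgram(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;	/* drop the current peer, if any */
	return connect(fd, &sa, sizeof(sa));
}
#endif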
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we do it after the state is locked,
	 * we will have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * This is a tricky place. We need to grab our state lock and cannot
	 * drop the lock on the peer. That is dangerous because a deadlock is
	 * possible. Connect-to-self and simultaneous connect
	 * attempts are eliminated by checking socket
	 * state. other is TCP_LISTEN; if sk is TCP_LISTEN we
	 * check this before attempting to grab the lock.
	 *
	 * Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path, visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = sizeof(short);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;

	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() has been SMP-safe since last summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}
/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
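/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * the userspace side of the fd passing that unix_attach_fds() and
 * unix_detach_fds() implement.  A descriptor travels as SCM_RIGHTS
 * ancillary data on an AF_UNIX socket.  The helper name send_fd() is
 * ours, purely for illustration.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int send_fd(int sock, int fd_to_pass)
{
	char dummy = '*';	/* at least one byte of real data is needed */
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= u.buf,
		.msg_controllen	= sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type	 = SCM_RIGHTS;
	cmsg->cmsg_len	 = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}
#endif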
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
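/*
 * Illustrative sketch (not part of this file, excluded from the build):
 * the receiving side of the credential passing handled by
 * maybe_add_creds() above.  With SO_PASSCRED set, each message arrives
 * with an SCM_CREDENTIALS control message carrying pid/uid/gid.  The
 * helper name recv_with_creds() is ours, purely for illustration.
 */
#if 0
#define _GNU_SOURCE	/* for struct ucred */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t recv_with_creds(int sock, void *buf, size_t len,
			       struct ucred *out)
{
	char cbuf[CMSG_SPACE(sizeof(struct ucred))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	ssize_t n = recvmsg(sock, &msg, 0);

	if (n < 0)
		return n;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS)
			memcpy(out, CMSG_DATA(cmsg), sizeof(*out));
	return n;
}
#endif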
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 * Check with 1003.1g - what should a
		 * datagram error return?
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is the fast path; we don't necessarily need to
		 * call kfree_skb, and with newskb == NULL
		 * this does no harm
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
					      &err, &last);
		if (skb)
			break;

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
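/*
 * Editor's note -- a userspace sketch (not part of the kernel build) of
 * the return-value rule above: with MSG_TRUNC the call returns the full
 * datagram length (skb->len minus the peek offset) even when the buffer
 * is smaller, and MSG_PEEK leaves the datagram queued, so the pattern
 * below sizes a buffer before the real read. Names are illustrative.
 *
 *	#include <sys/socket.h>
 *
 *	// 'fd' is an AF_UNIX SOCK_DGRAM socket with data queued.
 *	ssize_t next_dgram_len(int fd)
 *	{
 *		char c;
 *		// Returns the real length of the next datagram without
 *		// consuming it; the datagram stays on the queue.
 *		return recv(fd, &c, 1, MSG_PEEK | MSG_TRUNC);
 *	}
 */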
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
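/*
 * Editor's note -- a sketch (not part of the kernel build) of the
 * sk_peek_offset() handling above, assuming a connected AF_UNIX
 * SOCK_STREAM descriptor 'fd'. Enabling SO_PEEK_OFF makes successive
 * MSG_PEEK reads walk forward through unread data instead of re-reading
 * the head; a normal read consumes data and moves the offset back.
 *
 *	#include <sys/socket.h>
 *
 *	char buf[4];
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 4, MSG_PEEK);	// peeks bytes 0..3, offset -> 4
 *	recv(fd, buf, 4, MSG_PEEK);	// peeks bytes 4..7, offset -> 8
 *	recv(fd, buf, 4, 0);		// consumes bytes 0..3, offset -> 4
 */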
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
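/*
 * Editor's note -- a userspace sketch (not part of the kernel build) of
 * the splice path above: data moves from a connected AF_UNIX stream
 * socket into a pipe without a userspace copy. The input offset must be
 * NULL; as the code shows, a non-NULL *ppos yields -ESPIPE.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int p[2];
 *
 *	pipe(p);
 *	// Up to 4096 bytes from 'sockfd' into the pipe, zero-copy.
 *	ssize_t n = splice(sockfd, NULL, p[1], NULL, 4096,
 *			   SPLICE_F_NONBLOCK);
 */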
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
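/*
 * Editor's note -- a sketch (not part of the kernel build) of the peer
 * mirroring above: on stream/seqpacket sockets, SHUT_WR on one end sets
 * RCV_SHUTDOWN on the peer, so the peer drains any queued data and then
 * reads EOF, while further writes on this end raise SIGPIPE/EPIPE.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "x", 1);
 *	shutdown(sv[0], SHUT_WR);
 *	read(sv[1], &c, 1);	// returns 1: queued data still readable
 *	read(sv[1], &c, 1);	// returns 0: EOF after the queue drains
 */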
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
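/*
 * Editor's note -- a sketch (not part of the kernel build) of the two
 * queue-length helpers as seen from userspace. On stream/seqpacket
 * sockets SIOCINQ reports total unread bytes; on datagram sockets it
 * reports the size of the datagram at the head of the queue. SIOCOUTQ
 * reflects sk_wmem_alloc, i.e. send-buffer memory still charged to this
 * socket until the peer consumes it (an estimate, not exact payload).
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq;
 *
 *	ioctl(fd, SIOCINQ, &inq);	// same value as FIONREAD
 *	ioctl(fd, SIOCOUTQ, &outq);	// bytes not yet read by the peer
 */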
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
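/*
 * Editor's note -- a sketch (not part of the kernel build) of
 * SIOCUNIXFILE from userspace. The caller needs CAP_NET_ADMIN in the
 * socket's user namespace; on success the ioctl return value is a new
 * O_PATH descriptor for the filesystem object the socket is bound to
 * (usable with fstatat()/readlinkat(), not read()/write()). The request
 * constant is assumed to come from <linux/un.h> on kernels that have it.
 *
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include <linux/un.h>
 *
 *	int pathfd = ioctl(sockfd, SIOCUNIXFILE);
 *	if (pathfd >= 0) {
 *		// pathfd is an O_PATH, close-on-exec descriptor;
 *		// inspect the inode, then close it.
 *		close(pathfd);
 *	}
 */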
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
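/*
 * Editor's note -- a sketch (not part of the kernel build) of the
 * writability rule above: a connected datagram sender is reported
 * writable only while the peer's receive queue has room, so a POLLOUT
 * wait parks the sender until the receiver catches up.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	// Blocks while the connected peer's queue is full; wakes when
 *	// the receiver drains it (see the peer_wait wakeup in
 *	// unix_dgram_recvmsg above).
 *	poll(&pfd, 1, -1);
 */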
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under unix_table_lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
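/*
 * Editor's note -- illustrative /proc/net/unix output produced by the
 * show routine above (values hypothetical; the kernel address is hashed
 * by %pK and typically reads as zeros for unprivileged readers):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 23456 /run/example.sock
 *	0000000000000000: 00000002 00000000 00000000 0001 01 23457 @abstract-name
 *
 * Flags 00010000 is __SO_ACCEPTCON (listening); Type 0001 is SOCK_STREAM;
 * a leading '@' (and any embedded NUL rendered as '@') marks an abstract
 * address, as written by the loop above.
 */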
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */

fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);