// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski   :       Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
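/*
 * Illustrative example (not part of the original source): from userspace,
 * an abstract name is bound by putting a leading NUL byte in sun_path and
 * passing the exact length, e.g. for the 5-byte abstract name "\0demo":
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	memcpy(sun.sun_path, "\0demo", 5);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 5);
 */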
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>

#include "scm.h"
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}
#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NUL-terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	*hashp = 0;

	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
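/*
 * Worked example (illustrative, not part of the original source): for a
 * filesystem bind to "/tmp/x", userspace passes sun_family plus the path,
 * so len >= 2 + 6.  unix_mkname() NUL-terminates the path in place and
 * returns strlen("/tmp/x") + 1 + sizeof(short) == 9, leaving *hashp at 0;
 * such sockets are later hashed by inode (see unix_bind()), not by name.
 */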
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
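/*
 * Note (added for clarity): the shift above means the socket counts as
 * writable only while outstanding write memory stays at or below one
 * quarter of sk_sndbuf, leaving headroom before writers are throttled.
 */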
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from the previous peer. First, this allows flow
 * control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is
		 * disconnected, we signal error. Messages are lost.
		 * Do not do this when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
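/*
 * Illustrative example (not part of the original source): userspace drives
 * this hook through the generic SO_PEEK_OFF socket option, e.g.
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *
 * after which MSG_PEEK reads advance the peek offset instead of re-reading
 * the same bytes.
 */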
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum + 1) & 0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
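/*
 * Illustrative example (not part of the original source): userspace
 * triggers autobind by binding with only the address family, after which
 * getsockname() reports an abstract name of five hex digits:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sa_family_t));
 *	// getsockname() now yields e.g. "\0" "00001" in sun_path
 */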
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is a tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
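/*
 * Illustrative example (not part of the original source): userspace obtains
 * such a back-to-back pair with
 *
 *	int sv[2];
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *
 * Both ends are born connected; no bind/listen/accept is involved.
 */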
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = sizeof(short);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
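/*
 * Illustrative example (not part of the original source): a receiver opts
 * in to these credentials from userspace with
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *
 * after which recvmsg() delivers an SCM_CREDENTIALS control message
 * carrying the sender's struct ucred (pid/uid/gid).
 */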
static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
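/*
 * Worked example (added for clarity): with 4 KiB pages, get_order(32768)
 * is 3, so UNIX_SKB_FRAGS_SZ is 32 KiB; with 64 KiB pages get_order(32768)
 * is 0 and the macro evaluates to a single full page, hence the "minimum
 * of a full page" above.
 */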
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* fast path: the tail skb can still be appended to, so the
		 * speculatively allocated newskb is not needed; drop it
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err,
					      &last);
		if (skb)
			break;

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
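/*
 * Illustrative example (not part of the original source): the fds carried
 * in UNIXCB(skb).fp reach userspace as an SCM_RIGHTS control message:
 *
 *	int passed_fd = -1;
 *	char buf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg = { .msg_control = buf,
 *			      .msg_controllen = sizeof(buf) };
 *	recvmsg(fd, &msg, 0);
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
 *	if (c && c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS)
 *		memcpy(&passed_fd, CMSG_DATA(c), sizeof(int));
 */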
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while we sleep in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}

static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}

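/*
 * Illustrative userspace sketch (not part of this file; names are
 * arbitrary): splice() from a connected stream socket into a pipe ends up
 * in the splice actor above. With SPLICE_F_NONBLOCK (or O_NONBLOCK on the
 * socket file) state.flags becomes MSG_DONTWAIT, so an empty queue yields
 * EAGAIN instead of sleeping:
 *
 *	ssize_t n = splice(unix_fd, NULL, pipe_wr_fd, NULL, 4096,
 *			   SPLICE_F_NONBLOCK);
 */
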
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

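/*
 * Illustrative userspace sketch (not part of this file): because the peer's
 * sk_shutdown is mirrored above, shutting down the write side of one end of
 * a stream pair makes reads on the other end drain and then return 0 (EOF):
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);
 *	read(sv[1], &c, 1);	(returns 0 once the queue is drained)
 */
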
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}

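/*
 * Illustrative userspace sketch (not part of this file): SIOCUNIXFILE (see
 * unix_ioctl() below) hands back an O_CLOEXEC, O_PATH descriptor for the
 * filesystem inode a socket is bound to, usable with the *at() family. The
 * caller needs CAP_NET_ADMIN, per the check above; error handling omitted:
 *
 *	int path_fd = ioctl(sock_fd, SIOCUNIXFILE);
 *	struct stat st;
 *
 *	fstatat(path_fd, "", &st, AT_EMPTY_PATH);
 */
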
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

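/*
 * Illustrative userspace sketch (not part of this file): SIOCINQ reports
 * unread bytes (summed over all queued skbs for stream/seqpacket sockets,
 * the first datagram only for dgram, as unix_inq_len() shows) and SIOCOUTQ
 * reports bytes still charged to the send buffer:
 *
 *	int pending_in, pending_out;
 *
 *	ioctl(fd, SIOCINQ, &pending_in);
 *	ioctl(fd, SIOCOUTQ, &pending_out);
 */
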
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

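/*
 * For connected datagram and seqpacket sockets, writability also depends on
 * the peer: unix_dgram_poll() below reports the socket as not writable when
 * the peer's receive queue is full, after registering on the peer's wait
 * queue (unix_dgram_peer_wake_me()) so a later read by the peer wakes this
 * poller again.
 */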
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

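/*
 * The seq_file position packs (hash bucket, 1-based offset within bucket)
 * into a single loff_t: the bucket index occupies the top bits and the
 * offset the low BUCKET_SPACE bits. Assuming a 64-bit build with
 * UNIX_HASH_BITS == 8, BUCKET_SPACE is 64 - 9 - 1 = 54, so, e.g.,
 * set_bucket_offset(3, 2) == (3UL << 54) | 2.
 */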
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under unix_table_lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

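/*
 * A /proc/net/unix entry emitted above looks roughly like this
 * (illustrative values only; %pK is typically hashed or zeroed for
 * unprivileged readers):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 @name
 *
 * An '@' prefix marks an abstract name, and NUL bytes inside the name are
 * rendered as '@' as well.
 */
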
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);