/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;
static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
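/*
 * Bound sockets live in the lower half of the table: abstract ones hashed
 * by name (see unix_hash_fold()), filesystem ones by inode number. Unbound
 * sockets sit in the upper half, hashed by the socket pointer above. That
 * split is what UNIX_ABSTRACT() relies on: an abstract address hash is
 * always < UNIX_HASH_SIZE, while a filesystem binding stores exactly
 * UNIX_HASH_SIZE in addr->hash (see unix_bind()).
 */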
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
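/*
 * Lockless variant for callers that hold neither the receive queue lock nor
 * the peer's state lock; the annotated reads keep the check a deliberate
 * (benign) data race rather than relying on a stable queue length.
 */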
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- it must not be zero length.
 *		- if it does not start with a zero byte, it must be NUL
 *		  terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	*hashp = 0;

	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist. However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
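/*
 * For illustration: a filesystem binding passes, e.g., sun_path = "/tmp/log"
 * with a terminating NUL, while an abstract binding passes sun_path[0] == 0
 * followed by arbitrary name bytes, their count implied by the sockaddr
 * length. Only the abstract form is hashed here; filesystem names are
 * resolved through the VFS instead (see unix_find_socket_byinode()).
 */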
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
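/*
 * A return of 1 from unix_dgram_peer_wake_me() means the caller must back
 * off: the peer's queue is still full and a relayed wakeup has been armed
 * on its peer_wait queue. A return of 0 means it is safe to proceed.
 */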
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
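/*
 * unix_writable() admits a writer only while at most a quarter of
 * sk_sndbuf is consumed by in-flight write memory (the << 2 above);
 * unix_write_space() below wakes poll()ers once that holds again.
 */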
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
	}

	/* If one link of a bidirectional dgram pipe is disconnected,
	 * we signal error. Messages are lost. Do not do this when the
	 * peer was not connected to us.
	 */
	if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
		other->sk_err = ECONNRESET;
		other->sk_error_report(other);
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
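/*
 * For illustration: autobind assigns an abstract name of a zero byte
 * followed by five hex digits of the rolling counter, e.g. ordernum 0xa2f1
 * (hypothetical value) yields the name "\0" "0a2f1", so at most 0x100000
 * candidates are tried before giving up with -ENOSPC.
 */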
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
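/*
 * The pair below takes two socket state locks in a consistent order (by
 * ascending pointer value, with the _nested annotation to keep lockdep
 * happy) so that concurrent dgram connect/send paths cannot deadlock.
 */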
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
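/*
 * unix_wait_for_peer() is entered with other's state lock held: it queues
 * an exclusive waiter on other's peer_wait, rechecks the flow-control
 * condition, drops the state lock, and sleeps for at most the remaining
 * timeout unless the peer died or shut down reception in the meantime.
 */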
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. That is dangerous because a deadlock is
	   possible. The connect-to-self case and a simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk were TCP_LISTEN we would
	   have checked this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path, visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to the listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
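/*
 * The connect/accept rendezvous above works by embryo: unix_stream_connect()
 * creates the peer sock up front and parks it on the listener's receive
 * queue inside the 1-byte skb; unix_accept() below dequeues that skb and
 * grafts skb->sk onto the accepting socket.
 */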
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = sizeof(short);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}
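/*
 * unix_inflight()/unix_notinflight() maintain the in-flight counts that
 * unix_gc() walks to find and break reference cycles of file descriptors
 * passed over AF_UNIX sockets; attach and detach must stay strictly paired.
 */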
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error return here?
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
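/*
 * In unix_stream_sendpage() below, the "if (false)" block is a backward-jump
 * target: alloc_skb: is only reached via goto once it is known that a fresh
 * skb is needed, and it drops both the peer's state lock and iolock before
 * the potentially sleeping allocation.
 */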
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is the fast path: the tail skb can take the page, so
		 * drop the speculatively allocated skb (consume_skb() does
		 * no harm even when newskb == NULL)
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
					      &err, &last);
		if (skb)
			break;

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable what to do on PEEK. We could:
		   - not return fds - simple, but loses information
		   - return fds, but not return them again on the real
		     read (the old strategy, apparently wrong)
		   - clone fds (chosen here as the most universal
		     solution; see the user-space sketch after this
		     function)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
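/*
 * Illustrative user-space view of the MSG_PEEK choice above (a minimal
 * sketch, assuming a connected AF_UNIX socket sock whose peer already
 * sent one descriptor via SCM_RIGHTS): the peek installs freshly
 * duplicated descriptors, and the real read installs another set, so the
 * caller receives the fd twice.
 *
 *	struct msghdr msg = { 0 };
 *	struct iovec iov = { buf, sizeof(buf) };
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	recvmsg(sock, &msg, MSG_PEEK);		// SCM_RIGHTS: one dup of the fd
 *	msg.msg_controllen = sizeof(cbuf);	// reset after the peek
 *	recvmsg(sock, &msg, 0);			// SCM_RIGHTS again: a second copy
 */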
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
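/*
 * unix_stream_read_generic() walks the receive queue once and hands each
 * chunk to ->recv_actor, so recvmsg() and splice() share all of the
 * queueing, credential and fd-passing logic and differ only in where the
 * payload goes: unix_stream_read_actor() copies into state->msg, while
 * unix_stream_splice_actor() feeds state->pipe (both defined below).
 */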
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while we sleep copying to the msg.
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
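/*
 * User-space sketch of the splice path above (illustrative only,
 * assuming a connected AF_UNIX stream socket sock): payload moves from
 * the socket's receive queue into a pipe without a round trip through
 * user memory.
 *
 *	int pfd[2];
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	// SPLICE_F_NONBLOCK (or O_NONBLOCK on the socket) becomes
 *	// MSG_DONTWAIT in state.flags above.
 *	n = splice(sock, NULL, pfd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 *	// n > 0: bytes moved; n < 0 with EAGAIN: nothing queued yet.
 */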
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
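/*
 * Illustrative user-space view of the mapping above (a minimal sketch):
 * SHUT_WR on one end sets SEND_SHUTDOWN locally and RCV_SHUTDOWN on the
 * peer, so the peer sees EOF while its own writes still work.
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	// half-close the write direction
 *	read(sv[1], &c, 1);		// returns 0: EOF from the peer
 *	write(sv[1], "x", 1);		// still succeeds the other way
 */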
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
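/*
 * Illustrative user-space use of the ioctls handled above (a minimal
 * sketch, assuming an AF_UNIX socket sock): SIOCINQ reports
 * queued-but-unread bytes via unix_inq_len(), and SIOCOUTQ reports
 * bytes still charged to the send buffer via unix_outq_len().
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int inq, outq;
 *
 *	ioctl(sock, SIOCINQ, &inq);	// bytes waiting in the receive queue
 *	ioctl(sock, SIOCOUTQ, &outq);	// bytes not yet consumed by the peer
 */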
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
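/*
 * The extra peer check above means a datagram sender polls as
 * not-writable while the receiver's queue is full, instead of busy
 * looping on EAGAIN. Minimal user-space sketch (illustrative only,
 * assuming a connected AF_UNIX datagram socket sock):
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLOUT };
 *
 *	// Blocks until the peer drains its receive queue (or an error).
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLOUT)
 *		send(sock, buf, len, MSG_DONTWAIT);
 */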
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
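/*
 * The seq_file position packs (bucket, in-bucket offset) into one
 * loff_t: the bucket lives in the high bits, the 1-based offset in the
 * low BUCKET_SPACE bits. Worked example, assuming BITS_PER_LONG == 64
 * and UNIX_HASH_BITS == 8: BUCKET_SPACE is 64 - 9 - 1 = 54, so
 * set_bucket_offset(3, 2) yields (3 << 54) | 2, and get_bucket() /
 * get_offset() recover 3 and 2 respectively.
 */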
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under unix_table_lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
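/*
 * The resulting /proc/net/unix lines look roughly like this
 * (illustrative sample; addresses, inodes and flags will differ, and
 * %pK prints zeros for unprivileged readers under kptr_restrict):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 *	0000000000000000: 00000002 00000000 00000000 0002 01 12346 @abstract-name
 *
 * Abstract names are printed with a leading '@', and NUL bytes inside
 * an abstract name are rendered as '@' as well.
 */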
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);