/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it avoids a huge number
 *					of hashed socks (for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					receive queue.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug?).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
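/*
 * Editor's illustrative sketch (not part of the original file): how the
 * abstract namespace described above looks from userspace. The name
 * "\0example" is made up; the leading zero byte selects the abstract
 * space and the name length is carried in the address length argument,
 * not by NUL termination:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	memcpy(sun.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */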
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	return hash & (UNIX_HASH_SIZE - 1);
}
#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *  Check unix socket name:
 *	- should not be zero length.
 *	- if it does not start with a zero byte, it should be NUL terminated (FS object)
 *	- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
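/*
 * Editor's worked example of the two address forms unix_mkname() accepts
 * (the names are made up; sizeof(short) is the sun_family field):
 *
 *	pathname:  sun_path = "/tmp/sock"   (first byte non-zero)
 *	           len is recomputed as strlen("/tmp/sock") + 1 + sizeof(short)
 *	           and no hash is derived here, the filesystem inode is used.
 *
 *	abstract:  sun_path = "\0srv"       (first byte zero)
 *	           len is taken exactly as passed by the caller and
 *	           *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)).
 */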
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (e.g., /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
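/*
 * Editorial userspace view of the mechanism above (an illustrative
 * sketch; names and addresses are made up). A client connected to a
 * busy datagram server blocks in poll() for POLLOUT; the wake-up
 * relayed through the server's peer_wait queue is what eventually
 * lets this loop make progress once the server drains its queue:
 *
 *	connect(fd, (struct sockaddr *)&server_addr, addrlen);
 *	for (;;) {
 *		if (send(fd, buf, len, MSG_DONTWAIT) >= 0)
 *			break;
 *		if (errno != EAGAIN)
 *			break;
 *		struct pollfd p = { .fd = fd, .events = POLLOUT };
 *		poll(&p, 1, -1);	(sleeps on this socket's wait queue)
 *	}
 */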
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
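/*
 * Editorial worked example of the unix_writable() test above, assuming
 * the common default sk_sndbuf of 212992 bytes: the socket counts as
 * writable while (wmem_alloc << 2) <= sk_sndbuf, i.e. while at most
 * 212992 / 4 = 53248 bytes of send buffer are in flight. With, say,
 * 60000 bytes queued, 60000 << 2 = 240000 > 212992, so poll() stops
 * reporting EPOLLOUT until unix_write_space() runs on the next free.
 */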
/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from the previous peer. First, it allows flow
 * control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
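/*
 * Editor's sketch of what unix_autobind() above produces, as seen from
 * userspace. Autobinding happens on a bind() carrying only sun_family,
 * or implicitly on connect/send with SOCK_PASSCRED while still unbound.
 * The five hex digits come from the static 'ordernum' counter, so the
 * exact value below is only an example:
 *
 *	struct sockaddr_un sun;
 *	socklen_t alen = sizeof(sun);
 *
 *	getsockname(fd, (struct sockaddr *)&sun, &alen);
 *	    alen == sizeof(sa_family_t) + 6
 *	    sun_path begins with one zero byte followed by "0004a"
 *	    (an abstract name, so UNIX_ABSTRACT() is true for this socket)
 */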
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for the last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
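/*
 * Editorial usage sketch for the filesystem binding path handled above
 * (the path name is made up). bind() creates a socket inode via
 * unix_mknod() with mode S_IFSOCK masked by the caller's umask; a second
 * bind to the same path fails with EADDRINUSE until the file is
 * unlinked, and closing the socket does not remove the name:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	strcpy(sun.sun_path, "/run/example.sock");
 *
 *	unlink(sun.sun_path);              (typical server-side cleanup)
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + strlen(sun.sun_path) + 1);
 */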
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1120 static int unix_dgram_connect(struct socket
*sock
, struct sockaddr
*addr
,
1121 int alen
, int flags
)
1123 struct sock
*sk
= sock
->sk
;
1124 struct net
*net
= sock_net(sk
);
1125 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)addr
;
1131 if (alen
< offsetofend(struct sockaddr
, sa_family
))
1134 if (addr
->sa_family
!= AF_UNSPEC
) {
1135 err
= unix_mkname(sunaddr
, alen
, &hash
);
1140 if (test_bit(SOCK_PASSCRED
, &sock
->flags
) &&
1141 !unix_sk(sk
)->addr
&& (err
= unix_autobind(sock
)) != 0)
1145 other
= unix_find_other(net
, sunaddr
, alen
, sock
->type
, hash
, &err
);
1149 unix_state_double_lock(sk
, other
);
1151 /* Apparently VFS overslept socket death. Retry. */
1152 if (sock_flag(other
, SOCK_DEAD
)) {
1153 unix_state_double_unlock(sk
, other
);
1159 if (!unix_may_send(sk
, other
))
1162 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
1168 * 1003.1g breaking connected state with AF_UNSPEC
1171 unix_state_double_lock(sk
, other
);
1175 * If it was connected, reconnect.
1177 if (unix_peer(sk
)) {
1178 struct sock
*old_peer
= unix_peer(sk
);
1179 unix_peer(sk
) = other
;
1180 unix_dgram_peer_wake_disconnect_wakeup(sk
, old_peer
);
1182 unix_state_double_unlock(sk
, other
);
1184 if (other
!= old_peer
)
1185 unix_dgram_disconnected(sk
, old_peer
);
1188 unix_peer(sk
) = other
;
1189 unix_state_double_unlock(sk
, other
);
1194 unix_state_double_unlock(sk
, other
);
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1221 static int unix_stream_connect(struct socket
*sock
, struct sockaddr
*uaddr
,
1222 int addr_len
, int flags
)
1224 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)uaddr
;
1225 struct sock
*sk
= sock
->sk
;
1226 struct net
*net
= sock_net(sk
);
1227 struct unix_sock
*u
= unix_sk(sk
), *newu
, *otheru
;
1228 struct sock
*newsk
= NULL
;
1229 struct sock
*other
= NULL
;
1230 struct sk_buff
*skb
= NULL
;
1236 err
= unix_mkname(sunaddr
, addr_len
, &hash
);
1241 if (test_bit(SOCK_PASSCRED
, &sock
->flags
) && !u
->addr
&&
1242 (err
= unix_autobind(sock
)) != 0)
1245 timeo
= sock_sndtimeo(sk
, flags
& O_NONBLOCK
);
1247 /* First of all allocate resources.
1248 If we will make it after state is locked,
1249 we will have to recheck all again in any case.
1254 /* create new sock for complete connection */
1255 newsk
= unix_create1(sock_net(sk
), NULL
, 0);
1259 /* Allocate skb for sending to listening sock */
1260 skb
= sock_wmalloc(newsk
, 1, 0, GFP_KERNEL
);
1265 /* Find listening sock. */
1266 other
= unix_find_other(net
, sunaddr
, addr_len
, sk
->sk_type
, hash
, &err
);
1270 /* Latch state of peer */
1271 unix_state_lock(other
);
1273 /* Apparently VFS overslept socket death. Retry. */
1274 if (sock_flag(other
, SOCK_DEAD
)) {
1275 unix_state_unlock(other
);
1280 err
= -ECONNREFUSED
;
1281 if (other
->sk_state
!= TCP_LISTEN
)
1283 if (other
->sk_shutdown
& RCV_SHUTDOWN
)
1286 if (unix_recvq_full(other
)) {
1291 timeo
= unix_wait_for_peer(other
, timeo
);
1293 err
= sock_intr_errno(timeo
);
1294 if (signal_pending(current
))
1302 It is tricky place. We need to grab our state lock and cannot
1303 drop lock on peer. It is dangerous because deadlock is
1304 possible. Connect to self case and simultaneous
1305 attempt to connect are eliminated by checking socket
1306 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1307 check this before attempt to grab lock.
1309 Well, and we have to recheck the state after socket locked.
1315 /* This is ok... continue with connect */
1317 case TCP_ESTABLISHED
:
1318 /* Socket is already connected */
1326 unix_state_lock_nested(sk
);
1328 if (sk
->sk_state
!= st
) {
1329 unix_state_unlock(sk
);
1330 unix_state_unlock(other
);
1335 err
= security_unix_stream_connect(sk
, other
, newsk
);
1337 unix_state_unlock(sk
);
1341 /* The way is open! Fastly set all the necessary fields... */
1344 unix_peer(newsk
) = sk
;
1345 newsk
->sk_state
= TCP_ESTABLISHED
;
1346 newsk
->sk_type
= sk
->sk_type
;
1347 init_peercred(newsk
);
1348 newu
= unix_sk(newsk
);
1349 RCU_INIT_POINTER(newsk
->sk_wq
, &newu
->peer_wq
);
1350 otheru
= unix_sk(other
);
1352 /* copy address information from listening to new sock
1354 * The contents of *(otheru->addr) and otheru->path
1355 * are seen fully set up here, since we have found
1356 * otheru in hash under unix_table_lock. Insertion
1357 * into the hash chain we'd found it in had been done
1358 * in an earlier critical area protected by unix_table_lock,
1359 * the same one where we'd set *(otheru->addr) contents,
1360 * as well as otheru->path and otheru->addr itself.
1362 * Using smp_store_release() here to set newu->addr
1363 * is enough to make those stores, as well as stores
1364 * to newu->path visible to anyone who gets newu->addr
1365 * by smp_load_acquire(). IOW, the same warranties
1366 * as for unix_sock instances bound in unix_bind() or
1367 * in unix_autobind().
1369 if (otheru
->path
.dentry
) {
1370 path_get(&otheru
->path
);
1371 newu
->path
= otheru
->path
;
1373 refcount_inc(&otheru
->addr
->refcnt
);
1374 smp_store_release(&newu
->addr
, otheru
->addr
);
1376 /* Set credentials */
1377 copy_peercred(sk
, other
);
1379 sock
->state
= SS_CONNECTED
;
1380 sk
->sk_state
= TCP_ESTABLISHED
;
1383 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1384 unix_peer(sk
) = newsk
;
1386 unix_state_unlock(sk
);
1388 /* take ten and and send info to listening sock */
1389 spin_lock(&other
->sk_receive_queue
.lock
);
1390 __skb_queue_tail(&other
->sk_receive_queue
, skb
);
1391 spin_unlock(&other
->sk_receive_queue
.lock
);
1392 unix_state_unlock(other
);
1393 other
->sk_data_ready(other
);
1399 unix_state_unlock(other
);
1404 unix_release_sock(newsk
, 0);
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
1440 static int unix_accept(struct socket
*sock
, struct socket
*newsock
, int flags
,
1443 struct sock
*sk
= sock
->sk
;
1445 struct sk_buff
*skb
;
1449 if (sock
->type
!= SOCK_STREAM
&& sock
->type
!= SOCK_SEQPACKET
)
1453 if (sk
->sk_state
!= TCP_LISTEN
)
1456 /* If socket state is TCP_LISTEN it cannot change (for now...),
1457 * so that no locks are necessary.
1460 skb
= skb_recv_datagram(sk
, 0, flags
&O_NONBLOCK
, &err
);
1462 /* This means receive shutdown. */
1469 skb_free_datagram(sk
, skb
);
1470 wake_up_interruptible(&unix_sk(sk
)->peer_wait
);
1472 /* attach accepted sock to socket */
1473 unix_state_lock(tsk
);
1474 newsock
->state
= SS_CONNECTED
;
1475 unix_sock_inherit_flags(sock
, newsock
);
1476 sock_graft(tsk
, newsock
);
1477 unix_state_unlock(tsk
);
1485 static int unix_getname(struct socket
*sock
, struct sockaddr
*uaddr
, int peer
)
1487 struct sock
*sk
= sock
->sk
;
1488 struct unix_address
*addr
;
1489 DECLARE_SOCKADDR(struct sockaddr_un
*, sunaddr
, uaddr
);
1493 sk
= unix_peer_get(sk
);
1503 addr
= smp_load_acquire(&unix_sk(sk
)->addr
);
1505 sunaddr
->sun_family
= AF_UNIX
;
1506 sunaddr
->sun_path
[0] = 0;
1507 err
= sizeof(short);
1510 memcpy(sunaddr
, addr
->name
, addr
->len
);
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}
/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}
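/*
 * Editorial sketch of the userspace side of the fd passing that
 * unix_attach_fds()/unix_detach_fds() service (values are illustrative).
 * The descriptors ride in an SCM_RIGHTS control message and are counted
 * against the sender's RLIMIT_NOFILE via user->unix_inflight, as checked
 * in too_many_unix_fds() above:
 *
 *	int fds[1] = { passed_fd };
 *	char cbuf[CMSG_SPACE(sizeof(fds))];
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(fds));
 *	memcpy(CMSG_DATA(cm), fds, sizeof(fds));
 *	sendmsg(sock_fd, &msg, 0);
 */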
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
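/*
 * Editorial sketch of how a receiver observes the credentials that
 * maybe_add_creds()/unix_scm_to_skb() attach when SOCK_PASSCRED is set
 * on either end (illustrative only, error handling omitted):
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	...
 *	recvmsg(fd, &msg, 0);
 *	for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm;
 *	     cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_SOCKET &&
 *		    cm->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred uc;
 *			memcpy(&uc, CMSG_DATA(cm), sizeof(uc));
 *			(uc.pid, uc.uid, uc.gid now hold the peer's ids)
 *		}
 */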
1648 * Send AF_UNIX data.
1651 static int unix_dgram_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
1654 struct sock
*sk
= sock
->sk
;
1655 struct net
*net
= sock_net(sk
);
1656 struct unix_sock
*u
= unix_sk(sk
);
1657 DECLARE_SOCKADDR(struct sockaddr_un
*, sunaddr
, msg
->msg_name
);
1658 struct sock
*other
= NULL
;
1659 int namelen
= 0; /* fake GCC */
1662 struct sk_buff
*skb
;
1664 struct scm_cookie scm
;
1669 err
= scm_send(sock
, msg
, &scm
, false);
1674 if (msg
->msg_flags
&MSG_OOB
)
1677 if (msg
->msg_namelen
) {
1678 err
= unix_mkname(sunaddr
, msg
->msg_namelen
, &hash
);
1685 other
= unix_peer_get(sk
);
1690 if (test_bit(SOCK_PASSCRED
, &sock
->flags
) && !u
->addr
1691 && (err
= unix_autobind(sock
)) != 0)
1695 if (len
> sk
->sk_sndbuf
- 32)
1698 if (len
> SKB_MAX_ALLOC
) {
1699 data_len
= min_t(size_t,
1700 len
- SKB_MAX_ALLOC
,
1701 MAX_SKB_FRAGS
* PAGE_SIZE
);
1702 data_len
= PAGE_ALIGN(data_len
);
1704 BUILD_BUG_ON(SKB_MAX_ALLOC
< PAGE_SIZE
);
1707 skb
= sock_alloc_send_pskb(sk
, len
- data_len
, data_len
,
1708 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
1709 PAGE_ALLOC_COSTLY_ORDER
);
1713 err
= unix_scm_to_skb(&scm
, skb
, true);
1717 skb_put(skb
, len
- data_len
);
1718 skb
->data_len
= data_len
;
1720 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, len
);
1724 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
1729 if (sunaddr
== NULL
)
1732 other
= unix_find_other(net
, sunaddr
, namelen
, sk
->sk_type
,
1738 if (sk_filter(other
, skb
) < 0) {
1739 /* Toss the packet but do not return any error to the sender */
1745 unix_state_lock(other
);
1748 if (!unix_may_send(sk
, other
))
1751 if (unlikely(sock_flag(other
, SOCK_DEAD
))) {
1753 * Check with 1003.1g - what should
1756 unix_state_unlock(other
);
1760 unix_state_lock(sk
);
1763 if (unix_peer(sk
) == other
) {
1764 unix_peer(sk
) = NULL
;
1765 unix_dgram_peer_wake_disconnect_wakeup(sk
, other
);
1767 unix_state_unlock(sk
);
1769 unix_dgram_disconnected(sk
, other
);
1771 err
= -ECONNREFUSED
;
1773 unix_state_unlock(sk
);
1783 if (other
->sk_shutdown
& RCV_SHUTDOWN
)
1786 if (sk
->sk_type
!= SOCK_SEQPACKET
) {
1787 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
1792 /* other == sk && unix_peer(other) != sk if
1793 * - unix_peer(sk) == NULL, destination address bound to sk
1794 * - unix_peer(sk) == sk by time of get but disconnected before lock
1797 unlikely(unix_peer(other
) != sk
&&
1798 unix_recvq_full_lockless(other
))) {
1800 timeo
= unix_wait_for_peer(other
, timeo
);
1802 err
= sock_intr_errno(timeo
);
1803 if (signal_pending(current
))
1810 unix_state_unlock(other
);
1811 unix_state_double_lock(sk
, other
);
1814 if (unix_peer(sk
) != other
||
1815 unix_dgram_peer_wake_me(sk
, other
)) {
1823 goto restart_locked
;
1827 if (unlikely(sk_locked
))
1828 unix_state_unlock(sk
);
1830 if (sock_flag(other
, SOCK_RCVTSTAMP
))
1831 __net_timestamp(skb
);
1832 maybe_add_creds(skb
, sock
, other
);
1833 skb_queue_tail(&other
->sk_receive_queue
, skb
);
1834 unix_state_unlock(other
);
1835 other
->sk_data_ready(other
);
1842 unix_state_unlock(sk
);
1843 unix_state_unlock(other
);
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
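/*
 * Editorial worked example of the sizing done in unix_stream_sendmsg()
 * below, assuming 4 KiB pages and the common default 212992-byte
 * sk_sndbuf:
 *
 *	UNIX_SKB_FRAGS_SZ = PAGE_SIZE << get_order(32768) = 32768
 *	size     = min(len, sk_sndbuf/2 - 64) = min(len, 106432)
 *	size     = min(size, SKB_MAX_HEAD(0) + 32768)
 *	data_len = max(0, size - SKB_MAX_HEAD(0)), page aligned
 *
 * so a large write is cut into skbs whose paged part is capped at
 * 32 KiB, with the remainder carried in the linear head.
 */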
1858 static int unix_stream_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
1861 struct sock
*sk
= sock
->sk
;
1862 struct sock
*other
= NULL
;
1864 struct sk_buff
*skb
;
1866 struct scm_cookie scm
;
1867 bool fds_sent
= false;
1871 err
= scm_send(sock
, msg
, &scm
, false);
1876 if (msg
->msg_flags
&MSG_OOB
)
1879 if (msg
->msg_namelen
) {
1880 err
= sk
->sk_state
== TCP_ESTABLISHED
? -EISCONN
: -EOPNOTSUPP
;
1884 other
= unix_peer(sk
);
1889 if (sk
->sk_shutdown
& SEND_SHUTDOWN
)
1892 while (sent
< len
) {
1895 /* Keep two messages in the pipe so it schedules better */
1896 size
= min_t(int, size
, (sk
->sk_sndbuf
>> 1) - 64);
1898 /* allow fallback to order-0 allocations */
1899 size
= min_t(int, size
, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ
);
1901 data_len
= max_t(int, 0, size
- SKB_MAX_HEAD(0));
1903 data_len
= min_t(size_t, size
, PAGE_ALIGN(data_len
));
1905 skb
= sock_alloc_send_pskb(sk
, size
- data_len
, data_len
,
1906 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
1907 get_order(UNIX_SKB_FRAGS_SZ
));
1911 /* Only send the fds in the first buffer */
1912 err
= unix_scm_to_skb(&scm
, skb
, !fds_sent
);
1919 skb_put(skb
, size
- data_len
);
1920 skb
->data_len
= data_len
;
1922 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, size
);
1928 unix_state_lock(other
);
1930 if (sock_flag(other
, SOCK_DEAD
) ||
1931 (other
->sk_shutdown
& RCV_SHUTDOWN
))
1934 maybe_add_creds(skb
, sock
, other
);
1935 skb_queue_tail(&other
->sk_receive_queue
, skb
);
1936 unix_state_unlock(other
);
1937 other
->sk_data_ready(other
);
1946 unix_state_unlock(other
);
1949 if (sent
== 0 && !(msg
->msg_flags
&MSG_NOSIGNAL
))
1950 send_sig(SIGPIPE
, current
, 0);
1954 return sent
? : err
;
1957 static ssize_t
unix_stream_sendpage(struct socket
*socket
, struct page
*page
,
1958 int offset
, size_t size
, int flags
)
1961 bool send_sigpipe
= false;
1962 bool init_scm
= true;
1963 struct scm_cookie scm
;
1964 struct sock
*other
, *sk
= socket
->sk
;
1965 struct sk_buff
*skb
, *newskb
= NULL
, *tail
= NULL
;
1967 if (flags
& MSG_OOB
)
1970 other
= unix_peer(sk
);
1971 if (!other
|| sk
->sk_state
!= TCP_ESTABLISHED
)
1976 unix_state_unlock(other
);
1977 mutex_unlock(&unix_sk(other
)->iolock
);
1978 newskb
= sock_alloc_send_pskb(sk
, 0, 0, flags
& MSG_DONTWAIT
,
1984 /* we must acquire iolock as we modify already present
1985 * skbs in the sk_receive_queue and mess with skb->len
1987 err
= mutex_lock_interruptible(&unix_sk(other
)->iolock
);
1989 err
= flags
& MSG_DONTWAIT
? -EAGAIN
: -ERESTARTSYS
;
1993 if (sk
->sk_shutdown
& SEND_SHUTDOWN
) {
1995 send_sigpipe
= true;
1999 unix_state_lock(other
);
2001 if (sock_flag(other
, SOCK_DEAD
) ||
2002 other
->sk_shutdown
& RCV_SHUTDOWN
) {
2004 send_sigpipe
= true;
2005 goto err_state_unlock
;
2009 err
= maybe_init_creds(&scm
, socket
, other
);
2011 goto err_state_unlock
;
2015 skb
= skb_peek_tail(&other
->sk_receive_queue
);
2016 if (tail
&& tail
== skb
) {
2018 } else if (!skb
|| !unix_skb_scm_eq(skb
, &scm
)) {
2025 } else if (newskb
) {
2026 /* this is fast path, we don't necessarily need to
2027 * call to kfree_skb even though with newskb == NULL
2028 * this - does no harm
2030 consume_skb(newskb
);
2034 if (skb_append_pagefrags(skb
, page
, offset
, size
)) {
2040 skb
->data_len
+= size
;
2041 skb
->truesize
+= size
;
2042 refcount_add(size
, &sk
->sk_wmem_alloc
);
2045 err
= unix_scm_to_skb(&scm
, skb
, false);
2047 goto err_state_unlock
;
2048 spin_lock(&other
->sk_receive_queue
.lock
);
2049 __skb_queue_tail(&other
->sk_receive_queue
, newskb
);
2050 spin_unlock(&other
->sk_receive_queue
.lock
);
2053 unix_state_unlock(other
);
2054 mutex_unlock(&unix_sk(other
)->iolock
);
2056 other
->sk_data_ready(other
);
2061 unix_state_unlock(other
);
2063 mutex_unlock(&unix_sk(other
)->iolock
);
2066 if (send_sigpipe
&& !(flags
& MSG_NOSIGNAL
))
2067 send_sig(SIGPIPE
, current
, 0);
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
2113 static int unix_dgram_recvmsg(struct socket
*sock
, struct msghdr
*msg
,
2114 size_t size
, int flags
)
2116 struct scm_cookie scm
;
2117 struct sock
*sk
= sock
->sk
;
2118 struct unix_sock
*u
= unix_sk(sk
);
2119 struct sk_buff
*skb
, *last
;
2128 timeo
= sock_rcvtimeo(sk
, flags
& MSG_DONTWAIT
);
2131 mutex_lock(&u
->iolock
);
2133 skip
= sk_peek_offset(sk
, flags
);
2134 skb
= __skb_try_recv_datagram(sk
, flags
, NULL
, &peeked
, &skip
,
2139 mutex_unlock(&u
->iolock
);
2144 !__skb_wait_for_more_packets(sk
, &err
, &timeo
, last
));
2146 if (!skb
) { /* implies iolock unlocked */
2147 unix_state_lock(sk
);
2148 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2149 if (sk
->sk_type
== SOCK_SEQPACKET
&& err
== -EAGAIN
&&
2150 (sk
->sk_shutdown
& RCV_SHUTDOWN
))
2152 unix_state_unlock(sk
);
2156 if (wq_has_sleeper(&u
->peer_wait
))
2157 wake_up_interruptible_sync_poll(&u
->peer_wait
,
2158 EPOLLOUT
| EPOLLWRNORM
|
2162 unix_copy_addr(msg
, skb
->sk
);
2164 if (size
> skb
->len
- skip
)
2165 size
= skb
->len
- skip
;
2166 else if (size
< skb
->len
- skip
)
2167 msg
->msg_flags
|= MSG_TRUNC
;
2169 err
= skb_copy_datagram_msg(skb
, skip
, msg
, size
);
2173 if (sock_flag(sk
, SOCK_RCVTSTAMP
))
2174 __sock_recv_timestamp(msg
, sk
, skb
);
2176 memset(&scm
, 0, sizeof(scm
));
2178 scm_set_cred(&scm
, UNIXCB(skb
).pid
, UNIXCB(skb
).uid
, UNIXCB(skb
).gid
);
2179 unix_set_secdata(&scm
, skb
);
2181 if (!(flags
& MSG_PEEK
)) {
2183 unix_detach_fds(&scm
, skb
);
2185 sk_peek_offset_bwd(sk
, skb
->len
);
2187 /* It is questionable: on PEEK we could:
2188 - do not return fds - good, but too simple 8)
2189 - return fds, and do not return them on read (old strategy,
2191 - clone fds (I chose it for now, it is the most universal
2194 POSIX 1003.1g does not actually define this clearly
2195 at all. POSIX 1003.1g doesn't define a lot of things
2200 sk_peek_offset_fwd(sk
, size
);
2203 scm
.fp
= scm_fp_dup(UNIXCB(skb
).fp
);
2205 err
= (flags
& MSG_TRUNC
) ? skb
->len
- skip
: size
;
2207 scm_recv(sock
, msg
, &scm
, flags
);
2210 skb_free_datagram(sk
, skb
);
2211 mutex_unlock(&u
->iolock
);
2217 * Sleep until more data has arrived. But check for races..
2219 static long unix_stream_data_wait(struct sock
*sk
, long timeo
,
2220 struct sk_buff
*last
, unsigned int last_len
,
2223 struct sk_buff
*tail
;
2226 unix_state_lock(sk
);
2229 prepare_to_wait(sk_sleep(sk
), &wait
, TASK_INTERRUPTIBLE
);
2231 tail
= skb_peek_tail(&sk
->sk_receive_queue
);
2233 (tail
&& tail
->len
!= last_len
) ||
2235 (sk
->sk_shutdown
& RCV_SHUTDOWN
) ||
2236 signal_pending(current
) ||
2240 sk_set_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
2241 unix_state_unlock(sk
);
2243 timeo
= freezable_schedule_timeout(timeo
);
2245 timeo
= schedule_timeout(timeo
);
2246 unix_state_lock(sk
);
2248 if (sock_flag(sk
, SOCK_DEAD
))
2251 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
2254 finish_wait(sk_sleep(sk
), &wait
);
2255 unix_state_unlock(sk
);
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
2275 static int unix_stream_read_generic(struct unix_stream_read_state
*state
,
2278 struct scm_cookie scm
;
2279 struct socket
*sock
= state
->socket
;
2280 struct sock
*sk
= sock
->sk
;
2281 struct unix_sock
*u
= unix_sk(sk
);
2283 int flags
= state
->flags
;
2284 int noblock
= flags
& MSG_DONTWAIT
;
2285 bool check_creds
= false;
2290 size_t size
= state
->size
;
2291 unsigned int last_len
;
2293 if (unlikely(sk
->sk_state
!= TCP_ESTABLISHED
)) {
2298 if (unlikely(flags
& MSG_OOB
)) {
2303 target
= sock_rcvlowat(sk
, flags
& MSG_WAITALL
, size
);
2304 timeo
= sock_rcvtimeo(sk
, noblock
);
2306 memset(&scm
, 0, sizeof(scm
));
2308 /* Lock the socket to prevent queue disordering
2309 * while sleeps in memcpy_tomsg
2311 mutex_lock(&u
->iolock
);
2313 skip
= max(sk_peek_offset(sk
, flags
), 0);
2318 struct sk_buff
*skb
, *last
;
2321 unix_state_lock(sk
);
2322 if (sock_flag(sk
, SOCK_DEAD
)) {
2326 last
= skb
= skb_peek(&sk
->sk_receive_queue
);
2327 last_len
= last
? last
->len
: 0;
2330 if (copied
>= target
)
2334 * POSIX 1003.1g mandates this order.
2337 err
= sock_error(sk
);
2340 if (sk
->sk_shutdown
& RCV_SHUTDOWN
)
2343 unix_state_unlock(sk
);
2349 mutex_unlock(&u
->iolock
);
2351 timeo
= unix_stream_data_wait(sk
, timeo
, last
,
2352 last_len
, freezable
);
2354 if (signal_pending(current
)) {
2355 err
= sock_intr_errno(timeo
);
2360 mutex_lock(&u
->iolock
);
2363 unix_state_unlock(sk
);
2367 while (skip
>= unix_skb_len(skb
)) {
2368 skip
-= unix_skb_len(skb
);
2370 last_len
= skb
->len
;
2371 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
2376 unix_state_unlock(sk
);
2379 /* Never glue messages from different writers */
2380 if (!unix_skb_scm_eq(skb
, &scm
))
2382 } else if (test_bit(SOCK_PASSCRED
, &sock
->flags
)) {
2383 /* Copy credentials */
2384 scm_set_cred(&scm
, UNIXCB(skb
).pid
, UNIXCB(skb
).uid
, UNIXCB(skb
).gid
);
2385 unix_set_secdata(&scm
, skb
);
2389 /* Copy address just once */
2390 if (state
->msg
&& state
->msg
->msg_name
) {
2391 DECLARE_SOCKADDR(struct sockaddr_un
*, sunaddr
,
2392 state
->msg
->msg_name
);
2393 unix_copy_addr(state
->msg
, skb
->sk
);
2397 chunk
= min_t(unsigned int, unix_skb_len(skb
) - skip
, size
);
2399 chunk
= state
->recv_actor(skb
, skip
, chunk
, state
);
2400 drop_skb
= !unix_skb_len(skb
);
2401 /* skb is only safe to use if !drop_skb */
2412 /* the skb was touched by a concurrent reader;
2413 * we should not expect anything from this skb
2414 * anymore and assume it invalid - we can be
2415 * sure it was dropped from the socket queue
2417 * let's report a short read
2423 /* Mark read part of skb as used */
2424 if (!(flags
& MSG_PEEK
)) {
2425 UNIXCB(skb
).consumed
+= chunk
;
2427 sk_peek_offset_bwd(sk
, chunk
);
2430 unix_detach_fds(&scm
, skb
);
2432 if (unix_skb_len(skb
))
2435 skb_unlink(skb
, &sk
->sk_receive_queue
);
2441 /* It is questionable, see note in unix_dgram_recvmsg.
2444 scm
.fp
= scm_fp_dup(UNIXCB(skb
).fp
);
2446 sk_peek_offset_fwd(sk
, chunk
);
2453 last_len
= skb
->len
;
2454 unix_state_lock(sk
);
2455 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
2458 unix_state_unlock(sk
);
2463 mutex_unlock(&u
->iolock
);
2465 scm_recv(sock
, state
->msg
, &scm
, flags
);
2469 return copied
? : err
;
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
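/*
 * Editorial worked example of the mode mapping noted in unix_shutdown()
 * above (one way to realize it is a simple increment):
 *
 *	SHUT_RD   = 0  ->  0 + 1 = 1 = RCV_SHUTDOWN
 *	SHUT_WR   = 1  ->  1 + 1 = 2 = SEND_SHUTDOWN
 *	SHUT_RDWR = 2  ->  2 + 1 = 3 = RCV_SHUTDOWN | SEND_SHUTDOWN = SHUTDOWN_MASK
 *
 * and the peer of a stream/seqpacket socket gets the mirrored bits:
 * our RCV_SHUTDOWN becomes its SEND_SHUTDOWN and vice versa.
 */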
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
2675 static __poll_t
unix_poll(struct file
*file
, struct socket
*sock
, poll_table
*wait
)
2677 struct sock
*sk
= sock
->sk
;
2680 sock_poll_wait(file
, sock
, wait
);
2683 /* exceptional events? */
2686 if (sk
->sk_shutdown
== SHUTDOWN_MASK
)
2688 if (sk
->sk_shutdown
& RCV_SHUTDOWN
)
2689 mask
|= EPOLLRDHUP
| EPOLLIN
| EPOLLRDNORM
;
2692 if (!skb_queue_empty_lockless(&sk
->sk_receive_queue
))
2693 mask
|= EPOLLIN
| EPOLLRDNORM
;
2695 /* Connection-based need to check for termination and startup */
2696 if ((sk
->sk_type
== SOCK_STREAM
|| sk
->sk_type
== SOCK_SEQPACKET
) &&
2697 sk
->sk_state
== TCP_CLOSE
)
2701 * we set writable also when the other side has shut down the
2702 * connection. This prevents stuck sockets.
2704 if (unix_writable(sk
))
2705 mask
|= EPOLLOUT
| EPOLLWRNORM
| EPOLLWRBAND
;
2710 static __poll_t
unix_dgram_poll(struct file
*file
, struct socket
*sock
,
2713 struct sock
*sk
= sock
->sk
, *other
;
2714 unsigned int writable
;
2717 sock_poll_wait(file
, sock
, wait
);
2720 /* exceptional events? */
2721 if (sk
->sk_err
|| !skb_queue_empty_lockless(&sk
->sk_error_queue
))
2723 (sock_flag(sk
, SOCK_SELECT_ERR_QUEUE
) ? EPOLLPRI
: 0);
2725 if (sk
->sk_shutdown
& RCV_SHUTDOWN
)
2726 mask
|= EPOLLRDHUP
| EPOLLIN
| EPOLLRDNORM
;
2727 if (sk
->sk_shutdown
== SHUTDOWN_MASK
)
2731 if (!skb_queue_empty_lockless(&sk
->sk_receive_queue
))
2732 mask
|= EPOLLIN
| EPOLLRDNORM
;
2734 /* Connection-based need to check for termination and startup */
2735 if (sk
->sk_type
== SOCK_SEQPACKET
) {
2736 if (sk
->sk_state
== TCP_CLOSE
)
2738 /* connection hasn't started yet? */
2739 if (sk
->sk_state
== TCP_SYN_SENT
)
2743 /* No write status requested, avoid expensive OUT tests. */
2744 if (!(poll_requested_events(wait
) & (EPOLLWRBAND
|EPOLLWRNORM
|EPOLLOUT
)))
2747 writable
= unix_writable(sk
);
2749 unix_state_lock(sk
);
2751 other
= unix_peer(sk
);
2752 if (other
&& unix_peer(other
) != sk
&&
2753 unix_recvq_full(other
) &&
2754 unix_dgram_peer_wake_me(sk
, other
))
2757 unix_state_unlock(sk
);
2761 mask
|= EPOLLOUT
| EPOLLWRNORM
| EPOLLWRBAND
;
2763 sk_set_bit(SOCKWQ_ASYNC_NOSPACE
, sk
);
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
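/*
 * Editorial worked example of the iterator position encoding above,
 * assuming 64-bit longs and UNIX_HASH_BITS == 8: BUCKET_SPACE is
 * 64 - 9 - 1 = 54, so
 *
 *	pos = set_bucket_offset(3, 7) = (3 << 54) | 7
 *	get_bucket(pos) = 3	(which of the 2 * UNIX_HASH_SIZE chains)
 *	get_offset(pos) = 7	(1-based entry within that chain)
 */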
2776 static struct sock
*unix_from_bucket(struct seq_file
*seq
, loff_t
*pos
)
2778 unsigned long offset
= get_offset(*pos
);
2779 unsigned long bucket
= get_bucket(*pos
);
2781 unsigned long count
= 0;
2783 for (sk
= sk_head(&unix_socket_table
[bucket
]); sk
; sk
= sk_next(sk
)) {
2784 if (sock_net(sk
) != seq_file_net(seq
))
2786 if (++count
== offset
)
2793 static struct sock
*unix_next_socket(struct seq_file
*seq
,
2797 unsigned long bucket
;
2799 while (sk
> (struct sock
*)SEQ_START_TOKEN
) {
2803 if (sock_net(sk
) == seq_file_net(seq
))
2808 sk
= unix_from_bucket(seq
, pos
);
2813 bucket
= get_bucket(*pos
) + 1;
2814 *pos
= set_bucket_offset(bucket
, 1);
2815 } while (bucket
< ARRAY_SIZE(unix_socket_table
));
2820 static void *unix_seq_start(struct seq_file
*seq
, loff_t
*pos
)
2821 __acquires(unix_table_lock
)
2823 spin_lock(&unix_table_lock
);
2826 return SEQ_START_TOKEN
;
2828 if (get_bucket(*pos
) >= ARRAY_SIZE(unix_socket_table
))
2831 return unix_next_socket(seq
, NULL
, pos
);
2834 static void *unix_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
2837 return unix_next_socket(seq
, v
, pos
);
2840 static void unix_seq_stop(struct seq_file
*seq
, void *v
)
2841 __releases(unix_table_lock
)
2843 spin_unlock(&unix_table_lock
);
2846 static int unix_seq_show(struct seq_file
*seq
, void *v
)
2849 if (v
== SEQ_START_TOKEN
)
2850 seq_puts(seq
, "Num RefCount Protocol Flags Type St "
2854 struct unix_sock
*u
= unix_sk(s
);
2857 seq_printf(seq
, "%pK: %08X %08X %08X %04X %02X %5lu",
2859 refcount_read(&s
->sk_refcnt
),
2861 s
->sk_state
== TCP_LISTEN
? __SO_ACCEPTCON
: 0,
2864 (s
->sk_state
== TCP_ESTABLISHED
? SS_CONNECTED
: SS_UNCONNECTED
) :
2865 (s
->sk_state
== TCP_ESTABLISHED
? SS_CONNECTING
: SS_DISCONNECTING
),
2868 if (u
->addr
) { // under unix_table_lock here
2873 len
= u
->addr
->len
- sizeof(short);
2874 if (!UNIX_ABSTRACT(s
))
2880 for ( ; i
< len
; i
++)
2881 seq_putc(seq
, u
->addr
->name
->sun_path
[i
] ?:
2884 unix_state_unlock(s
);
2885 seq_putc(seq
, '\n');
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */

fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);