/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					BSD.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect
 *					algorithm.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible, we block in connect(2)
 *					if the max backlog of the listening
 *					socket has been reached. This won't
 *					break old apps and it avoids huge
 *					amounts of hashed sockets (this for
 *					unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are
 *					introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT;
 *					the core infrastructure does that
 *					for all net proto families now
 *					(2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	ECONNREFUSED is not returned from one end of a connected() socket to
 *		the other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water
 *		mark and a fake inode identifier (nor the BSD first socket
 *		fstat twice bug).
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns
 *		16 and a null first byte in the path (but not for
 *		gethost/peername - BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
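/* Illustrative userspace sketch (not part of this file, name is made up):
 * binding in the abstract namespace vs. on the filesystem. An abstract
 * name starts with a NUL byte and its length is conveyed by addrlen, not
 * by NUL termination.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract(int fd)
 *	{
 *		struct sockaddr_un sun;
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_UNIX;
 *		memcpy(sun.sun_path, "\0example", 8); // leading 0 => abstract
 *		return bind(fd, (struct sockaddr *)&sun,
 *			    offsetof(struct sockaddr_un, sun_path) + 8);
 *	}
 */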
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/file.h>
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

/* Unbound sockets live in the second half of the table, hashed by the
 * address of the struct sock itself.
 */
static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}
#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- it must not be zero length.
 *		- if it does not start with a NUL byte, it must be NUL
 *		  terminated (FS object)
 *		- if it starts with a NUL byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			return s;
	}
	return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
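/* Userspace view of the mechanism described above (illustrative sketch,
 * not part of this file; buffer names are made up): a connected datagram
 * client whose peer's queue is full can wait in poll() for POLLOUT; the
 * relay below is what eventually wakes it when the peer reads.
 *
 *	struct pollfd pfd = { .fd = client_fd, .events = POLLOUT };
 *
 *	for (;;) {
 *		if (send(client_fd, buf, len, MSG_DONTWAIT) >= 0)
 *			break;
 *		if (errno != EAGAIN && errno != EWOULDBLOCK)
 *			break;			// real error
 *		poll(&pfd, 1, -1);		// sleeps until peer drains
 *	}
 */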
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its queue is full, we will hang waiting for
	 * POLLOUT.
	 */
	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
/* A socket counts as writable while its in-flight write memory stays
 * below a quarter of sk_sndbuf.
 */
static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is
		 * disconnected, we signal error. Messages are lost.
		 * Do not do this when the peer wasn't connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->iolock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->iolock);

	return 0;
}
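/* Illustrative use of the peek offset set above (userspace sketch, not
 * part of this file): with SO_PEEK_OFF >= 0, each MSG_PEEK read resumes
 * where the previous peek stopped instead of rereading from the start.
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 */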
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);

	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum + 1) & 0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path = { };

	err = -EINVAL;
	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
	    sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (sun_path[0]) {
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			goto out;
		}
	}

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_put;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	refcount_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	smp_store_release(&u->addr, addr);
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->bindlock);
out_put:
	if (err)
		path_put(&path);
out:
	return err;
}
/* Both state locks are taken in a fixed (address) order to avoid an
 * ABBA deadlock between two concurrent double-locks.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. It is dangerous because a deadlock is
	   possible. The connect-to-self case and a simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
	   check this before the attempt to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under unix_table_lock.  Insertion
	 * into the hash chain we'd found it in had been done
	 * in an earlier critical area protected by unix_table_lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same guarantees
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
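/* Userspace counterpart (illustrative sketch, not part of this file):
 * socketpair(2) yields a connected pair without bind/connect, and works
 * for SOCK_STREAM, SOCK_DGRAM and SOCK_SEQPACKET in AF_UNIX.
 *
 *	int sv[2];
 *	char buf[8];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		write(sv[0], "ping", 4);
 *		read(sv[1], buf, sizeof(buf));	// receives "ping"
 *	}
 */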
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_address *addr;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	addr = smp_load_acquire(&unix_sk(sk)->addr);
	if (!addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		err = sizeof(short);
	} else {
		err = addr->len;
		memcpy(sunaddr, addr->name, addr->len);
	}
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}
/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	return 0;
}
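/* Userspace side of fd passing (illustrative sketch, not part of this
 * file; sock_fd and fd_to_pass are made up): an SCM_RIGHTS control
 * message carries the descriptors that unix_attach_fds() accounts as
 * "in flight" above.
 *
 *	char dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	union { struct cmsghdr align; char buf[CMSG_SPACE(sizeof(int))]; } u;
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *	};
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */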
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}
static bool unix_passcred_enabled(const struct socket *sock,
				  const struct sock *other)
{
	return test_bit(SOCK_PASSCRED, &sock->flags) ||
	       !other->sk_socket ||
	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (unix_passcred_enabled(sock, other)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
static int maybe_init_creds(struct scm_cookie *scm,
			    struct socket *socket,
			    const struct sock *other)
{
	int err;
	struct msghdr msg = { .msg_controllen = 0 };

	err = scm_send(socket, &msg, scm, false);
	if (err)
		return err;

	if (unix_passcred_enabled(socket, other)) {
		scm->pid = get_pid(task_tgid(current));
		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
	}
	return err;
}

static bool unix_skb_scm_eq(struct sk_buff *skb,
			    struct scm_cookie *scm)
{
	const struct unix_skb_parms *u = &UNIXCB(skb);

	return u->pid == scm->pid &&
	       uid_eq(u->uid, scm->creds.uid) &&
	       gid_eq(u->gid, scm->creds.gid) &&
	       unix_secdata_eq(scm, skb);
}
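/* Receiving side of the credentials logic above (illustrative userspace
 * sketch, not part of this file): once SO_PASSCRED is set, recvmsg()
 * carries an SCM_CREDENTIALS control message with the sender's
 * pid/uid/gid.
 *
 *	int one = 1;
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(cmsg);
 *			// uc->pid, uc->uid, uc->gid describe the sender
 *		}
 */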
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int data_len = 0;
	int sk_locked;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is fast path, we don't necessarily need to
		 * call to kfree_skb even though with newskb == NULL
		 * this - does no harm
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (addr) {
		msg->msg_namelen = addr->len;
		memcpy(msg->msg_name, addr->name, addr->len);
	}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int peeked, skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
					      &err, &last);
		if (skb)
			break;

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}

static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}

static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}

static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
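/* Effect as seen from userspace (illustrative sketch, not part of this
 * file): after one end of a connected stream pair does
 * shutdown(fd, SHUT_WR), the peer's read() returns 0 (EOF) once the
 * queue is drained, and further writes on the shut-down end raise
 * EPIPE/SIGPIPE.
 *
 *	shutdown(sv[0], SHUT_WR);
 *	n = read(sv[1], buf, sizeof(buf));	// n == 0 => EOF
 */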
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
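/* Userspace use of the two queue-length ioctls handled above
 * (illustrative sketch, not part of this file):
 *
 *	int inq, outq;
 *
 *	ioctl(fd, SIOCINQ, &inq);	// unread bytes in receive queue
 *	ioctl(fd, SIOCOUTQ, &outq);	// bytes not yet consumed by peer
 */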
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under unix_table_lock here */
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
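/* A resulting /proc/net/unix line looks roughly like this (illustrative
 * values; columns are Num, RefCount, Protocol, Flags, Type, St, Inode,
 * Path, with abstract names rendered with a leading '@'):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 */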
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			sizeof(struct seq_net_private))) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);